23 files changed, 6001 insertions, 0 deletions
diff --git a/gfx/ycbcr/LICENSE b/gfx/ycbcr/LICENSE
new file mode 100644
index 0000000000..8dc35041de
--- /dev/null
+++ b/gfx/ycbcr/LICENSE
@@ -0,0 +1,27 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//    * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//    * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//    * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/gfx/ycbcr/README b/gfx/ycbcr/README
new file mode 100644
index 0000000000..8910a2a2b2
--- /dev/null
+++ b/gfx/ycbcr/README
@@ -0,0 +1,8 @@
+This color conversion code is from the Chromium open source project available here:
+
+http://code.google.com/chromium/
+
+The code comes from svn revision 63840 on 2010-10-26.
+
+It has been superseded upstream by libyuv (which was spun off from it). Bug 791941 is
+about trying to replace this code with libyuv.
diff --git a/gfx/ycbcr/YCbCrUtils.cpp b/gfx/ycbcr/YCbCrUtils.cpp
new file mode 100644
index 0000000000..92f8bc3f93
--- /dev/null
+++ b/gfx/ycbcr/YCbCrUtils.cpp
@@ -0,0 +1,357 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 4 -*-
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include "mozilla/EndianUtils.h"
+#include "gfx2DGlue.h"
+#include "mozilla/gfx/Swizzle.h"
+
+#include "YCbCrUtils.h"
+#include "yuv_convert.h"
+#include "ycbcr_to_rgb565.h"
+
+namespace mozilla {
+namespace gfx {
+
+// clang-format off
+
+void
+GetYCbCrToRGBDestFormatAndSize(const layers::PlanarYCbCrData& aData,
+                               SurfaceFormat& aSuggestedFormat,
+                               IntSize& aSuggestedSize)
+{ // In/out: aSuggestedFormat and aSuggestedSize are adjusted in place below.
+  YUVType yuvtype =
+    TypeFromSize(aData.mYSize.width,
+                 aData.mYSize.height,
+                 aData.mCbCrSize.width,
+                 aData.mCbCrSize.height);
+
+  // 'prescale' is true if the scaling is to be done as part of the
+  // YCbCr to RGB conversion rather than on the RGB data when rendered.
+  bool prescale = aSuggestedSize.width > 0 && aSuggestedSize.height > 0 &&
+                  aSuggestedSize != aData.mPicSize;
+
+  if (aSuggestedFormat == SurfaceFormat::R5G6B5_UINT16) {
+#if defined(HAVE_YCBCR_TO_RGB565)
+    if (prescale && // Give up prescaling if only the unscaled path is "fast".
+        !IsScaleYCbCrToRGB565Fast(aData.mPicX,
+                                  aData.mPicY,
+                                  aData.mPicSize.width,
+                                  aData.mPicSize.height,
+                                  aSuggestedSize.width,
+                                  aSuggestedSize.height,
+                                  yuvtype,
+                                  FILTER_BILINEAR) &&
+        IsConvertYCbCrToRGB565Fast(aData.mPicX,
+                                   aData.mPicY,
+                                   aData.mPicSize.width,
+                                   aData.mPicSize.height,
+                                   yuvtype)) {
+      prescale = false;
+    }
+#else
+    // yuv2rgb16 function not available, so fall back to 32-bit output.
+    aSuggestedFormat = SurfaceFormat::B8G8R8X8;
+#endif
+  }
+  else if (aSuggestedFormat != SurfaceFormat::B8G8R8X8) {
+    // No other formats are currently supported.
+    aSuggestedFormat = SurfaceFormat::B8G8R8X8;
+  }
+  if (aSuggestedFormat == SurfaceFormat::B8G8R8X8) {
+    /* ScaleYCbCrToRGB32 does not support a picture offset, nor 4:4:4 data.
+     See bugs 639415 and 640073. */
+    if (aData.mPicX != 0 || aData.mPicY != 0 || yuvtype == YV24)
+      prescale = false;
+  }
+  if (!prescale) { // Without prescaling the suggested size is the picture size.
+    aSuggestedSize = aData.mPicSize;
+  }
+}
+
+static inline void
+ConvertYCbCr16to8Line(uint8_t* aDst,
+                      int aStride,
+                      const uint16_t* aSrc,
+                      int aStride16,
+                      int aWidth,
+                      int aHeight,
+                      int aBitDepth)
+{
+  // Keep the low aBitDepth bits of each sample, then shift down to 8 bits.
+  const uint16_t sampleMask = (1 << aBitDepth) - 1;
+  const int downShift = aBitDepth - 8;
+  uint8_t* dstRow = aDst;
+  const uint16_t* srcRow = aSrc;
+  for (int row = 0; row < aHeight; ++row, dstRow += aStride, srcRow += aStride16) {
+    for (int col = 0; col < aWidth; ++col) {
+      dstRow[col] = static_cast<uint16_t>((srcRow[col] & sampleMask) >> downShift);
+    }
+  }
+}
+
+void
+ConvertYCbCrToRGBInternal(const layers::PlanarYCbCrData& aData,
+                          const SurfaceFormat& aDestFormat,
+                          const IntSize& aDestSize,
+                          unsigned char* aDestBuffer,
+                          int32_t aStride)
+{ // Narrows non-8-bit input to 8 bits if needed, then converts (and scales).
+  // ConvertYCbCrToRGB et al. assume the chroma planes are rounded up if the
+  // luma plane is odd sized. Monochrome images have 0-sized CbCr planes
+  MOZ_ASSERT(aData.mCbCrSize.width == aData.mYSize.width ||
+             aData.mCbCrSize.width == (aData.mYSize.width + 1) >> 1 ||
+             aData.mCbCrSize.width == 0);
+  MOZ_ASSERT(aData.mCbCrSize.height == aData.mYSize.height ||
+             aData.mCbCrSize.height == (aData.mYSize.height + 1) >> 1 ||
+             aData.mCbCrSize.height == 0);
+
+  // Used if converting to 8 bits YUV. srcData aliases aData for 8-bit input
+  // and otherwise aliases dstData, which is filled in by the branch below.
+  UniquePtr<uint8_t[]> yChannel;
+  UniquePtr<uint8_t[]> cbChannel;
+  UniquePtr<uint8_t[]> crChannel;
+  layers::PlanarYCbCrData dstData;
+  const layers::PlanarYCbCrData& srcData =
+    aData.mColorDepth == ColorDepth::COLOR_8 ? aData : dstData;
+
+  if (aData.mColorDepth != ColorDepth::COLOR_8) {
+    // Convert to 8 bits data first.
+    dstData.mPicSize = aData.mPicSize;
+    dstData.mPicX = aData.mPicX;
+    dstData.mPicY = aData.mPicY;
+    dstData.mYSize = aData.mYSize;
+    // We align the destination stride to 32 bytes, so that libyuv can use
+    // SSE optimised code.
+    dstData.mYStride = (aData.mYSize.width + 31) & ~31;
+    dstData.mCbCrSize = aData.mCbCrSize;
+    dstData.mCbCrStride = (aData.mCbCrSize.width + 31) & ~31;
+    dstData.mYUVColorSpace = aData.mYUVColorSpace;
+    dstData.mColorDepth = ColorDepth::COLOR_8;
+
+    size_t ySize = GetAlignedStride<1>(dstData.mYStride, aData.mYSize.height);
+    size_t cbcrSize =
+      GetAlignedStride<1>(dstData.mCbCrStride, aData.mCbCrSize.height);
+    if (ySize == 0) { // No luma buffer to allocate: leave aDestBuffer untouched.
+      MOZ_DIAGNOSTIC_ASSERT(cbcrSize == 0, "CbCr without Y makes no sense");
+      return;
+    }
+    yChannel = MakeUnique<uint8_t[]>(ySize);
+
+    dstData.mYChannel = yChannel.get();
+
+    int bitDepth = BitDepthForColorDepth(aData.mColorDepth);
+
+    ConvertYCbCr16to8Line(dstData.mYChannel,
+                          dstData.mYStride,
+                          reinterpret_cast<uint16_t*>(aData.mYChannel),
+                          aData.mYStride / 2, // uint16_t elements; presumably mYStride is in bytes
+                          aData.mYSize.width,
+                          aData.mYSize.height,
+                          bitDepth);
+
+    if (cbcrSize) { // Skipped when the chroma planes are empty.
+      cbChannel = MakeUnique<uint8_t[]>(cbcrSize);
+      crChannel = MakeUnique<uint8_t[]>(cbcrSize);
+
+      dstData.mCbChannel = cbChannel.get();
+      dstData.mCrChannel = crChannel.get();
+
+      ConvertYCbCr16to8Line(dstData.mCbChannel,
+                            dstData.mCbCrStride,
+                            reinterpret_cast<uint16_t*>(aData.mCbChannel),
+                            aData.mCbCrStride / 2,
+                            aData.mCbCrSize.width,
+                            aData.mCbCrSize.height,
+                            bitDepth);
+
+      ConvertYCbCr16to8Line(dstData.mCrChannel,
+                            dstData.mCbCrStride,
+                            reinterpret_cast<uint16_t*>(aData.mCrChannel),
+                            aData.mCbCrStride / 2,
+                            aData.mCbCrSize.width,
+                            aData.mCbCrSize.height,
+                            bitDepth);
+    }
+  }
+
+  YUVType yuvtype =
+    TypeFromSize(srcData.mYSize.width,
+                 srcData.mYSize.height,
+                 srcData.mCbCrSize.width,
+                 srcData.mCbCrSize.height);
+
+  // Convert from YCbCr to RGB now, scaling the image if needed.
+  if (aDestSize != srcData.mPicSize) {
+#if defined(HAVE_YCBCR_TO_RGB565)
+    if (aDestFormat == SurfaceFormat::R5G6B5_UINT16) {
+      ScaleYCbCrToRGB565(srcData.mYChannel,
+                         srcData.mCbChannel,
+                         srcData.mCrChannel,
+                         aDestBuffer,
+                         srcData.mPicX,
+                         srcData.mPicY,
+                         srcData.mPicSize.width,
+                         srcData.mPicSize.height,
+                         aDestSize.width,
+                         aDestSize.height,
+                         srcData.mYStride,
+                         srcData.mCbCrStride,
+                         aStride,
+                         yuvtype,
+                         FILTER_BILINEAR);
+    } else
+#endif
+      ScaleYCbCrToRGB32(srcData.mYChannel, // note: mPicX/mPicY are not passed
+                        srcData.mCbChannel,
+                        srcData.mCrChannel,
+                        aDestBuffer,
+                        srcData.mPicSize.width,
+                        srcData.mPicSize.height,
+                        aDestSize.width,
+                        aDestSize.height,
+                        srcData.mYStride,
+                        srcData.mCbCrStride,
+                        aStride,
+                        yuvtype,
+                        srcData.mYUVColorSpace,
+                        FILTER_BILINEAR);
+  } else { // no prescale
+#if defined(HAVE_YCBCR_TO_RGB565)
+    if (aDestFormat == SurfaceFormat::R5G6B5_UINT16) {
+      ConvertYCbCrToRGB565(srcData.mYChannel,
+                           srcData.mCbChannel,
+                           srcData.mCrChannel,
+                           aDestBuffer,
+                           srcData.mPicX,
+                           srcData.mPicY,
+                           srcData.mPicSize.width,
+                           srcData.mPicSize.height,
+                           srcData.mYStride,
+                           srcData.mCbCrStride,
+                           aStride,
+                           yuvtype);
+    } else // aDestFormat != SurfaceFormat::R5G6B5_UINT16
+#endif
+      ConvertYCbCrToRGB32(srcData.mYChannel, //
+                          srcData.mCbChannel,
+                          srcData.mCrChannel,
+                          aDestBuffer,
+                          srcData.mPicX,
+                          srcData.mPicY,
+                          srcData.mPicSize.width,
+                          srcData.mPicSize.height,
+                          srcData.mYStride,
+                          srcData.mCbCrStride,
+                          aStride,
+                          yuvtype,
+                          srcData.mYUVColorSpace);
+  }
+}
+
+void ConvertYCbCrToRGB(const layers::PlanarYCbCrData& aData,
+                       const SurfaceFormat& aDestFormat,
+                       const IntSize& aDestSize, unsigned char* aDestBuffer,
+                       int32_t aStride) {
+  // Converts aData into aDestBuffer, scaling when aDestSize != aData.mPicSize.
+  ConvertYCbCrToRGBInternal(aData, aDestFormat, aDestSize, aDestBuffer, aStride);
+#if MOZ_BIG_ENDIAN()
+  // libyuv makes endian-correct result, which needs to be swapped to BGRX.
+  // The buffer was written at aDestSize, so that is the region to swizzle.
+  if (aDestFormat != SurfaceFormat::R5G6B5_UINT16)
+    gfx::SwizzleData(aDestBuffer, aStride, gfx::SurfaceFormat::X8R8G8B8,
+                     aDestBuffer, aStride, gfx::SurfaceFormat::B8G8R8X8, aDestSize);
+#endif
+}
+
+void FillAlphaToRGBA(const uint8_t* aAlpha, const int32_t aAlphaStride,
+                     uint8_t* aBuffer, const int32_t aWidth,
+                     const int32_t aHeight, const gfx::SurfaceFormat& aFormat) {
+  MOZ_ASSERT(aAlphaStride >= aWidth);
+  // B8G8R8A8 is required for the SurfaceFormatBit::OS_A offset used below.
+  MOZ_ASSERT(aFormat == SurfaceFormat::B8G8R8A8);
+  const int pixelBytes = BytesPerPixel(aFormat);
+  const size_t alphaByte = static_cast<size_t>(SurfaceFormatBit::OS_A) / 8;
+  // Output rows advance by aWidth * bytes-per-pixel (no separate stride).
+  const size_t outRowBytes = aWidth * pixelBytes;
+  const uint8_t* alphaRow = aAlpha;
+  uint8_t* outRow = aBuffer;
+  for (int32_t y = 0; y < aHeight; ++y) {
+    for (int32_t x = 0; x < aWidth; ++x) {
+      outRow[alphaByte + static_cast<size_t>(x) * pixelBytes] = alphaRow[x];
+    }
+    alphaRow += aAlphaStride;
+    outRow += outRowBytes;
+  }
+}
+
+void ConvertYCbCrAToARGB(const layers::PlanarYCbCrAData& aData,
+                         const SurfaceFormat& aDestFormat,
+                         const IntSize& aDestSize, unsigned char* aDestBuffer,
+                         int32_t aStride) {
+  // libyuv makes endian-correct result, so the format needs to be B8G8R8A8.
+  MOZ_ASSERT(aDestFormat == SurfaceFormat::B8G8R8A8);
+  // The alpha plane is read below using the Y plane's stride.
+  MOZ_ASSERT(aData.mAlphaSize == aData.mYSize);
+
+  YUVType yuvtype = TypeFromSize(aData.mYSize.width, aData.mYSize.height,
+                                 aData.mCbCrSize.width, aData.mCbCrSize.height);
+  // NOTE(review): the YV12 path passes neither aDestSize, mPicX/mPicY nor mColorDepth.
+  if (yuvtype == YV12) {
+    // Currently this function only has support for I420 type
+    ConvertI420AlphaToARGB(aData.mYChannel, aData.mCbChannel, aData.mCrChannel,
+                           aData.mAlphaChannel, aData.mYStride,
+                           aData.mCbCrStride, aDestBuffer, aStride,
+                           aData.mPicSize.width, aData.mPicSize.height);
+    return;
+  }
+
+  // This function converts non-8-bpc images to 8-bpc. (Bug 1682322)
+  ConvertYCbCrToRGBInternal(aData, aDestFormat, aDestSize, aDestBuffer,
+                            aStride);
+  // NOTE(review): the steps below use mPicSize (not aDestSize); the fill is not given aStride.
+  // Note alpha stride is equal to Y stride.
+  FillAlphaToRGBA(aData.mAlphaChannel, aData.mYStride, aDestBuffer,
+                  aData.mPicSize.width, aData.mPicSize.height, aDestFormat);
+
+  // Do preattenuate as what ConvertI420AlphaToARGB does
+  ARGBAttenuate(aDestBuffer, aStride, aDestBuffer, aStride,
+                aData.mPicSize.width, aData.mPicSize.height);
+
+#if MOZ_BIG_ENDIAN()
+  // libyuv makes endian-correct result, which needs to be swapped to BGRA
+  gfx::SwizzleData(aDestBuffer, aStride, gfx::SurfaceFormat::A8R8G8B8,
+                   aDestBuffer, aStride, gfx::SurfaceFormat::B8G8R8A8,
+                   aData.mPicSize);
+#endif
+}
+
+void
+ConvertI420AlphaToARGB(const uint8_t* aSrcY,
+                       const uint8_t* aSrcU,
+                       const uint8_t* aSrcV,
+                       const uint8_t* aSrcA,
+                       int aSrcStrideYA, int aSrcStrideUV,
+                       uint8_t* aDstARGB, int aDstStrideARGB,
+                       int aWidth, int aHeight) {
+  // Thin adapter over ConvertI420AlphaToARGB32. Mind the argument order of
+  // the callee: the destination pointer and the dimensions are passed
+  // before the three strides.
+  ConvertI420AlphaToARGB32(aSrcY, aSrcU, aSrcV, aSrcA,
+                           aDstARGB,
+                           aWidth, aHeight,
+                           aSrcStrideYA, aSrcStrideUV,
+                           aDstStrideARGB);
+
+#if MOZ_BIG_ENDIAN()
+  // libyuv makes endian-correct result, which needs to be swapped to BGRA;
+  // the swap is done in place over the aWidth x aHeight area just written.
+  const IntSize convertedSize(aWidth, aHeight);
+  gfx::SwizzleData(aDstARGB, aDstStrideARGB, gfx::SurfaceFormat::A8R8G8B8,
+                   aDstARGB, aDstStrideARGB, gfx::SurfaceFormat::B8G8R8A8,
+                   convertedSize);
+#endif
+}
+
+} // namespace gfx
+} // namespace mozilla
diff --git a/gfx/ycbcr/YCbCrUtils.h b/gfx/ycbcr/YCbCrUtils.h
new file mode 100644
index 0000000000..8a3534a914
--- /dev/null
+++ b/gfx/ycbcr/YCbCrUtils.h
@@ -0,0 +1,45 @@
+/* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 2 -*-
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#ifndef Y_CB_CR_UTILS_H_
+#define Y_CB_CR_UTILS_H_
+
+#include "mozilla/gfx/Types.h"
+#include "ImageContainer.h"
+
+namespace mozilla {
+namespace gfx {
+
+void
+GetYCbCrToRGBDestFormatAndSize(const layers::PlanarYCbCrData& aData,
+                               SurfaceFormat& aSuggestedFormat,
+                               IntSize& aSuggestedSize);
+
+void
+ConvertYCbCrToRGB(const layers::PlanarYCbCrData& aData,
+                  const SurfaceFormat& aDestFormat,
+                  const IntSize& aDestSize,
+                  unsigned char* aDestBuffer,
+                  int32_t aStride);
+
+void ConvertYCbCrAToARGB(const layers::PlanarYCbCrAData& aData,
+                         const SurfaceFormat& aDestFormat,
+                         const IntSize& aDestSize,
+                         unsigned char* aDestBuffer,
+                         int32_t aStride);
+
+void
+ConvertI420AlphaToARGB(const uint8_t* aSrcY,
+                       const uint8_t* aSrcU,
+                       const uint8_t* aSrcV,
+                       const uint8_t* aSrcA,
+                       int aSrcStrideYA, int aSrcStrideUV,
+                       uint8_t* aDstARGB, int aDstStrideARGB,
+                       int aWidth, int aHeight);
+
+} // namespace gfx
+} // namespace mozilla
+
+#endif /* Y_CB_CR_UTILS_H_ */
diff --git a/gfx/ycbcr/chromium_types.h b/gfx/ycbcr/chromium_types.h
new file mode 100644
index 0000000000..13f92975b5
--- /dev/null
+++ b/gfx/ycbcr/chromium_types.h
@@ -0,0 +1,50 @@
+/* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 4 -*-
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+#ifndef GFX_CHROMIUMTYPES_H
+#define GFX_CHROMIUMTYPES_H
+
+#include <stdint.h>
+
+#include "libyuv/basic_types.h"
+
+// From Chromium build_config.h:
+// Processor architecture detection.  For more info on what's defined, see:
+//   http://msdn.microsoft.com/en-us/library/b0084kay.aspx
+//   http://www.agner.org/optimize/calling_conventions.pdf
+//   or with gcc, run: "echo | gcc -E -dM -"
+#if defined(_M_X64) || defined(__x86_64__)
+#define ARCH_CPU_X86_FAMILY 1
+#define ARCH_CPU_X86_64 1
+#define ARCH_CPU_64_BITS 1
+#elif defined(_M_IX86) || defined(__i386__) || defined(__i386)
+#define ARCH_CPU_X86_FAMILY 1
+#define ARCH_CPU_X86_32 1
+#define ARCH_CPU_X86 1
+#define ARCH_CPU_32_BITS 1
+#elif defined(__ARMEL__)
+#define ARCH_CPU_ARM_FAMILY 1
+#define ARCH_CPU_ARMEL 1
+#define ARCH_CPU_32_BITS 1
+#elif defined(__ppc__) || defined(__powerpc) || defined(__PPC__)
+#define ARCH_CPU_PPC_FAMILY 1
+#define ARCH_CPU_PPC 1
+#define ARCH_CPU_32_BITS 1
+#elif defined(__sparcv9)  // Must precede __sparc, which 64-bit SPARC also defines.
+#define ARCH_CPU_SPARC_FAMILY 1
+#define ARCH_CPU_SPARC 1
+#define ARCH_CPU_64_BITS 1
+#elif defined(__sparc)
+#define ARCH_CPU_SPARC_FAMILY 1
+#define ARCH_CPU_SPARC 1
+#define ARCH_CPU_32_BITS 1
+#elif defined(__aarch64__) || defined(_M_ARM64)
+#define ARCH_CPU_AARCH64_FAMILY 1
+#define ARCH_CPU_AARCH64 1
+#define ARCH_CPU_64_BITS 1
+#else
+#warning Please add support for your architecture in chromium_types.h
+#endif
+
+#endif // GFX_CHROMIUMTYPES_H
diff --git a/gfx/ycbcr/moz.build b/gfx/ycbcr/moz.build
new file mode 100644
index 0000000000..c643fbaf40
--- /dev/null
+++ b/gfx/ycbcr/moz.build
@@ -0,0 +1,66 @@
+# -*- Mode: python; indent-tabs-mode: nil; tab-width: 40 -*-
+# vim: set filetype=python:
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+EXPORTS += [
+    'YCbCrUtils.h',
+]
+
+UNIFIED_SOURCES += [
+    'scale_yuv_argb.cpp',
+    'ycbcr_to_rgb565.cpp',
+    'YCbCrUtils.cpp',
+    'yuv_convert.cpp',
+    'yuv_row_c.cpp',
+    'yuv_row_table.cpp',
+]
+
+if CONFIG['INTEL_ARCHITECTURE']:
+    # These files use MMX and SSE2 intrinsics, so they need special compile flags
+    # on some compilers.
+    SOURCES += ['yuv_convert_sse2.cpp']
+    SOURCES['yuv_convert_sse2.cpp'].flags += CONFIG['SSE2_FLAGS']
+
+    # MSVC doesn't support MMX when targeting AMD64.
+    if CONFIG['CC_TYPE'] == 'clang-cl':
+        if CONFIG['CPU_ARCH'] == 'x86':
+            SOURCES += [
+                'yuv_convert_mmx.cpp',
+            ]
+    else:
+        SOURCES += ['yuv_convert_mmx.cpp']
+        SOURCES['yuv_convert_mmx.cpp'].flags += CONFIG['MMX_FLAGS']
+
+if CONFIG['CC_TYPE'] == 'clang-cl':
+    if CONFIG['CPU_ARCH'] == 'x86_64' or \
+       (CONFIG['CPU_ARCH'] == 'x86' and CONFIG['CC_TYPE'] == 'clang-cl'):
+        SOURCES += [
+            'yuv_row_win64.cpp',
+        ]
+    else:
+        SOURCES += [
+            'yuv_row_win.cpp',
+        ]
+elif CONFIG['OS_ARCH'] in ('Linux', 'SunOS', 'Darwin', 'DragonFly',
+                           'FreeBSD', 'NetBSD', 'OpenBSD'):
+    SOURCES += [
+        'yuv_row_posix.cpp',
+    ]
+else:
+    SOURCES += [
+        'yuv_row_other.cpp',
+    ]
+
+if CONFIG['CPU_ARCH'] == 'arm' and CONFIG['HAVE_ARM_NEON']:
+    SOURCES += [
+        'yuv_row_arm.s',
+    ]
+    SOURCES += [
+        'yuv_convert_arm.cpp',
+    ]
+
+LOCAL_INCLUDES += ['/media/libyuv/libyuv/include']
+
+FINAL_LIBRARY = 'xul'
diff --git a/gfx/ycbcr/scale_yuv_argb.cpp b/gfx/ycbcr/scale_yuv_argb.cpp
new file mode 100644
index 0000000000..74bbb5e606
--- /dev/null
+++ b/gfx/ycbcr/scale_yuv_argb.cpp
@@ -0,0 +1,1133 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *  Copyright 2016 Mozilla Foundation
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/scale.h"
+
+#include <assert.h>
+#include <string.h>
+
+#include "libyuv/cpu_id.h"
+#include "libyuv/row.h"
+#include "libyuv/scale_row.h"
+#include "libyuv/video_common.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// YUV to RGB conversion and scaling functions were implemented by referencing
+// scale_argb.cc
+//
+// libyuv already has ScaleYUVToARGBBilinearUp(), but its implementation is not
+// completed yet. Implementations of the functions are based on it.
+// At first, ScaleYUVToARGBBilinearUp() was implemented by modifying
+// libyuv's version. Then all the other functions were implemented similarly.
+//
+// The function relationships between yuv_convert.cpp and scale_argb.cc are
+// as follows:
+//  - ScaleYUVToARGBDown2()      <-- ScaleARGBDown2()
+//  - ScaleYUVToARGBDownEven()   <-- ScaleARGBDownEven()
+//  - ScaleYUVToARGBBilinearDown() <-- ScaleARGBBilinearDown()
+//  - ScaleYUVToARGBBilinearUp() <-- ScaleARGBBilinearUp() and ScaleYUVToARGBBilinearUp() in libyuv
+//  - ScaleYUVToARGBSimple()     <-- ScaleARGBSimple()
+//  - ScaleYUVToARGB()           <-- ScaleARGB() // Removed some function calls for simplicity.
+//  - YUVToARGBScale()           <-- ARGBScale()
+//
+// The calls to, and selection of, InterpolateRow() and ScaleARGBFilterCols()
+// were kept as close to the originals as possible.
+//
+// The following changes were made to each scaling function.
+//
+// -[1] Allocate YUV conversion buffer and use it as source buffer of scaling.
+//      Its usage is borrowed from the libyuv's ScaleYUVToARGBBilinearUp().
+// -[2] Conversion from YUV to RGB was abstracted as YUVBuferIter.
+//      It is for handling multiple yuv color formats.
+// -[3] Modified scaling functions as to handle YUV conversion buffer and
+//      use YUVBuferIter.
+// -[4] Color conversion function selections in YUVBuferIter were borrowed from
+//      I444ToARGBMatrix(), I422ToARGBMatrix() and I420ToARGBMatrix()
+
+static __inline int Abs(int v) {
+  return v < 0 ? -v : v;  // Absolute value of v.
+}
+
+typedef mozilla::gfx::YUVColorSpace YUVColorSpace;
+
+struct YUVBuferIter {  // Row cursor over planar YUV input plus its row converter.
+  int src_width;
+  int src_height;
+  int src_stride_y;
+  int src_stride_u;
+  int src_stride_v;
+  const uint8* src_y;  // Plane base pointers; filled in before YUVBuferIter_Init().
+  const uint8* src_u;
+  const uint8* src_v;
+  // The members below are written by YUVBuferIter_Init() and the callbacks.
+  uint32 src_fourcc;
+  const struct YuvConstants* yuvconstants;
+  int y_index;  // Row index that the src_row_* pointers currently address.
+  const uint8* src_row_y;
+  const uint8* src_row_u;
+  const uint8* src_row_v;
+  // Converts one row of Y/U/V samples into rgb_buf.
+  void (*YUVToARGBRow)(const uint8* y_buf,
+                       const uint8* u_buf,
+                       const uint8* v_buf,
+                       uint8* rgb_buf,
+                       const struct YuvConstants* yuvconstants,
+                       int width);
+  void (*MoveTo)(YUVBuferIter& iter, int y_index);  // Seek to a given row.
+  void (*MoveToNextRow)(YUVBuferIter& iter);  // Advance by one row.
+};
+
+void YUVBuferIter_InitI422(YUVBuferIter& iter) {  // Selects iter.YUVToARGBRow from the I422ToARGBRow_* family.
+  iter.YUVToARGBRow = I422ToARGBRow_C;  // Default; each matching block below overrides it.
+#if defined(HAS_I422TOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    iter.YUVToARGBRow = I422ToARGBRow_Any_SSSE3;
+    if (IS_ALIGNED(iter.src_width, 8)) {
+      iter.YUVToARGBRow = I422ToARGBRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_I422TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    iter.YUVToARGBRow = I422ToARGBRow_Any_AVX2;
+    if (IS_ALIGNED(iter.src_width, 16)) {
+      iter.YUVToARGBRow = I422ToARGBRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_I422TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    iter.YUVToARGBRow = I422ToARGBRow_Any_NEON;
+    if (IS_ALIGNED(iter.src_width, 8)) {
+      iter.YUVToARGBRow = I422ToARGBRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_I422TOARGBROW_DSPR2)
+  if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(iter.src_width, 4) &&
+      IS_ALIGNED(iter.src_y, 4) && IS_ALIGNED(iter.src_stride_y, 4) &&
+      IS_ALIGNED(iter.src_u, 2) && IS_ALIGNED(iter.src_stride_u, 2) &&
+      IS_ALIGNED(iter.src_v, 2) && IS_ALIGNED(iter.src_stride_v, 2)) {
+    // Always satisfy IS_ALIGNED(argb_cnv_row, 4) && IS_ALIGNED(argb_cnv_rowstride, 4)
+    iter.YUVToARGBRow = I422ToARGBRow_DSPR2;
+  }
+#endif
+}
+
+void YUVBuferIter_InitI444(YUVBuferIter& iter) {  // Selects iter.YUVToARGBRow from the I444ToARGBRow_* family.
+  iter.YUVToARGBRow = I444ToARGBRow_C;  // Default; each matching block below overrides it.
+#if defined(HAS_I444TOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    iter.YUVToARGBRow = I444ToARGBRow_Any_SSSE3;
+    if (IS_ALIGNED(iter.src_width, 8)) {  // Non-"Any" variant only for aligned widths.
+      iter.YUVToARGBRow = I444ToARGBRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_I444TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    iter.YUVToARGBRow = I444ToARGBRow_Any_AVX2;
+    if (IS_ALIGNED(iter.src_width, 16)) {
+      iter.YUVToARGBRow = I444ToARGBRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_I444TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    iter.YUVToARGBRow = I444ToARGBRow_Any_NEON;
+    if (IS_ALIGNED(iter.src_width, 8)) {
+      iter.YUVToARGBRow = I444ToARGBRow_NEON;
+    }
+  }
+#endif
+}
+
+
+static void YUVBuferIter_MoveToForI444(YUVBuferIter& iter, int y_index) {
+  iter.src_row_y = iter.src_y + y_index * iter.src_stride_y;  // All three planes
+  iter.src_row_u = iter.src_u + y_index * iter.src_stride_u;  // are addressed with
+  iter.src_row_v = iter.src_v + y_index * iter.src_stride_v;  // the same row index.
+  iter.y_index = y_index;
+}
+
+static void YUVBuferIter_MoveToNextRowForI444(YUVBuferIter& iter) {
+  ++iter.y_index;  // Every plane advances by one row per step.
+  iter.src_row_v += iter.src_stride_v;
+  iter.src_row_u += iter.src_stride_u;
+  iter.src_row_y += iter.src_stride_y;
+}
+
+static void YUVBuferIter_MoveToForI422(YUVBuferIter& iter, int y_index) {
+  iter.src_row_y = iter.src_y + y_index * iter.src_stride_y;  // Chroma rows use the
+  iter.src_row_u = iter.src_u + y_index * iter.src_stride_u;  // same row index as
+  iter.src_row_v = iter.src_v + y_index * iter.src_stride_v;  // the luma row.
+  iter.y_index = y_index;
+}
+
+static void YUVBuferIter_MoveToNextRowForI422(YUVBuferIter& iter) {
+  ++iter.y_index;  // Luma and both chroma planes advance together.
+  iter.src_row_v += iter.src_stride_v;
+  iter.src_row_u += iter.src_stride_u;
+  iter.src_row_y += iter.src_stride_y;
+}
+
+static void YUVBuferIter_MoveToForI420(YUVBuferIter& iter, int y_index) {
+  // The chroma row index is the luma row index shifted right by one.
+  const int chroma_row = y_index >> 1;
+
+  iter.src_row_y = iter.src_y + y_index * iter.src_stride_y;
+  iter.src_row_u = iter.src_u + chroma_row * iter.src_stride_u;
+  iter.src_row_v = iter.src_v + chroma_row * iter.src_stride_v;
+  iter.y_index = y_index;
+}
+
+static void YUVBuferIter_MoveToNextRowForI420(YUVBuferIter& iter) {
+  const bool leaving_odd_row = (iter.y_index & 1) != 0;
+  iter.y_index++;
+  iter.src_row_y += iter.src_stride_y;
+  if (!leaving_odd_row) return;  // Chroma pointers move only after an odd row.
+  iter.src_row_u += iter.src_stride_u;
+  iter.src_row_v += iter.src_stride_v;
+}
+
+static __inline void YUVBuferIter_ConvertToARGBRow(YUVBuferIter& iter, uint8* argb_row) {  // Converts the current row.
+  iter.YUVToARGBRow(iter.src_row_y, iter.src_row_u, iter.src_row_v, argb_row, iter.yuvconstants, iter.src_width);
+}
+
+void YUVBuferIter_Init(YUVBuferIter& iter, uint32 src_fourcc, YUVColorSpace yuv_color_space) {
+  // Record the source layout and point the row cursor at row 0 of each plane.
+  iter.src_fourcc = src_fourcc;
+  iter.y_index = 0;
+  iter.src_row_y = iter.src_y;
+  iter.src_row_u = iter.src_u;
+  iter.src_row_v = iter.src_v;
+  // BT2020 and BT709 have dedicated constants; anything else gets kYuvI601Constants.
+  if (yuv_color_space == YUVColorSpace::BT2020) {
+    iter.yuvconstants = &kYuv2020Constants;
+  } else if (yuv_color_space == YUVColorSpace::BT709) {
+    iter.yuvconstants = &kYuvH709Constants;
+  } else {
+    iter.yuvconstants = &kYuvI601Constants;
+  }
+
+  // Install the row-stepping callbacks and the row converter for the layout.
+  if (src_fourcc == FOURCC_I444) {
+    iter.MoveTo = YUVBuferIter_MoveToForI444;
+    iter.MoveToNextRow = YUVBuferIter_MoveToNextRowForI444;
+    YUVBuferIter_InitI444(iter);
+  } else if (src_fourcc == FOURCC_I422) {
+    iter.MoveTo = YUVBuferIter_MoveToForI422;
+    iter.MoveToNextRow = YUVBuferIter_MoveToNextRowForI422;
+    YUVBuferIter_InitI422(iter);
+  } else {
+    assert(src_fourcc == FOURCC_I420); // Should be FOURCC_I420
+    iter.MoveTo = YUVBuferIter_MoveToForI420;
+    iter.MoveToNextRow = YUVBuferIter_MoveToNextRowForI420;
+    YUVBuferIter_InitI422(iter);  // I420 uses the I422 row converters.
+  }
+}
+
+// ScaleYUVToARGB, 1/2
+// Converts YUV source rows to ARGB (buffering at most two rows) and scales them
+// horizontally by 1/2; asserts dx == 2.0 and dy a multiple of 2.0 (16.16).
+static void ScaleYUVToARGBDown2(int src_width, int src_height,
+                                int dst_width, int dst_height,
+                                int src_stride_y,
+                                int src_stride_u,
+                                int src_stride_v,
+                                int dst_stride_argb,
+                                const uint8* src_y,
+                                const uint8* src_u,
+                                const uint8* src_v,
+                                uint8* dst_argb,
+                                int x, int dx, int y, int dy,
+                                enum FilterMode filtering,
+                                uint32 src_fourcc,
+                                YUVColorSpace yuv_color_space) {
+  int j;
+
+  // Allocate 2 rows of ARGB for source conversion.
+  const int kRowSize = (src_width * 4 + 15) & ~15;  // Row bytes rounded up to 16.
+  align_buffer_64(argb_cnv_row, kRowSize * 2);
+  uint8* argb_cnv_rowptr = argb_cnv_row;  // Row handed to the scaler as current.
+  int argb_cnv_rowstride = kRowSize;  // Byte offset from current row to next row.
+
+  YUVBuferIter iter;
+  iter.src_width = src_width;
+  iter.src_height = src_height;
+  iter.src_stride_y = src_stride_y;
+  iter.src_stride_u = src_stride_u;
+  iter.src_stride_v = src_stride_v;
+  iter.src_y = src_y;
+  iter.src_u = src_u;
+  iter.src_v = src_v;
+  YUVBuferIter_Init(iter, src_fourcc, yuv_color_space);
+
+  void (*ScaleARGBRowDown2)(const uint8* src_argb, ptrdiff_t src_stride,
+                            uint8* dst_argb, int dst_width) =
+    filtering == kFilterNone ? ScaleARGBRowDown2_C :
+        (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_C :
+        ScaleARGBRowDown2Box_C);
+  assert(dx == 65536 * 2);  // Test scale factor of 2.
+  assert((dy & 0x1ffff) == 0);  // Test vertical scale is multiple of 2.
+  // Position the iterator on the first source row (integer part of y).
+  int yi = y >> 16;
+  iter.MoveTo(iter, yi);
+  ptrdiff_t x_offset;  // Byte offset into the converted row (4 bytes per pixel).
+  if (filtering == kFilterBilinear) {
+    x_offset = (x >> 16) * 4;
+  } else {
+    x_offset = ((x >> 16) - 1) * 4;
+  }
+#if defined(HAS_SCALEARGBROWDOWN2_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_Any_SSE2 :
+        (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_SSE2 :
+        ScaleARGBRowDown2Box_Any_SSE2);
+    if (IS_ALIGNED(dst_width, 4)) {
+      ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_SSE2 :
+          (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_SSE2 :
+          ScaleARGBRowDown2Box_SSE2);
+    }
+  }
+
+#endif
+#if defined(HAS_SCALEARGBROWDOWN2_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_Any_NEON :
+        (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_NEON :
+        ScaleARGBRowDown2Box_Any_NEON);
+    if (IS_ALIGNED(dst_width, 8)) {
+      ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_NEON :
+          (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_NEON :
+          ScaleARGBRowDown2Box_NEON);
+    }
+  }
+#endif
+
+  const int dyi = dy >> 16;  // Integer source-row step per destination row.
+  int lastyi = yi;  // Source row currently held at argb_cnv_rowptr.
+  YUVBuferIter_ConvertToARGBRow(iter, argb_cnv_rowptr);
+  // Prepare next row if necessary
+  if (filtering != kFilterLinear) {
+    if ((yi + dyi) < (src_height - 1)) {
+      iter.MoveTo(iter, yi + dyi);
+      YUVBuferIter_ConvertToARGBRow(iter, argb_cnv_rowptr + argb_cnv_rowstride);
+    } else {
+      argb_cnv_rowstride = 0;  // No next row converted; pass a zero stride.
+    }
+  }
+
+  if (filtering == kFilterLinear) {
+    argb_cnv_rowstride = 0;  // Linear filtering is given a single row only.
+  }
+  const int max_yi = src_height - 1;  // Last valid source row index.
+  const int max_yi_minus_dyi = max_yi - dyi;
+  for (j = 0; j < dst_height; ++j) {
+    if (yi != lastyi) {
+      if (yi > max_yi) {
+        yi = max_yi;  // Clamp to the last source row.
+      }
+      if (yi != lastyi) {
+        if (filtering == kFilterLinear) {
+          iter.MoveTo(iter, yi);
+          YUVBuferIter_ConvertToARGBRow(iter, argb_cnv_rowptr);
+          lastyi = yi;
+        } else {
+          // Prepare current row
+          if (yi == iter.y_index) {
+            // The row converted as "next" is now current: flip to it.
+            argb_cnv_rowptr = argb_cnv_rowptr + argb_cnv_rowstride;
+            argb_cnv_rowstride = - argb_cnv_rowstride;
+          } else {
+            iter.MoveTo(iter, yi);
+            argb_cnv_rowptr = argb_cnv_row;
+            argb_cnv_rowstride = kRowSize;
+            YUVBuferIter_ConvertToARGBRow(iter, argb_cnv_rowptr);
+          }
+          // Prepare next row if necessary
+          if (iter.y_index  < max_yi) {
+            int next_yi = yi < max_yi_minus_dyi ? yi + dyi : max_yi;
+            iter.MoveTo(iter, next_yi);
+            YUVBuferIter_ConvertToARGBRow(iter, argb_cnv_rowptr + argb_cnv_rowstride);
+          } else {
+            argb_cnv_rowstride = 0;  // Already on the last row; no next row.
+          }
+          lastyi = yi;
+        }
+      }
+    }
+    ScaleARGBRowDown2(argb_cnv_rowptr + x_offset, argb_cnv_rowstride, dst_argb, dst_width);
+    dst_argb += dst_stride_argb;
+    yi += dyi;
+  }
+
+  free_aligned_buffer_64(argb_cnv_row);
+}
+
+// ScaleYUVToARGB, even step
+// Converts YUV source rows to ARGB (buffering at most two rows) and scales
+// them down with a column step of dx >> 16 and a row step of dy >> 16.
+static void ScaleYUVToARGBDownEven(int src_width, int src_height,
+                                   int dst_width, int dst_height,
+                                   int src_stride_y,
+                                   int src_stride_u,
+                                   int src_stride_v,
+                                   int dst_stride_argb,
+                                   const uint8* src_y,
+                                   const uint8* src_u,
+                                   const uint8* src_v,
+                                   uint8* dst_argb,
+                                   int x, int dx, int y, int dy,
+                                   enum FilterMode filtering,
+                                   uint32 src_fourcc,
+                                   YUVColorSpace yuv_color_space) {
+  // Allocate 2 rows of ARGB for source conversion.
+  const int kRowSize = (src_width * 4 + 15) & ~15;  // Row bytes rounded up to 16.
+  align_buffer_64(argb_cnv_row, kRowSize * 2);
+  uint8* argb_cnv_rowptr = argb_cnv_row;  // Row handed to the scaler as current.
+  int argb_cnv_rowstride = kRowSize;  // Byte offset from current row to next row.
+
+  int col_step = dx >> 16;  // Integer source-pixel step per destination pixel.
+  void (*ScaleARGBRowDownEven)(const uint8* src_argb, ptrdiff_t src_stride,
+                               int src_step, uint8* dst_argb, int dst_width) =
+      filtering ? ScaleARGBRowDownEvenBox_C : ScaleARGBRowDownEven_C;
+  assert(IS_ALIGNED(src_width, 2));
+  assert(IS_ALIGNED(src_height, 2));
+  int yi = y >> 16;  // Integer part of the first source row position.
+  const ptrdiff_t x_offset = (x >> 16) * 4;  // 4 bytes per ARGB pixel.
+
+#if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_SSE2 :
+        ScaleARGBRowDownEven_Any_SSE2;
+    if (IS_ALIGNED(dst_width, 4)) {
+      ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_SSE2 :
+          ScaleARGBRowDownEven_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_SCALEARGBROWDOWNEVEN_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_NEON :
+        ScaleARGBRowDownEven_Any_NEON;
+    if (IS_ALIGNED(dst_width, 4)) {
+      ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_NEON :
+          ScaleARGBRowDownEven_NEON;
+    }
+  }
+#endif
+
+  YUVBuferIter iter;
+  iter.src_width = src_width;
+  iter.src_height = src_height;
+  iter.src_stride_y = src_stride_y;
+  iter.src_stride_u = src_stride_u;
+  iter.src_stride_v = src_stride_v;
+  iter.src_y = src_y;
+  iter.src_u = src_u;
+  iter.src_v = src_v;
+  YUVBuferIter_Init(iter, src_fourcc, yuv_color_space);
+  iter.MoveTo(iter, yi);  // Position on the first sampled row before converting.
+
+  const int dyi = dy >> 16;  // Integer source-row step per destination row.
+  int lastyi = yi;  // Source row currently held at argb_cnv_rowptr.
+  YUVBuferIter_ConvertToARGBRow(iter, argb_cnv_rowptr);
+  // Prepare next row if necessary
+  if (filtering != kFilterLinear) {
+    if ((yi + dyi) < (src_height - 1)) {
+      iter.MoveTo(iter, yi + dyi);
+      YUVBuferIter_ConvertToARGBRow(iter, argb_cnv_rowptr + argb_cnv_rowstride);
+    } else {
+      argb_cnv_rowstride = 0;  // No next row converted; pass a zero stride.
+    }
+  }
+
+  if (filtering == kFilterLinear) {
+    argb_cnv_rowstride = 0;  // Linear filtering is given a single row only.
+  }
+  const int max_yi = src_height - 1;  // Last valid source row index.
+  const int max_yi_minus_dyi = max_yi - dyi;
+  for (int j = 0; j < dst_height; ++j) {
+    if (yi != lastyi) {
+      if (yi > max_yi) {
+        yi = max_yi;  // Clamp to the last source row.
+      }
+      if (yi != lastyi) {
+        if (filtering == kFilterLinear) {
+          iter.MoveTo(iter, yi);
+          YUVBuferIter_ConvertToARGBRow(iter, argb_cnv_rowptr);
+          lastyi = yi;
+        } else {
+          // Prepare current row
+          if (yi == iter.y_index) {
+            // The row converted as "next" is now current: flip to it.
+            argb_cnv_rowptr = argb_cnv_rowptr + argb_cnv_rowstride;
+            argb_cnv_rowstride = - argb_cnv_rowstride;
+          } else {
+            iter.MoveTo(iter, yi);
+            argb_cnv_rowptr = argb_cnv_row;
+            argb_cnv_rowstride = kRowSize;
+            YUVBuferIter_ConvertToARGBRow(iter, argb_cnv_rowptr);
+          }
+          // Prepare next row if necessary
+          if (iter.y_index  < max_yi) {
+            int next_yi = yi < max_yi_minus_dyi ? yi + dyi : max_yi;
+            iter.MoveTo(iter, next_yi);
+            YUVBuferIter_ConvertToARGBRow(iter, argb_cnv_rowptr + argb_cnv_rowstride);
+          } else {
+            argb_cnv_rowstride = 0;  // Already on the last row; no next row.
+          }
+          lastyi = yi;
+        }
+      }
+    }
+    ScaleARGBRowDownEven(argb_cnv_rowptr + x_offset, argb_cnv_rowstride, col_step, dst_argb, dst_width);
+    dst_argb += dst_stride_argb;
+    yi += dyi;
+  }
+  free_aligned_buffer_64(argb_cnv_row);
+}
+
+// Scale YUV to ARGB down with bilinear interpolation (x, dx, y, dy are 16.16).
+static void ScaleYUVToARGBBilinearDown(int src_width, int src_height,
+                                       int dst_width, int dst_height,
+                                       int src_stride_y,
+                                       int src_stride_u,
+                                       int src_stride_v,
+                                       int dst_stride_argb,
+                                       const uint8* src_y,
+                                       const uint8* src_u,
+                                       const uint8* src_v,
+                                       uint8* dst_argb,
+                                       int x, int dx, int y, int dy,
+                                       enum FilterMode filtering,
+                                       uint32 src_fourcc,
+                                       YUVColorSpace yuv_color_space) {
+  int j;
+  void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb,
+      ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
+      InterpolateRow_C;
+  void (*ScaleARGBFilterCols)(uint8* dst_argb, const uint8* src_argb,
+      int dst_width, int x, int dx) =
+      (src_width >= 32768) ? ScaleARGBFilterCols64_C : ScaleARGBFilterCols_C;
+  int64 xlast = x + (int64)(dst_width - 1) * dx;  // Position of the last column.
+  int64 xl = (dx >= 0) ? x : xlast;  // Leftmost source position used.
+  int64 xr = (dx >= 0) ? xlast : x;  // Rightmost source position used.
+  int clip_src_width;
+  xl = (xl >> 16) & ~3;  // Left edge aligned.
+  xr = (xr >> 16) + 1;  // Right most pixel used.  Bilinear uses 2 pixels.
+  xr = (xr + 1 + 3) & ~3;  // 1 beyond 4 pixel aligned right most pixel.
+  if (xr > src_width) {
+    xr = src_width;
+  }
+  clip_src_width = (int)(xr - xl) * 4;  // Width aligned to 4.
+  const ptrdiff_t xl_offset = xl * 4;  // Byte offset of the clipped left edge.
+  x -= (int)(xl << 16);  // Make x relative to the clipped left edge.
+
+  // Allocate 2 row of ARGB for source conversion.
+  const int kRowSize = (src_width * 4 + 15) & ~15;
+  align_buffer_64(argb_cnv_row, kRowSize * 2);
+  uint8* argb_cnv_rowptr = argb_cnv_row;
+  int argb_cnv_rowstride = kRowSize;
+
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    InterpolateRow = InterpolateRow_Any_SSSE3;
+    if (IS_ALIGNED(clip_src_width, 16)) {
+      InterpolateRow = InterpolateRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    InterpolateRow = InterpolateRow_Any_AVX2;
+    if (IS_ALIGNED(clip_src_width, 32)) {
+      InterpolateRow = InterpolateRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    InterpolateRow = InterpolateRow_Any_NEON;
+    if (IS_ALIGNED(clip_src_width, 16)) {
+      InterpolateRow = InterpolateRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_DSPR2)
+  if (TestCpuFlag(kCpuHasDSPR2) &&  // InterpolateRow reads the converted rows.
+      IS_ALIGNED(argb_cnv_rowptr, 4) && IS_ALIGNED(argb_cnv_rowstride, 4)) {
+    InterpolateRow = InterpolateRow_Any_DSPR2;
+    if (IS_ALIGNED(clip_src_width, 4)) {
+      InterpolateRow = InterpolateRow_DSPR2;
+    }
+  }
+#endif
+#if defined(HAS_SCALEARGBFILTERCOLS_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
+    ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3;
+  }
+#endif
+#if defined(HAS_SCALEARGBFILTERCOLS_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ScaleARGBFilterCols = ScaleARGBFilterCols_Any_NEON;
+    if (IS_ALIGNED(dst_width, 4)) {
+      ScaleARGBFilterCols = ScaleARGBFilterCols_NEON;
+    }
+  }
+#endif
+
+  int yi = y >> 16;  // Integer part of the first source row position.
+
+  YUVBuferIter iter;
+  iter.src_width = src_width;
+  iter.src_height = src_height;
+  iter.src_stride_y = src_stride_y;
+  iter.src_stride_u = src_stride_u;
+  iter.src_stride_v = src_stride_v;
+  iter.src_y = src_y;
+  iter.src_u = src_u;
+  iter.src_v = src_v;
+  YUVBuferIter_Init(iter, src_fourcc, yuv_color_space);
+  iter.MoveTo(iter, yi);
+
+  // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear.
+  // Allocate a row of ARGB.
+  align_buffer_64(row, clip_src_width * 4);
+
+  int lastyi = yi;  // Source row currently held at argb_cnv_rowptr.
+  YUVBuferIter_ConvertToARGBRow(iter, argb_cnv_rowptr);
+  // Prepare next row if necessary
+  if (filtering != kFilterLinear) {
+    if ((yi + 1) < src_height) {
+      iter.MoveToNextRow(iter);
+      YUVBuferIter_ConvertToARGBRow(iter, argb_cnv_rowptr + argb_cnv_rowstride);
+    } else {
+      argb_cnv_rowstride = 0;  // No next row converted; pass a zero stride.
+    }
+  }
+
+  const int max_y = (src_height - 1) << 16;  // Last source row in 16.16.
+  const int max_yi = src_height - 1;
+  for (j = 0; j < dst_height; ++j) {
+    yi = y >> 16;
+    if (yi != lastyi) {
+      if (y > max_y) {
+        y = max_y;  // Clamp to the last source row.
+        yi = y >> 16;
+      }
+      if (yi != lastyi) {
+        if (filtering == kFilterLinear) {
+          iter.MoveTo(iter, yi);
+          YUVBuferIter_ConvertToARGBRow(iter, argb_cnv_rowptr);
+          lastyi = yi;
+        } else {
+          // Prepare current row
+          if (yi == iter.y_index) {
+            // The row converted as "next" is now current: flip to it.
+            argb_cnv_rowptr = argb_cnv_rowptr + argb_cnv_rowstride;
+            argb_cnv_rowstride = - argb_cnv_rowstride;
+          } else {
+            iter.MoveTo(iter, yi);
+            argb_cnv_rowptr = argb_cnv_row;
+            argb_cnv_rowstride = kRowSize;
+            YUVBuferIter_ConvertToARGBRow(iter, argb_cnv_rowptr);
+          }
+          // Prepare next row if necessary
+          if (iter.y_index < max_yi) {
+            iter.MoveToNextRow(iter);
+            YUVBuferIter_ConvertToARGBRow(iter, argb_cnv_rowptr + argb_cnv_rowstride);
+          } else {
+            argb_cnv_rowstride = 0;  // Already on the last row; no next row.
+          }
+          lastyi = yi;
+        }
+      }
+    }
+    if (filtering == kFilterLinear) {
+      ScaleARGBFilterCols(dst_argb, argb_cnv_rowptr + xl_offset, dst_width, x, dx);
+    } else {
+      int yf = (y >> 8) & 255;  // Top 8 bits of the fractional part of y.
+      InterpolateRow(row, argb_cnv_rowptr + xl_offset, argb_cnv_rowstride, clip_src_width, yf);
+      ScaleARGBFilterCols(dst_argb, row, dst_width, x, dx);
+    }
+    dst_argb += dst_stride_argb;
+    y += dy;
+  }
+  free_aligned_buffer_64(row);
+  free_aligned_buffer_64(argb_cnv_row);
+}
+
+// Scale YUV to ARGB up with bilinear interpolation (x, dx, y, dy are 16.16).
+static void ScaleYUVToARGBBilinearUp(int src_width, int src_height,
+                                     int dst_width, int dst_height,
+                                     int src_stride_y,
+                                     int src_stride_u,
+                                     int src_stride_v,
+                                     int dst_stride_argb,
+                                     const uint8* src_y,
+                                     const uint8* src_u,
+                                     const uint8* src_v,
+                                     uint8* dst_argb,
+                                     int x, int dx, int y, int dy,
+                                     enum FilterMode filtering,
+                                     uint32 src_fourcc,
+                                     YUVColorSpace yuv_color_space) {
+  int j;
+  void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb,
+      ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
+      InterpolateRow_C;
+  void (*ScaleARGBFilterCols)(uint8* dst_argb, const uint8* src_argb,
+      int dst_width, int x, int dx) =
+      filtering ? ScaleARGBFilterCols_C : ScaleARGBCols_C;
+  const int max_y = (src_height - 1) << 16;  // Last source row in 16.16.
+
+  // Allocate 1 row of ARGB for source conversion.
+  align_buffer_64(argb_cnv_row, src_width * 4);
+
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    InterpolateRow = InterpolateRow_Any_SSSE3;
+    if (IS_ALIGNED(dst_width, 4)) {
+      InterpolateRow = InterpolateRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    InterpolateRow = InterpolateRow_Any_AVX2;
+    if (IS_ALIGNED(dst_width, 8)) {
+      InterpolateRow = InterpolateRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    InterpolateRow = InterpolateRow_Any_NEON;
+    if (IS_ALIGNED(dst_width, 4)) {
+      InterpolateRow = InterpolateRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_DSPR2)
+  if (TestCpuFlag(kCpuHasDSPR2) &&
+      IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
+    InterpolateRow = InterpolateRow_DSPR2;
+  }
+#endif
+  if (src_width >= 32768) {
+    ScaleARGBFilterCols = filtering ?
+        ScaleARGBFilterCols64_C : ScaleARGBCols64_C;
+  }
+#if defined(HAS_SCALEARGBFILTERCOLS_SSSE3)
+  if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
+    ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3;
+  }
+#endif
+#if defined(HAS_SCALEARGBFILTERCOLS_NEON)
+  if (filtering && TestCpuFlag(kCpuHasNEON)) {
+    ScaleARGBFilterCols = ScaleARGBFilterCols_Any_NEON;
+    if (IS_ALIGNED(dst_width, 4)) {
+      ScaleARGBFilterCols = ScaleARGBFilterCols_NEON;
+    }
+  }
+#endif
+#if defined(HAS_SCALEARGBCOLS_SSE2)
+  if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
+    ScaleARGBFilterCols = ScaleARGBCols_SSE2;
+  }
+#endif
+#if defined(HAS_SCALEARGBCOLS_NEON)
+  if (!filtering && TestCpuFlag(kCpuHasNEON)) {
+    ScaleARGBFilterCols = ScaleARGBCols_Any_NEON;
+    if (IS_ALIGNED(dst_width, 8)) {
+      ScaleARGBFilterCols = ScaleARGBCols_NEON;
+    }
+  }
+#endif
+  if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
+    ScaleARGBFilterCols = ScaleARGBColsUp2_C;
+#if defined(HAS_SCALEARGBCOLSUP2_SSE2)
+    if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
+      ScaleARGBFilterCols = ScaleARGBColsUp2_SSE2;
+    }
+#endif
+  }
+
+  if (y > max_y) {
+    y = max_y;  // Clamp the start position to the last source row.
+  }
+
+  int yi = y >> 16;  // Integer part of the first source row position.
+
+  YUVBuferIter iter;
+  iter.src_width = src_width;
+  iter.src_height = src_height;
+  iter.src_stride_y = src_stride_y;
+  iter.src_stride_u = src_stride_u;
+  iter.src_stride_v = src_stride_v;
+  iter.src_y = src_y;
+  iter.src_u = src_u;
+  iter.src_v = src_v;
+  YUVBuferIter_Init(iter, src_fourcc, yuv_color_space);
+  iter.MoveTo(iter, yi);
+
+  // Allocate 2 rows of ARGB.
+  const int kRowSize = (dst_width * 4 + 15) & ~15;  // Row bytes rounded up to 16.
+  align_buffer_64(row, kRowSize * 2);
+
+  uint8* rowptr = row;  // Horizontally scaled row used as current.
+  int rowstride = kRowSize;  // Byte offset from current row to next row.
+  int lastyi = yi;
+
+  YUVBuferIter_ConvertToARGBRow(iter, argb_cnv_row);
+  ScaleARGBFilterCols(rowptr, argb_cnv_row, dst_width, x, dx);
+
+  if (filtering == kFilterLinear) {
+    rowstride = 0;  // Linear filtering keeps a single scaled row.
+  }
+  // Prepare next row if necessary
+  if (filtering != kFilterLinear) {
+    if ((yi + 1) < src_height) {
+      iter.MoveToNextRow(iter);
+      YUVBuferIter_ConvertToARGBRow(iter, argb_cnv_row);
+      ScaleARGBFilterCols(rowptr + rowstride, argb_cnv_row, dst_width, x, dx);
+    }else {
+      rowstride = 0;  // No next row available; pass a zero stride.
+    }
+  }
+
+  const int max_yi = src_height - 1;  // Last valid source row index.
+  for (j = 0; j < dst_height; ++j) {
+    yi = y >> 16;
+    if (yi != lastyi) {
+      if (y > max_y) {
+        y = max_y;  // Clamp to the last source row.
+        yi = y >> 16;
+      }
+      if (yi != lastyi) {
+        if (filtering == kFilterLinear) {
+            iter.MoveToNextRow(iter);
+            YUVBuferIter_ConvertToARGBRow(iter, argb_cnv_row);
+            ScaleARGBFilterCols(rowptr, argb_cnv_row, dst_width, x, dx);
+        } else {
+          // Prepare next row if necessary
+          if (yi < max_yi) {
+            iter.MoveToNextRow(iter);
+            rowptr += rowstride;  // Old next row becomes the current row.
+            rowstride = -rowstride;
+            // TODO(fbarchard): Convert the clipped region of row.
+            YUVBuferIter_ConvertToARGBRow(iter, argb_cnv_row);
+            ScaleARGBFilterCols(rowptr + rowstride, argb_cnv_row, dst_width, x, dx);
+          } else {
+            rowstride = 0;  // On the last source row; no next row.
+          }
+        }
+        lastyi = yi;
+      }
+    }
+    if (filtering == kFilterLinear) {
+      InterpolateRow(dst_argb, rowptr, 0, dst_width * 4, 0);
+    } else {
+      int yf = (y >> 8) & 255;  // Top 8 bits of the fractional part of y.
+      InterpolateRow(dst_argb, rowptr, rowstride, dst_width * 4, yf);
+    }
+    dst_argb += dst_stride_argb;
+    y += dy;
+  }
+  free_aligned_buffer_64(row);
+  free_aligned_buffer_64(argb_cnv_row);
+}
+
+// Scale YUV to ARGB to/from any dimensions by point sampling (no
+// interpolation). Positions use 16.16 fixed point for performance: the
+// upper 16 bits of x and dx are the integer part of the source position
+// and the lower 16 bits are the fraction.
+
+static void ScaleYUVToARGBSimple(int src_width, int src_height,
+                                 int dst_width, int dst_height,
+                                 int src_stride_y,
+                                 int src_stride_u,
+                                 int src_stride_v,
+                                 int dst_stride_argb,
+                                 const uint8* src_y,
+                                 const uint8* src_u,
+                                 const uint8* src_v,
+                                 uint8* dst_argb,
+                                 int x, int dx, int y, int dy,
+                                 uint32 src_fourcc,
+                                 YUVColorSpace yuv_color_space) {
+  // Column scaler: start from the C versions, then take faster ones below.
+  void (*scale_cols)(uint8* dst_argb, const uint8* src_argb,
+      int dst_width, int x, int dx) = ScaleARGBCols_C;
+  if (src_width >= 32768) {
+    scale_cols = ScaleARGBCols64_C;
+  }
+#if defined(HAS_SCALEARGBCOLS_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
+    scale_cols = ScaleARGBCols_SSE2;
+  }
+#endif
+#if defined(HAS_SCALEARGBCOLS_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    scale_cols = IS_ALIGNED(dst_width, 8) ?
+        ScaleARGBCols_NEON : ScaleARGBCols_Any_NEON;
+  }
+#endif
+  // Exact 2x horizontal upscale starting in the first half pixel.
+  const bool use_up2 = (src_width * 2 == dst_width) && (x < 0x8000);
+  if (use_up2) {
+    scale_cols = ScaleARGBColsUp2_C;
+#if defined(HAS_SCALEARGBCOLSUP2_SSE2)
+    if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
+      scale_cols = ScaleARGBColsUp2_SSE2;
+    }
+#endif
+  }
+
+  // One ARGB row holds the converted source row being sampled.
+  align_buffer_64(argb_cnv_row, src_width * 4);
+
+  YUVBuferIter iter;
+  iter.src_y = src_y;
+  iter.src_u = src_u;
+  iter.src_v = src_v;
+  iter.src_stride_y = src_stride_y;
+  iter.src_stride_u = src_stride_u;
+  iter.src_stride_v = src_stride_v;
+  iter.src_width = src_width;
+  iter.src_height = src_height;
+  YUVBuferIter_Init(iter, src_fourcc, yuv_color_space);
+
+  // Convert the first needed source row up front.
+  int cached_yi = y >> 16;
+  iter.MoveTo(iter, cached_yi);
+  YUVBuferIter_ConvertToARGBRow(iter, argb_cnv_row);
+
+  for (int out_row = 0; out_row < dst_height; ++out_row) {
+    const int src_yi = y >> 16;
+    if (src_yi != cached_yi) {
+      cached_yi = src_yi;
+      iter.MoveTo(iter, src_yi);
+      YUVBuferIter_ConvertToARGBRow(iter, argb_cnv_row);
+    }
+    scale_cols(dst_argb, argb_cnv_row, dst_width, x, dx);
+    y += dy;
+    dst_argb += dst_stride_argb;
+  }
+  free_aligned_buffer_64(argb_cnv_row);
+}
+
+static void YUVToARGBCopy(const uint8* src_y, int src_stride_y,
+                          const uint8* src_u, int src_stride_u,
+                          const uint8* src_v, int src_stride_v,
+                          int src_width, int src_height,
+                          uint8* dst_argb, int dst_stride_argb,
+                          int dst_width, int dst_height,
+                          uint32 src_fourcc,
+                          YUVColorSpace yuv_color_space)
+{
+  // Unscaled path: convert dst_height consecutive source rows into dst_argb.
+  YUVBuferIter iter;
+  iter.src_y = src_y;
+  iter.src_u = src_u;
+  iter.src_v = src_v;
+  iter.src_stride_y = src_stride_y;
+  iter.src_stride_u = src_stride_u;
+  iter.src_stride_v = src_stride_v;
+  iter.src_width = src_width;
+  iter.src_height = src_height;
+  YUVBuferIter_Init(iter, src_fourcc, yuv_color_space);
+  for (int rows_left = dst_height; rows_left > 0; --rows_left) {
+    YUVBuferIter_ConvertToARGBRow(iter, dst_argb);
+    iter.MoveToNextRow(iter);
+    dst_argb += dst_stride_argb;
+  }
+}
+
+static void ScaleYUVToARGB(const uint8* src_y, int src_stride_y,  // Picks a scaler.
+                           const uint8* src_u, int src_stride_u,
+                           const uint8* src_v, int src_stride_v,
+                           int src_width, int src_height,
+                           uint8* dst_argb, int dst_stride_argb,
+                           int dst_width, int dst_height,
+                           enum FilterMode filtering,
+                           uint32 src_fourcc,
+                           YUVColorSpace yuv_color_space)
+{
+  // Initial source x/y coordinate and step values as 16.16 fixed point.
+  int x = 0;
+  int y = 0;
+  int dx = 0;
+  int dy = 0;
+  // ARGB does not support box filter yet, but allow the user to pass it.
+  // Simplify filtering when possible.
+  filtering = ScaleFilterReduce(src_width, src_height,
+                                dst_width, dst_height,
+                                filtering);
+  ScaleSlope(src_width, src_height, dst_width, dst_height, filtering,
+             &x, &y, &dx, &dy);
+
+  // Special case for integer step values.
+  if (((dx | dy) & 0xffff) == 0) {  // Neither step has fractional bits.
+    if (!dx || !dy) {  // 1 pixel wide and/or tall.
+      filtering = kFilterNone;
+    } else {
+      // Optimized even scale down. ie 2, 4, 6, 8, 10x.
+      if (!(dx & 0x10000) && !(dy & 0x10000)) {
+        if (dx == 0x20000) {
+          // Optimized 1/2 downsample.
+          ScaleYUVToARGBDown2(src_width, src_height,
+                              dst_width, dst_height,
+                              src_stride_y,
+                              src_stride_u,
+                              src_stride_v,
+                              dst_stride_argb,
+                              src_y,
+                              src_u,
+                              src_v,
+                              dst_argb,
+                              x, dx, y, dy,
+                              filtering,
+                              src_fourcc,
+                              yuv_color_space);
+          return;
+        }
+        ScaleYUVToARGBDownEven(src_width, src_height,
+                               dst_width, dst_height,
+                               src_stride_y,
+                               src_stride_u,
+                               src_stride_v,
+                               dst_stride_argb,
+                               src_y,
+                               src_u,
+                               src_v,
+                               dst_argb,
+                               x, dx, y, dy,
+                               filtering,
+                               src_fourcc,
+                               yuv_color_space);
+        return;
+      }
+      // Optimized odd scale down. ie 3, 5, 7, 9x.
+      if ((dx & 0x10000) && (dy & 0x10000)) {
+        filtering = kFilterNone;  // Falls through to the simple scaler below.
+        if (dx == 0x10000 && dy == 0x10000) {
+          // Straight conversion and copy.
+          YUVToARGBCopy(src_y, src_stride_y,
+                        src_u, src_stride_u,
+                        src_v, src_stride_v,
+                        src_width, src_height,
+                        dst_argb, dst_stride_argb,
+                        dst_width, dst_height,
+                        src_fourcc,
+                        yuv_color_space);
+          return;
+        }
+      }
+    }
+  }
+  if (filtering && dy < 65536) {  // Vertical step below 1.0 in 16.16.
+    ScaleYUVToARGBBilinearUp(src_width, src_height,
+                             dst_width, dst_height,
+                             src_stride_y,
+                             src_stride_u,
+                             src_stride_v,
+                             dst_stride_argb,
+                             src_y,
+                             src_u,
+                             src_v,
+                             dst_argb,
+                             x, dx, y, dy,
+                             filtering,
+                             src_fourcc,
+                             yuv_color_space);
+    return;
+  }
+  if (filtering) {  // Filtered with a vertical step of 1.0 or more.
+    ScaleYUVToARGBBilinearDown(src_width, src_height,
+                               dst_width, dst_height,
+                               src_stride_y,
+                               src_stride_u,
+                               src_stride_v,
+                               dst_stride_argb,
+                               src_y,
+                               src_u,
+                               src_v,
+                               dst_argb,
+                               x, dx, y, dy,
+                               filtering,
+                               src_fourcc,
+                               yuv_color_space);
+    return;
+  }
+  ScaleYUVToARGBSimple(src_width, src_height,  // Unfiltered fallback.
+                       dst_width, dst_height,
+                       src_stride_y,
+                       src_stride_u,
+                       src_stride_v,
+                       dst_stride_argb,
+                       src_y,
+                       src_u,
+                       src_v,
+                       dst_argb,
+                       x, dx, y, dy,
+                       src_fourcc,
+                       yuv_color_space);
+}
+
+bool IsConvertSupported(uint32 src_fourcc)
+{
+  // Only the I444, I422 and I420 fourcc codes are accepted.
+  const bool is_i444 = (src_fourcc == FOURCC_I444);
+  const bool is_i422 = (src_fourcc == FOURCC_I422);
+  const bool is_i420 = (src_fourcc == FOURCC_I420);
+
+  return is_i444 || is_i422 || is_i420;
+}
+
+// Converts an I444, I422 or I420 image to ARGB while scaling it to
+// dst_width x dst_height. Returns 0 on success, -1 on invalid arguments.
+LIBYUV_API
+int YUVToARGBScale(const uint8* src_y, int src_stride_y,
+                   const uint8* src_u, int src_stride_u,
+                   const uint8* src_v, int src_stride_v,
+                   uint32 src_fourcc,
+                   YUVColorSpace yuv_color_space,
+                   int src_width, int src_height,
+                   uint8* dst_argb, int dst_stride_argb,
+                   int dst_width, int dst_height,
+                   enum FilterMode filtering)
+{
+  if (!src_y || !src_u || !src_v ||
+      src_width <= 0 || src_height <= 0 ||  // Negative sizes are unsupported here.
+      !dst_argb || dst_width <= 0 || dst_height <= 0 ||
+      !IsConvertSupported(src_fourcc)) {
+    return -1;
+  }
+  ScaleYUVToARGB(src_y, src_stride_y,
+                 src_u, src_stride_u,
+                 src_v, src_stride_v,
+                 src_width, src_height,
+                 dst_argb, dst_stride_argb,
+                 dst_width, dst_height,
+                 filtering,
+                 src_fourcc,
+                 yuv_color_space);
+  return 0;
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/gfx/ycbcr/scale_yuv_argb.h b/gfx/ycbcr/scale_yuv_argb.h
new file mode 100644
index 0000000000..50d21c22fa
--- /dev/null
+++ b/gfx/ycbcr/scale_yuv_argb.h
@@ -0,0 +1,39 @@
+/*
+ *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_SCALE_YUV_ARGB_H_  // NOLINT
+#define INCLUDE_LIBYUV_SCALE_YUV_ARGB_H_
+
+#include "libyuv/basic_types.h"
+#include "libyuv/scale.h"  // For FilterMode
+
+#include "mozilla/gfx/Types.h" // For YUVColorSpace
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+int YUVToARGBScale(const uint8* src_y, int src_stride_y,  // Convert and scale.
+                   const uint8* src_u, int src_stride_u,
+                   const uint8* src_v, int src_stride_v,
+                   uint32 src_fourcc,  // FOURCC_I444, FOURCC_I422 or FOURCC_I420.
+                   mozilla::gfx::YUVColorSpace yuv_color_space,
+                   int src_width, int src_height,
+                   uint8* dst_argb, int dst_stride_argb,
+                   int dst_width, int dst_height,  // Both must be positive.
+                   enum FilterMode filtering);  // Returns 0, or -1 on bad input.
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+
+#endif  // INCLUDE_LIBYUV_SCALE_YUV_ARGB_H_  NOLINT
diff --git a/gfx/ycbcr/ycbcr_to_rgb565.cpp b/gfx/ycbcr/ycbcr_to_rgb565.cpp
new file mode 100644
index 0000000000..0572e3e094
--- /dev/null
+++ b/gfx/ycbcr/ycbcr_to_rgb565.cpp
@@ -0,0 +1,672 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include <stdlib.h>
+#include <limits.h>
+#include "nsDebug.h"
+#include "ycbcr_to_rgb565.h"
+#include "nsAlgorithm.h"
+
+
+
+#ifdef HAVE_YCBCR_TO_RGB565
+
+namespace mozilla {
+
+namespace gfx {
+
+/*All of the parameters the bilinear row converters need to convert one row.
+  Passing them in a struct instead of as individual parameters saves the need
+   to continually push onto the stack the ones that are fixed for every row.*/
+struct yuv2rgb565_row_scale_bilinear_ctx{
+  uint16_t *rgb_row; /*Output row: width RGB565 pixels are written here.*/
+  const uint8_t *y_row; /*First luma row; it is blended with y_row+y_pitch.*/
+  const uint8_t *u_row; /*First Cb row used for this output row.*/
+  const uint8_t *v_row; /*First Cr row used for this output row.*/
+  int y_yweight; /*Weight of the second luma row, in units of 1/256.*/
+  int y_pitch; /*Offset from y_row to the second luma row.*/
+  int width; /*Number of output pixels to produce.*/
+  int source_x0_q16; /*Source x of the first output pixel, 16.16 fixed point.*/
+  int source_dx_q16; /*Source x step per output pixel, 16.16 fixed point.*/
+  /*Not used for 4:4:4, except with chroma-nearest.*/
+  int source_uv_xoffs_q16; /*Added to the luma x before deriving the chroma x.*/
+  /*Not used for 4:4:4 or chroma-nearest.*/
+  int uv_pitch; /*Offset from u_row/v_row to the second chroma row.*/
+  /*Not used for 4:2:2, 4:4:4, or chroma-nearest.*/
+  int uv_yweight; /*Weight of the second chroma row, in units of 1/256.*/
+};
+
+
+
+/*All of the parameters the nearest-neighbor row converters need for one row.
+  Passing them in a struct instead of as individual parameters saves the need
+   to continually push onto the stack the ones that are fixed for every row.*/
+struct yuv2rgb565_row_scale_nearest_ctx{
+  uint16_t *rgb_row; /*Output row: width RGB565 pixels are written here.*/
+  const uint8_t *y_row; /*Luma row sampled for this output row.*/
+  const uint8_t *u_row; /*Cb row sampled for this output row.*/
+  const uint8_t *v_row; /*Cr row sampled for this output row.*/
+  int width; /*Number of output pixels to produce.*/
+  int source_x0_q16; /*Source x of the first output pixel, 16.16 fixed point.*/
+  int source_dx_q16; /*Source x step per output pixel, 16.16 fixed point.*/
+  /*Not used for 4:4:4.*/
+  int source_uv_xoffs_q16; /*Added to the luma x before deriving the chroma x.*/
+};
+
+
+
+typedef void (*yuv2rgb565_row_scale_bilinear_func)( /*Bilinear row converter.*/
+ const yuv2rgb565_row_scale_bilinear_ctx *ctx, int dither); /*dither: 0..3.*/
+
+typedef void (*yuv2rgb565_row_scale_nearest_func)( /*Nearest row converter.*/
+ const yuv2rgb565_row_scale_nearest_ctx *ctx, int dither); /*dither: 0..3.*/
+
+
+
+//TODO: fix NEON asm for iOS
+# if defined(MOZILLA_MAY_SUPPORT_NEON) && !defined(__APPLE__)
+
+extern "C" void ScaleYCbCr42xToRGB565_BilinearY_Row_NEON(
+ const yuv2rgb565_row_scale_bilinear_ctx *ctx, int dither);
+
+void __attribute((noinline)) yuv42x_to_rgb565_row_neon(uint16 *dst,
+                                                       const uint8 *y,
+                                                       const uint8 *u,
+                                                       const uint8 *v,
+                                                       int n,
+                                                       int oddflag);
+
+#endif
+
+
+
+/*Bilinear interpolation of a single value.
+  This uses the exact same formulas as the asm, even though it adds some extra
+   shifts that do nothing but reduce accuracy.*/
+static int bislerp(const uint8_t *row, /*First of the two rows sampled.*/
+                   int pitch, /*Offset from row to the second row.*/
+                   int source_x, /*Index of the first of the two columns.*/
+                   int xweight, /*Weight of column source_x+1, in 1/256 units.*/
+                   int yweight) { /*Weight of the second row, in 1/256 units.*/
+  int a;
+  int b;
+  int c;
+  int d;
+  a = row[source_x]; /*The four samples of the 2x2 neighborhood.*/
+  b = row[source_x+1];
+  c = row[source_x+pitch];
+  d = row[source_x+pitch+1];
+  a = ((a<<8)+(c-a)*yweight+128)>>8; /*Blend column source_x vertically.*/
+  b = ((b<<8)+(d-b)*yweight+128)>>8; /*Blend column source_x+1 vertically.*/
+  return ((a<<8)+(b-a)*xweight+128)>>8; /*Blend the two results horizontally.*/
+}
+
+/*Convert a single pixel from Y'CbCr to RGB565.
+  This uses the exact same formulas as the asm, even though we could make the
+   constants a lot more accurate with 32-bit wide registers.*/
+static uint16_t yu2rgb565(int y, int u, int v, int dither) { /*dither: 0..3.*/
+  /*This combines the constant offset that needs to be added during the Y'CbCr
+     conversion with a rounding offset that depends on the dither parameter.*/
+  static const int DITHER_BIAS[4][3]={ /*Rows: dither 0..3; columns: R, G, B.*/
+    {-14240,    8704,    -17696},
+    {-14240+128,8704+64, -17696+128},
+    {-14240+256,8704+128,-17696+256}, /*Half of each divisor: plain rounding.*/
+    {-14240+384,8704+192,-17696+384}
+  };
+  int r;
+  int g;
+  int b;
+  r = clamped((74*y+102*v+DITHER_BIAS[dither][0])>>9, 0, 31); /*5 bits.*/
+  g = clamped((74*y-25*u-52*v+DITHER_BIAS[dither][1])>>8, 0, 63); /*6 bits.*/
+  b = clamped((74*y+129*u+DITHER_BIAS[dither][2])>>9, 0, 31); /*5 bits.*/
+  return (uint16_t)(r<<11 | g<<5 | b); /*R in bits 15..11, G 10..5, B 4..0.*/
+}
+
+static void ScaleYCbCr420ToRGB565_Bilinear_Row_C( /*Bilinear luma and chroma.*/
+ const yuv2rgb565_row_scale_bilinear_ctx *ctx, int dither){
+  int x;
+  int source_x_q16; /*Current luma x position, 16.16 fixed point.*/
+  source_x_q16 = ctx->source_x0_q16;
+  for (x = 0; x < ctx->width; x++) {
+    int source_x;
+    int xweight;
+    int y;
+    int u;
+    int v;
+    xweight = ((source_x_q16&0xFFFF)+128)>>8; /*Rounded fraction, 0..256.*/
+    source_x = source_x_q16>>16; /*Integer luma column.*/
+    y = bislerp(ctx->y_row, ctx->y_pitch, source_x, xweight, ctx->y_yweight);
+    xweight = (((source_x_q16+ctx->source_uv_xoffs_q16)&0x1FFFF)+256)>>9; /*Chroma is half width: 17 fraction bits.*/
+    source_x = (source_x_q16+ctx->source_uv_xoffs_q16)>>17; /*Integer chroma column.*/
+    source_x_q16 += ctx->source_dx_q16;
+    u = bislerp(ctx->u_row, ctx->uv_pitch, source_x, xweight, ctx->uv_yweight); /*Chroma has its own vertical weight.*/
+    v = bislerp(ctx->v_row, ctx->uv_pitch, source_x, xweight, ctx->uv_yweight);
+    ctx->rgb_row[x] = yu2rgb565(y, u, v, dither);
+    dither ^= 3; /*Alternate between bias k and 3-k on successive pixels.*/
+  }
+}
+
+static void ScaleYCbCr422ToRGB565_Bilinear_Row_C( /*Bilinear luma and chroma.*/
+ const yuv2rgb565_row_scale_bilinear_ctx *ctx, int dither){
+  int x;
+  int source_x_q16; /*Current luma x position, 16.16 fixed point.*/
+  source_x_q16 = ctx->source_x0_q16;
+  for (x = 0; x < ctx->width; x++) {
+    int source_x;
+    int xweight;
+    int y;
+    int u;
+    int v;
+    xweight = ((source_x_q16&0xFFFF)+128)>>8; /*Rounded fraction, 0..256.*/
+    source_x = source_x_q16>>16; /*Integer luma column.*/
+    y = bislerp(ctx->y_row, ctx->y_pitch, source_x, xweight, ctx->y_yweight);
+    xweight = (((source_x_q16+ctx->source_uv_xoffs_q16)&0x1FFFF)+256)>>9; /*Chroma is half width: 17 fraction bits.*/
+    source_x = (source_x_q16+ctx->source_uv_xoffs_q16)>>17; /*Integer chroma column.*/
+    source_x_q16 += ctx->source_dx_q16;
+    u = bislerp(ctx->u_row, ctx->uv_pitch, source_x, xweight, ctx->y_yweight); /*Reuses the luma vertical weight; uv_yweight is unused for 4:2:2.*/
+    v = bislerp(ctx->v_row, ctx->uv_pitch, source_x, xweight, ctx->y_yweight);
+    ctx->rgb_row[x] = yu2rgb565(y, u, v, dither);
+    dither ^= 3; /*Alternate between bias k and 3-k on successive pixels.*/
+  }
+}
+
+static void ScaleYCbCr444ToRGB565_Bilinear_Row_C( /*Bilinear luma and chroma.*/
+ const yuv2rgb565_row_scale_bilinear_ctx *ctx, int dither){
+  int x;
+  int source_x_q16; /*Current source x position, 16.16 fixed point.*/
+  source_x_q16 = ctx->source_x0_q16;
+  for (x = 0; x < ctx->width; x++) {
+    int source_x;
+    int xweight;
+    int y;
+    int u;
+    int v;
+    xweight = ((source_x_q16&0xFFFF)+128)>>8; /*Rounded fraction, 0..256.*/
+    source_x = source_x_q16>>16; /*Integer column, shared by all three planes.*/
+    source_x_q16 += ctx->source_dx_q16;
+    y = bislerp(ctx->y_row, ctx->y_pitch, source_x, xweight, ctx->y_yweight);
+    u = bislerp(ctx->u_row, ctx->y_pitch, source_x, xweight, ctx->y_yweight); /*Uses y_pitch for chroma: the caller asserts equal strides for 4:4:4.*/
+    v = bislerp(ctx->v_row, ctx->y_pitch, source_x, xweight, ctx->y_yweight);
+    ctx->rgb_row[x] = yu2rgb565(y, u, v, dither);
+    dither ^= 3; /*Alternate between bias k and 3-k on successive pixels.*/
+  }
+}
+
+static void ScaleYCbCr42xToRGB565_BilinearY_Row_C( /*Bilinear luma, nearest chroma.*/
+ const yuv2rgb565_row_scale_bilinear_ctx *ctx, int dither){
+  int x;
+  int source_x_q16; /*Current luma x position, 16.16 fixed point.*/
+  source_x_q16 = ctx->source_x0_q16;
+  for (x = 0; x < ctx->width; x++) {
+    int source_x;
+    int xweight;
+    int y;
+    int u;
+    int v;
+    xweight = ((source_x_q16&0xFFFF)+128)>>8; /*Rounded fraction, 0..256.*/
+    source_x = source_x_q16>>16; /*Integer luma column.*/
+    y = bislerp(ctx->y_row, ctx->y_pitch, source_x, xweight, ctx->y_yweight);
+    source_x = (source_x_q16+ctx->source_uv_xoffs_q16)>>17; /*Half-width chroma column; the caller folds the rounding offset into the xoffs.*/
+    source_x_q16 += ctx->source_dx_q16;
+    u = ctx->u_row[source_x]; /*Single chroma sample, no interpolation.*/
+    v = ctx->v_row[source_x];
+    ctx->rgb_row[x] = yu2rgb565(y, u, v, dither);
+    dither ^= 3; /*Alternate between bias k and 3-k on successive pixels.*/
+  }
+}
+
+static void ScaleYCbCr444ToRGB565_BilinearY_Row_C( /*Bilinear luma, nearest chroma.*/
+ const yuv2rgb565_row_scale_bilinear_ctx *ctx, int dither){
+  int x;
+  int source_x_q16; /*Current source x position, 16.16 fixed point.*/
+  source_x_q16 = ctx->source_x0_q16;
+  for (x = 0; x < ctx->width; x++) {
+    int source_x;
+    int xweight;
+    int y;
+    int u;
+    int v;
+    xweight = ((source_x_q16&0xFFFF)+128)>>8; /*Rounded fraction, 0..256.*/
+    source_x = source_x_q16>>16; /*Integer luma column.*/
+    y = bislerp(ctx->y_row, ctx->y_pitch, source_x, xweight, ctx->y_yweight);
+    source_x = (source_x_q16+ctx->source_uv_xoffs_q16)>>16; /*Full-width chroma column; the caller folds the rounding offset into the xoffs.*/
+    source_x_q16 += ctx->source_dx_q16;
+    u = ctx->u_row[source_x]; /*Single chroma sample, no interpolation.*/
+    v = ctx->v_row[source_x];
+    ctx->rgb_row[x] = yu2rgb565(y, u, v, dither);
+    dither ^= 3; /*Alternate between bias k and 3-k on successive pixels.*/
+  }
+}
+
+static void ScaleYCbCr42xToRGB565_Nearest_Row_C( /*Nearest luma and chroma.*/
+ const yuv2rgb565_row_scale_nearest_ctx *ctx, int dither){
+  int y;
+  int u;
+  int v;
+  int x;
+  int source_x_q16; /*Current luma x position, 16.16 fixed point.*/
+  int source_x;
+  source_x_q16 = ctx->source_x0_q16; /*The caller has already added the rounding offset.*/
+  for (x = 0; x < ctx->width; x++) {
+    source_x = source_x_q16>>16; /*Integer luma column.*/
+    y = ctx->y_row[source_x];
+    source_x = (source_x_q16+ctx->source_uv_xoffs_q16)>>17; /*Half-width chroma column.*/
+    source_x_q16 += ctx->source_dx_q16;
+    u = ctx->u_row[source_x];
+    v = ctx->v_row[source_x];
+    ctx->rgb_row[x] = yu2rgb565(y, u, v, dither);
+    dither ^= 3; /*Alternate between bias k and 3-k on successive pixels.*/
+  }
+}
+
+static void ScaleYCbCr444ToRGB565_Nearest_Row_C( /*Nearest luma and chroma.*/
+ const yuv2rgb565_row_scale_nearest_ctx *ctx, int dither){
+  int y;
+  int u;
+  int v;
+  int x;
+  int source_x_q16; /*Current source x position, 16.16 fixed point.*/
+  int source_x;
+  source_x_q16 = ctx->source_x0_q16; /*The caller has already added the rounding offset.*/
+  for (x = 0; x < ctx->width; x++) {
+    source_x = source_x_q16>>16; /*One column index serves all three planes.*/
+    source_x_q16 += ctx->source_dx_q16;
+    y = ctx->y_row[source_x];
+    u = ctx->u_row[source_x];
+    v = ctx->v_row[source_x];
+    ctx->rgb_row[x] = yu2rgb565(y, u, v, dither);
+    dither ^= 3; /*Alternate between bias k and 3-k on successive pixels.*/
+  }
+}
+
+void ScaleYCbCrToRGB565(const uint8_t *y_buf, /*Y', Cb and Cr source planes.*/
+                                 const uint8_t *u_buf,
+                                 const uint8_t *v_buf,
+                                 uint8_t *rgb_buf, /*Destination; each row holds uint16_t RGB565.*/
+                                 int source_x0, /*Source rectangle, in luma pixels.*/
+                                 int source_y0,
+                                 int source_width, /*May be negative to flip the source.*/
+                                 int source_height,
+                                 int width, /*Destination size; nothing is drawn unless both are positive.*/
+                                 int height,
+                                 int y_pitch, /*Row strides, in bytes.*/
+                                 int uv_pitch,
+                                 int rgb_pitch,
+                                 YUVType yuv_type,
+                                 ScaleFilter filter) { /*FILTER_NONE selects nearest-neighbor.*/
+  int source_x0_q16;
+  int source_y0_q16;
+  int source_dx_q16;
+  int source_dy_q16;
+  int source_uv_xoffs_q16;
+  int source_uv_yoffs_q16;
+  int x_shift;
+  int y_shift;
+  int ymin;
+  int ymax;
+  int uvmin;
+  int uvmax;
+  int dither;
+  /*We don't support negative destination rectangles (just flip the source
+     instead), and for empty ones there's nothing to do.*/
+  if (width <= 0 || height <= 0)
+    return;
+  /*These bounds are required to avoid 16.16 fixed-point overflow.*/
+  NS_ASSERTION(source_x0 > (INT_MIN>>16) && source_x0 < (INT_MAX>>16),
+    "ScaleYCbCrToRGB565 source X offset out of bounds.");
+  NS_ASSERTION(source_x0+source_width > (INT_MIN>>16)
+            && source_x0+source_width < (INT_MAX>>16),
+    "ScaleYCbCrToRGB565 source width out of bounds.");
+  NS_ASSERTION(source_y0 > (INT_MIN>>16) && source_y0 < (INT_MAX>>16),
+    "ScaleYCbCrToRGB565 source Y offset out of bounds.");
+  NS_ASSERTION(source_y0+source_height > (INT_MIN>>16)
+            && source_y0+source_height < (INT_MAX>>16),
+    "ScaleYCbCrToRGB565 source height out of bounds.");
+  /*We require the same stride for Y' and Cb and Cr for 4:4:4 content.*/
+  NS_ASSERTION(yuv_type != YV24 || y_pitch == uv_pitch,
+    "ScaleYCbCrToRGB565 luma stride differs from chroma for 4:4:4 content.");
+  /*We assume we can read outside the bounds of the input, because it makes
+     the code much simpler (and in practice is true: both Theora and VP8 return
+     padded reference frames).
+    In practice, we do not even _have_ the actual bounds of the source, as
+     we are passed a crop rectangle from it, and not the dimensions of the full
+     image.
+    This assertion will not guarantee our out-of-bounds reads are safe, but it
+     should at least catch the simple case of passing in an unpadded buffer.*/
+  NS_ASSERTION(abs(y_pitch) >= abs(source_width)+16,
+    "ScaleYCbCrToRGB565 source image unpadded?");
+  /*The NEON code requires the pointers to be aligned to a 16-byte boundary at
+     the start of each row.
+    This should be true for all of our sources.
+    We could try to fix this up if it's not true by adjusting source_x0, but
+     that would require the mis-alignment to be the same for the U and V
+     planes.*/
+  NS_ASSERTION((y_pitch&15) == 0 && (uv_pitch&15) == 0 &&
+   (reinterpret_cast<uintptr_t>(y_buf)&15) == 0 &&
+   (reinterpret_cast<uintptr_t>(u_buf)&15) == 0 &&
+   (reinterpret_cast<uintptr_t>(v_buf)&15) == 0,
+   "ScaleYCbCrToRGB565 source image unaligned");
+  /*We take an area-based approach to pixel coverage to avoid shifting by small
+     amounts (or not so small, when up-scaling or down-scaling by a large
+     factor).
+
+    An illustrative example: scaling 4:2:0 up by 2, using JPEG chroma cositing^.
+
+    + = RGB destination locations
+    * = Y' source locations
+    - = Cb, Cr source locations
+
+    +   +   +   +  +   +   +   +
+      *       *      *       *
+    +   +   +   +  +   +   +   +
+          -              -
+    +   +   +   +  +   +   +   +
+      *       *      *       *
+    +   +   +   +  +   +   +   +
+
+    +   +   +   +  +   +   +   +
+      *       *      *       *
+    +   +   +   +  +   +   +   +
+          -              -
+    +   +   +   +  +   +   +   +
+      *       *      *       *
+    +   +   +   +  +   +   +   +
+
+    So, the coordinates of the upper-left + (first destination site) should
+     be (-0.25,-0.25) in the source Y' coordinate system.
+    Similarly, the coordinates should be (-0.375,-0.375) in the source Cb, Cr
+     coordinate system.
+    Note that the origin and scale of these two coordinate systems is not the
+     same!
+
+    ^JPEG cositing is required for Theora; VP8 doesn't specify cositing rules,
+     but nearly all software converters in existence (at least those that are
+     open source, and many that are not) use JPEG cositing instead of MPEG.*/
+  source_dx_q16 = (source_width<<16) / width;
+  source_x0_q16 = (source_x0<<16)+(source_dx_q16>>1)-0x8000;
+  source_dy_q16 = (source_height<<16) / height;
+  source_y0_q16 = (source_y0<<16)+(source_dy_q16>>1)-0x8000;
+  x_shift = (yuv_type != YV24); /*1 when chroma is half width (4:2:0, 4:2:2).*/
+  y_shift = (yuv_type == YV12); /*1 when chroma is half height (4:2:0 only).*/
+  /*These two variables hold the difference between the origins of the Y' and
+     the Cb, Cr coordinate systems, using the scale of the Y' coordinate
+     system.*/
+  source_uv_xoffs_q16 = -(x_shift<<15);
+  source_uv_yoffs_q16 = -(y_shift<<15);
+  /*Compute the range of source rows we'll actually use.
+    This doesn't guarantee we won't read outside this range.*/
+  ymin = source_height >= 0 ? source_y0 : source_y0+source_height-1;
+  ymax = source_height >= 0 ? source_y0+source_height-1 : source_y0;
+  uvmin = ymin>>y_shift;
+  uvmax = ((ymax+1+y_shift)>>y_shift)-1;
+  /*Pick a dithering pattern.
+    The "&3" at the end is just in case RAND_MAX is lying.*/
+  dither = (rand()/(RAND_MAX>>2))&3;
+  /*Nearest-neighbor scaling.*/
+  if (filter == FILTER_NONE) {
+    yuv2rgb565_row_scale_nearest_ctx ctx;
+    yuv2rgb565_row_scale_nearest_func scale_row;
+    int y;
+    /*Add rounding offsets once, in advance.*/
+    source_x0_q16 += 0x8000;
+    source_y0_q16 += 0x8000;
+    source_uv_xoffs_q16 += (x_shift<<15);
+    source_uv_yoffs_q16 += (y_shift<<15);
+    if (yuv_type != YV24) /*4:2:0 and 4:2:2 both have half-width chroma rows.*/
+      scale_row = ScaleYCbCr42xToRGB565_Nearest_Row_C;
+    else
+      scale_row = ScaleYCbCr444ToRGB565_Nearest_Row_C;
+    ctx.width = width;
+    ctx.source_x0_q16 = source_x0_q16;
+    ctx.source_dx_q16 = source_dx_q16;
+    ctx.source_uv_xoffs_q16 = source_uv_xoffs_q16;
+    for (y=0; y<height; y++) {
+      int source_y;
+      ctx.rgb_row = (uint16_t *)(rgb_buf + y*rgb_pitch);
+      source_y = source_y0_q16>>16;
+      source_y = clamped(source_y, ymin, ymax);
+      ctx.y_row = y_buf + source_y*y_pitch;
+      source_y = (source_y0_q16+source_uv_yoffs_q16)>>(16+y_shift);
+      source_y = clamped(source_y, uvmin, uvmax);
+      source_y0_q16 += source_dy_q16;
+      ctx.u_row = u_buf + source_y*uv_pitch;
+      ctx.v_row = v_buf + source_y*uv_pitch;
+      (*scale_row)(&ctx, dither);
+      dither ^= 2; /*Shift the dither pattern on every row.*/
+    }
+  }
+  /*Bilinear scaling.*/
+  else {
+    yuv2rgb565_row_scale_bilinear_ctx ctx;
+    yuv2rgb565_row_scale_bilinear_func scale_row;
+    int uvxscale_min;
+    int uvxscale_max;
+    int uvyscale_min;
+    int uvyscale_max;
+    int y;
+    /*Check how close the chroma scaling is to unity.
+      If it's close enough, we can get away with nearest-neighbor chroma
+       sub-sampling, and only doing bilinear on luma.
+      If a given axis is subsampled, we use bounds on the luma step of
+       [0.67...2], which is equivalent to scaling chroma by [1...3].
+      If it's not subsampled, we use bounds of [0.5...1.33], which is
+       equivalent to scaling chroma by [0.75...2].
+      The lower bound is chosen as a trade-off between speed and how terrible
+       nearest neighbor looks when upscaling.*/
+# define CHROMA_NEAREST_SUBSAMP_STEP_MIN  0xAAAA
+# define CHROMA_NEAREST_NORMAL_STEP_MIN   0x8000
+# define CHROMA_NEAREST_SUBSAMP_STEP_MAX 0x20000
+# define CHROMA_NEAREST_NORMAL_STEP_MAX  0x15555
+    uvxscale_min = yuv_type != YV24 ?
+     CHROMA_NEAREST_SUBSAMP_STEP_MIN : CHROMA_NEAREST_NORMAL_STEP_MIN;
+    uvxscale_max = yuv_type != YV24 ?
+     CHROMA_NEAREST_SUBSAMP_STEP_MAX : CHROMA_NEAREST_NORMAL_STEP_MAX;
+    uvyscale_min = yuv_type == YV12 ?
+     CHROMA_NEAREST_SUBSAMP_STEP_MIN : CHROMA_NEAREST_NORMAL_STEP_MIN;
+    uvyscale_max = yuv_type == YV12 ?
+     CHROMA_NEAREST_SUBSAMP_STEP_MAX : CHROMA_NEAREST_NORMAL_STEP_MAX;
+    if (uvxscale_min <= abs(source_dx_q16)
+     && abs(source_dx_q16) <= uvxscale_max
+     && uvyscale_min <= abs(source_dy_q16)
+     && abs(source_dy_q16) <= uvyscale_max) {
+      /*Add the rounding offsets now.*/
+      source_uv_xoffs_q16 += 1<<(15+x_shift);
+      source_uv_yoffs_q16 += 1<<(15+y_shift);
+      if (yuv_type != YV24) {
+        scale_row =
+//TODO: fix NEON asm for iOS
+#  if defined(MOZILLA_MAY_SUPPORT_NEON) && !defined(__APPLE__)
+         supports_neon() ? ScaleYCbCr42xToRGB565_BilinearY_Row_NEON :
+#  endif
+         ScaleYCbCr42xToRGB565_BilinearY_Row_C;
+      }
+      else
+        scale_row = ScaleYCbCr444ToRGB565_BilinearY_Row_C;
+    }
+    else {
+      if (yuv_type == YV12)
+        scale_row = ScaleYCbCr420ToRGB565_Bilinear_Row_C;
+      else if (yuv_type == YV16)
+        scale_row = ScaleYCbCr422ToRGB565_Bilinear_Row_C;
+      else
+        scale_row = ScaleYCbCr444ToRGB565_Bilinear_Row_C;
+    }
+    ctx.width = width;
+    ctx.y_pitch = y_pitch;
+    ctx.source_x0_q16 = source_x0_q16;
+    ctx.source_dx_q16 = source_dx_q16;
+    ctx.source_uv_xoffs_q16 = source_uv_xoffs_q16;
+    ctx.uv_pitch = uv_pitch;
+    for (y=0; y<height; y++) {
+      int source_y;
+      int yweight;
+      int uvweight;
+      ctx.rgb_row = (uint16_t *)(rgb_buf + y*rgb_pitch);
+      source_y = (source_y0_q16+128)>>16;
+      yweight = ((source_y0_q16+128)>>8)&0xFF;
+      if (source_y < ymin) { /*Outside the source rows: use the edge row alone.*/
+        source_y = ymin;
+        yweight = 0;
+      }
+      if (source_y > ymax) {
+        source_y = ymax;
+        yweight = 0;
+      }
+      ctx.y_row = y_buf + source_y*y_pitch;
+      source_y = source_y0_q16+source_uv_yoffs_q16+(128<<y_shift);
+      source_y0_q16 += source_dy_q16;
+      uvweight = source_y>>(8+y_shift)&0xFF;
+      source_y >>= 16+y_shift;
+      if (source_y < uvmin) { /*Same edge handling for the chroma rows.*/
+        source_y = uvmin;
+        uvweight = 0;
+      }
+      if (source_y > uvmax) {
+        source_y = uvmax;
+        uvweight = 0;
+      }
+      ctx.u_row = u_buf + source_y*uv_pitch;
+      ctx.v_row = v_buf + source_y*uv_pitch;
+      ctx.y_yweight = yweight;
+      ctx.uv_yweight = uvweight;
+      (*scale_row)(&ctx, dither);
+      dither ^= 2; /*Shift the dither pattern on every row.*/
+    }
+  }
+}
+
+bool IsScaleYCbCrToRGB565Fast(int source_x0, /*Same arguments as ScaleYCbCrToRGB565, minus the buffers and strides.*/
+                                       int source_y0,
+                                       int source_width,
+                                       int source_height,
+                                       int width,
+                                       int height,
+                                       YUVType yuv_type,
+                                       ScaleFilter filter)
+{
+  // An empty destination means there is no work at all, so it is trivially fast.
+  if (width <= 0 || height <= 0)
+    return true;
+#  if defined(MOZILLA_MAY_SUPPORT_NEON) && !defined(__APPLE__) // Match the guard ScaleYCbCrToRGB565 puts around its NEON row function.
+  if (filter != FILTER_NONE) { // Only the bilinear path has a NEON row function.
+    int source_dx_q16;
+    int source_dy_q16;
+    int uvxscale_min;
+    int uvxscale_max;
+    int uvyscale_min;
+    int uvyscale_max;
+    source_dx_q16 = (source_width<<16) / width;
+    source_dy_q16 = (source_height<<16) / height;
+    uvxscale_min = yuv_type != YV24 ? // The CHROMA_NEAREST_* macros are #defined inside ScaleYCbCrToRGB565.
+     CHROMA_NEAREST_SUBSAMP_STEP_MIN : CHROMA_NEAREST_NORMAL_STEP_MIN;
+    uvxscale_max = yuv_type != YV24 ?
+     CHROMA_NEAREST_SUBSAMP_STEP_MAX : CHROMA_NEAREST_NORMAL_STEP_MAX;
+    uvyscale_min = yuv_type == YV12 ?
+     CHROMA_NEAREST_SUBSAMP_STEP_MIN : CHROMA_NEAREST_NORMAL_STEP_MIN;
+    uvyscale_max = yuv_type == YV12 ?
+     CHROMA_NEAREST_SUBSAMP_STEP_MAX : CHROMA_NEAREST_NORMAL_STEP_MAX;
+    if (uvxscale_min <= abs(source_dx_q16) // Same chroma-nearest test as ScaleYCbCrToRGB565.
+     && abs(source_dx_q16) <= uvxscale_max
+     && uvyscale_min <= abs(source_dy_q16)
+     && abs(source_dy_q16) <= uvyscale_max) {
+      if (yuv_type != YV24) // The NEON row function only covers 4:2:0 and 4:2:2.
+        return supports_neon();
+    }
+  }
+#  endif
+  return false;
+}
+
+
+
+void yuv_to_rgb565_row_c(uint16 *dst, // Receives pic_width RGB565 pixels.
+                         const uint8 *y, // Start of the luma row (before pic_x is applied).
+                         const uint8 *u, // Start of the matching chroma rows.
+                         const uint8 *v,
+                         int x_shift, // 1 when chroma is half width, else 0.
+                         int pic_x, // First luma column to convert.
+                         int pic_width) // Number of pixels to convert.
+{
+  int x;
+  for (x = 0; x < pic_width; x++)
+  {
+    dst[x] = yu2rgb565(y[pic_x+x],
+                       u[(pic_x+x)>>x_shift], // Chroma column for this luma column.
+                       v[(pic_x+x)>>x_shift],
+                       2); // Disable dithering for now (bias 2 is plain rounding).
+  }
+}
+
+void ConvertYCbCrToRGB565(const uint8* y_buf, // Y', Cb and Cr source planes.
+                                   const uint8* u_buf,
+                                   const uint8* v_buf,
+                                   uint8* rgb_buf, // Destination; each row holds uint16 RGB565.
+                                   int pic_x, // Picture rectangle, in luma pixels.
+                                   int pic_y,
+                                   int pic_width,
+                                   int pic_height,
+                                   int y_pitch, // Row strides, in bytes.
+                                   int uv_pitch,
+                                   int rgb_pitch,
+                                   YUVType yuv_type)
+{
+  int x_shift;
+  int y_shift;
+  x_shift = yuv_type != YV24; // 1 when chroma is half width.
+  y_shift = yuv_type == YV12; // 1 when chroma is half height.
+//TODO: fix NEON asm for iOS
+#  if defined(MOZILLA_MAY_SUPPORT_NEON) && !defined(__APPLE__)
+  if (yuv_type != YV24 && supports_neon())
+  {
+    for (int i = 0; i < pic_height; i++) {
+      int yoffs;
+      int uvoffs;
+      yoffs = y_pitch * (pic_y+i) + pic_x; // The NEON row takes pointers already advanced to pic_x.
+      uvoffs = uv_pitch * ((pic_y+i)>>y_shift) + (pic_x>>x_shift);
+      yuv42x_to_rgb565_row_neon((uint16*)(rgb_buf + rgb_pitch * i),
+                                y_buf + yoffs,
+                                u_buf + uvoffs,
+                                v_buf + uvoffs,
+                                pic_width,
+                                pic_x&x_shift); // Nonzero when the row starts on an odd luma column.
+    }
+  }
+  else
+#  endif
+  {
+    for (int i = 0; i < pic_height; i++) {
+      int yoffs;
+      int uvoffs;
+      yoffs = y_pitch * (pic_y+i); // Row starts only; the C row applies pic_x itself.
+      uvoffs = uv_pitch * ((pic_y+i)>>y_shift);
+      yuv_to_rgb565_row_c((uint16*)(rgb_buf + rgb_pitch * i),
+                          y_buf + yoffs,
+                          u_buf + uvoffs,
+                          v_buf + uvoffs,
+                          x_shift,
+                          pic_x,
+                          pic_width);
+    }
+  }
+}
+
+bool IsConvertYCbCrToRGB565Fast(int pic_x, // The rectangle arguments are currently unused.
+                                         int pic_y,
+                                         int pic_width,
+                                         int pic_height,
+                                         YUVType yuv_type)
+{
+#  if defined(MOZILLA_MAY_SUPPORT_NEON) && !defined(__APPLE__) // Match the guard ConvertYCbCrToRGB565 puts around its NEON path.
+  return (yuv_type != YV24 && supports_neon()); // The NEON path only covers 4:2:0 and 4:2:2.
+#  else
+  return false; // Only the plain C conversion is compiled in.
+#  endif
+}
+
+} // namespace gfx
+
+} // namespace mozilla
+
+#endif // HAVE_YCBCR_TO_RGB565
diff --git a/gfx/ycbcr/ycbcr_to_rgb565.h b/gfx/ycbcr/ycbcr_to_rgb565.h
new file mode 100644
index 0000000000..e7839ee8bb
--- /dev/null
+++ b/gfx/ycbcr/ycbcr_to_rgb565.h
@@ -0,0 +1,72 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+#ifndef MEDIA_BASE_YCBCR_TO_RGB565_H_
+#define MEDIA_BASE_YCBCR_TO_RGB565_H_
+#include "yuv_convert.h"
+#include "mozilla/arm.h"
+
+// It's currently only worth including this if we have NEON support.
+#if defined(__arm__) && defined(MOZILLA_MAY_SUPPORT_NEON)
+#define HAVE_YCBCR_TO_RGB565 1
+#endif
+
+namespace mozilla {
+
+namespace gfx {
+
+#ifdef HAVE_YCBCR_TO_RGB565
+// Convert a frame of YUV to 16 bit RGB565.
+void ConvertYCbCrToRGB565(const uint8* yplane,
+                                   const uint8* uplane,
+                                   const uint8* vplane,
+                                   uint8* rgbframe,
+                                   int pic_x,
+                                   int pic_y,
+                                   int pic_width,
+                                   int pic_height,
+                                   int ystride,
+                                   int uvstride,
+                                   int rgbstride,
+                                   YUVType yuv_type);
+
+// Used to test if we have an accelerated version.
+bool IsConvertYCbCrToRGB565Fast(int pic_x,
+                                         int pic_y,
+                                         int pic_width,
+                                         int pic_height,
+                                         YUVType yuv_type);
+
+// Scale a frame of YUV to 16 bit RGB565.
+void ScaleYCbCrToRGB565(const uint8_t *yplane,
+                                 const uint8_t *uplane,
+                                 const uint8_t *vplane,
+                                 uint8_t *rgbframe,
+                                 int source_x0,
+                                 int source_y0,
+                                 int source_width,
+                                 int source_height,
+                                 int width,
+                                 int height,
+                                 int ystride,
+                                 int uvstride,
+                                 int rgbstride,
+                                 YUVType yuv_type,
+                                 ScaleFilter filter);
+
+// Used to test if we have an accelerated version.
+bool IsScaleYCbCrToRGB565Fast(int source_x0,
+                                       int source_y0,
+                                       int source_width,
+                                       int source_height,
+                                       int width,
+                                       int height,
+                                       YUVType yuv_type,
+                                       ScaleFilter filter);
+#endif // HAVE_YCBCR_TO_RGB565
+
+} // namespace gfx
+
+} // namespace mozilla
+
+#endif // MEDIA_BASE_YCBCR_TO_RGB565_H_
diff --git a/gfx/ycbcr/yuv_convert.cpp b/gfx/ycbcr/yuv_convert.cpp
new file mode 100644
index 0000000000..9ddc35e08c
--- /dev/null
+++ b/gfx/ycbcr/yuv_convert.cpp
@@ -0,0 +1,594 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// This webpage shows layout of YV12 and other YUV formats
+// http://www.fourcc.org/yuv.php
+// The actual conversion is best described here
+// http://en.wikipedia.org/wiki/YUV
+// An article on optimizing YUV conversion using tables instead of multiplies
+// http://lestourtereaux.free.fr/papers/data/yuvrgb.pdf
+//
+// YV12 is a full plane of Y and half height, half width chroma planes
+// YV16 is a full plane of Y and full height, half width chroma planes
+// YV24 is a full plane of Y and full height, full width chroma planes
+// Y8   is a full plane of Y and no chroma planes (i.e., monochrome)
+//
+// ARGB pixel format is output, which on little endian is stored as BGRA.
+// The alpha is set to 255, allowing the application to use RGBA or RGB32.
+
+#include "yuv_convert.h"
+
+#include "mozilla/StaticPrefs_gfx.h"
+#include "libyuv.h"
+#include "scale_yuv_argb.h"
+// Header for low level row functions.
+#include "yuv_row.h"
+#include "mozilla/SSE.h"
+#include "mozilla/IntegerRange.h"
+
+namespace mozilla {
+
+namespace gfx {
+
+// 16.16 fixed point arithmetic: the low kFractionBits bits hold the fraction.
+const int kFractionBits = 16;
+const int kFractionMax = 1 << kFractionBits;
+const int kFractionMask = kFractionMax - 1;
+
+// clang-format off
+
+YUVType TypeFromSize(int ywidth,
+                     int yheight,
+                     int cbcrwidth,
+                     int cbcrheight)
+{
+  // Classify the surface by comparing the chroma plane size with the luma
+  // plane size. Checks run in a fixed order and the first match wins.
+  if (cbcrwidth == ywidth && cbcrheight == yheight) return YV24;
+
+  // Halved dimensions round up, so odd luma sizes still match.
+  const bool half_width = cbcrwidth == (ywidth + 1) / 2;
+  if (half_width && cbcrheight == yheight) return YV16;
+  if (half_width && cbcrheight == (yheight + 1) / 2) return YV12;
+
+  // An empty chroma plane means a luma-only surface.
+  if (cbcrwidth == 0 && cbcrheight == 0) return Y8;
+
+  // Any other combination of sizes is not a layout this code knows about,
+  // so there is nothing sensible to return.
+  MOZ_CRASH("Can't determine YUV type from size");
+}
+
+libyuv::FourCC FourCCFromYUVType(YUVType aYUVType) {
+  // Translate each YUVType into the libyuv FourCC code used for it.
+  // Values outside the four known types map to FOURCC_ANY.
+  if (aYUVType == YV24) return libyuv::FOURCC_I444;
+  if (aYUVType == YV16) return libyuv::FOURCC_I422;
+  if (aYUVType == YV12) return libyuv::FOURCC_I420;
+  if (aYUVType == Y8) return libyuv::FOURCC_I400;
+  return libyuv::FOURCC_ANY;
+}
+
+int GBRPlanarToARGB(const uint8_t* src_y, int y_pitch,
+                     const uint8_t* src_u, int u_pitch,
+                     const uint8_t* src_v, int v_pitch,
+                     uint8_t* rgb_buf, int rgb_pitch,
+                     int pic_width, int pic_height) {
+  // libyuv has no native conversion for this (fixme: find something less awful).
+  for (const auto row : IntegerRange(pic_height)) {
+    uint8_t* out = rgb_buf + rgb_pitch * row;  // start of this output row
+    for (const auto col : IntegerRange(pic_width)) {
+      *out++ = src_u[u_pitch * row + col];  // byte 0 comes from the "u" plane
+      *out++ = src_y[y_pitch * row + col];  // byte 1 comes from the "y" plane
+      *out++ = src_v[v_pitch * row + col];  // byte 2 comes from the "v" plane
+      *out++ = 255;                         // byte 3 is fully opaque
+    }
+  }
+  return 0;
+}
+
+// Convert a window (pic_x, pic_y, pic_width, pic_height) of a YUV frame to 32 bit ARGB.
+void ConvertYCbCrToRGB32(const uint8* y_buf, const uint8* u_buf,
+                         const uint8* v_buf, uint8* rgb_buf, int pic_x,
+                         int pic_y, int pic_width, int pic_height, int y_pitch,
+                         int uv_pitch, int rgb_pitch, YUVType yuv_type,
+                         YUVColorSpace yuv_color_space) {
+  // Deprecated function's conversion is accurate.
+  // libyuv conversion is a bit inaccurate to get performance. It dynamically
+  // calculates RGB from YUV to use simd. In it, signed byte is used for
+  // conversion's coefficient, but it requests 129. libyuv cut 129 to 127. And
+  // only 6 bits are used for a decimal part during the dynamic calculation.
+  //
+  // The function is still fast on some old intel chips.
+  // See Bug 1256475.
+  bool use_deprecated = StaticPrefs::gfx_ycbcr_accurate_conversion() ||
+                        (supports_mmx() && supports_sse() && !supports_sse3() &&
+                         yuv_color_space == YUVColorSpace::BT601);
+  // The deprecated function only support BT601 (see Bug 1210357), and it hands
+  // the chroma pointers to its row converters even for Y8, where they are null.
+  if (yuv_color_space != YUVColorSpace::BT601 || yuv_type == Y8) {
+    use_deprecated = false;
+  }
+  if (use_deprecated) {
+    ConvertYCbCrToRGB32_deprecated(y_buf, u_buf, v_buf, rgb_buf, pic_x, pic_y,
+                                   pic_width, pic_height, y_pitch, uv_pitch,
+                                   rgb_pitch, yuv_type);
+    return;
+  }
+
+  decltype(libyuv::U444ToARGB)* fConvertYUVToARGB = nullptr;
+  switch (yuv_type) {
+    case YV24: {
+      const uint8* src_y = y_buf + y_pitch * pic_y + pic_x;
+      const uint8* src_u = u_buf + uv_pitch * pic_y + pic_x;
+      const uint8* src_v = v_buf + uv_pitch * pic_y + pic_x;
+      switch (yuv_color_space) {
+        case YUVColorSpace::BT2020:
+          fConvertYUVToARGB = libyuv::U444ToARGB;
+          break;
+        case YUVColorSpace::BT709:
+          fConvertYUVToARGB = libyuv::H444ToARGB;
+          break;
+        case YUVColorSpace::Identity:
+          fConvertYUVToARGB = GBRPlanarToARGB;
+          break;
+        default:
+          fConvertYUVToARGB = libyuv::I444ToARGB;
+          break;
+      }
+      DebugOnly<int> err =
+          fConvertYUVToARGB(src_y, y_pitch, src_u, uv_pitch, src_v, uv_pitch,
+                            rgb_buf, rgb_pitch, pic_width, pic_height);
+      MOZ_ASSERT(!err);
+      break;
+    }
+    case YV16: {
+      const uint8* src_y = y_buf + y_pitch * pic_y + pic_x;
+      const uint8* src_u = u_buf + uv_pitch * pic_y + pic_x / 2;
+      const uint8* src_v = v_buf + uv_pitch * pic_y + pic_x / 2;
+      switch (yuv_color_space) {
+        case YUVColorSpace::BT2020:
+          fConvertYUVToARGB = libyuv::U422ToARGB;
+          break;
+        case YUVColorSpace::BT709:
+          fConvertYUVToARGB = libyuv::H422ToARGB;
+          break;
+        default:
+          fConvertYUVToARGB = libyuv::I422ToARGB;
+          break;
+      }
+      DebugOnly<int> err =
+          fConvertYUVToARGB(src_y, y_pitch, src_u, uv_pitch, src_v, uv_pitch,
+                            rgb_buf, rgb_pitch, pic_width, pic_height);
+      MOZ_ASSERT(!err);
+      break;
+    }
+    case YV12: {
+      // Halve row and column separately so an odd pic_y cannot land mid-row.
+      const uint8* src_y = y_buf + y_pitch * pic_y + pic_x;
+      const uint8* src_u = u_buf + uv_pitch * (pic_y / 2) + pic_x / 2;
+      const uint8* src_v = v_buf + uv_pitch * (pic_y / 2) + pic_x / 2;
+      switch (yuv_color_space) {
+        case YUVColorSpace::BT2020:
+          fConvertYUVToARGB = libyuv::U420ToARGB;
+          break;
+        case YUVColorSpace::BT709:
+          fConvertYUVToARGB = libyuv::H420ToARGB;
+          break;
+        default:
+          fConvertYUVToARGB = libyuv::I420ToARGB;
+          break;
+      }
+      DebugOnly<int> err =
+          fConvertYUVToARGB(src_y, y_pitch, src_u, uv_pitch, src_v, uv_pitch,
+                            rgb_buf, rgb_pitch, pic_width, pic_height);
+      MOZ_ASSERT(!err);
+      break;
+    }
+    case Y8: {
+      const uint8* src_y = y_buf + y_pitch * pic_y + pic_x;
+      MOZ_ASSERT(u_buf == nullptr);
+      MOZ_ASSERT(v_buf == nullptr);
+      DebugOnly<int> err =
+          libyuv::I400ToARGB(src_y, y_pitch, rgb_buf, rgb_pitch, pic_width,
+                             pic_height);
+      MOZ_ASSERT(!err);
+      break;
+    }
+    default:
+      MOZ_ASSERT_UNREACHABLE("Unsupported YUV type");
+  }
+}
+
+// Convert a window of a YUV frame to 32 bit ARGB using the in-tree row routines.
+void ConvertYCbCrToRGB32_deprecated(const uint8* y_buf,
+                                    const uint8* u_buf,
+                                    const uint8* v_buf,
+                                    uint8* rgb_buf,
+                                    int pic_x,
+                                    int pic_y,
+                                    int pic_width,
+                                    int pic_height,
+                                    int y_pitch,
+                                    int uv_pitch,
+                                    int rgb_pitch,
+                                    YUVType yuv_type) {
+  // y_buf/u_buf/v_buf are the source planes with row strides y_pitch and
+  // uv_pitch; rgb_buf receives pic_height rows of pic_width 4-byte pixels,
+  // rgb_pitch bytes apart. pic_x/pic_y give the top-left corner of the
+  // converted window in luma coordinates.
+
+  // Chroma subsampling expressed as right-shifts of luma coordinates.
+  const unsigned int chroma_y_shift = (yuv_type == YV12) ? 1 : 0;
+  const unsigned int chroma_x_shift = (yuv_type == YV24) ? 0 : 1;
+  // The optimized row code needs SSE (it uses movntq, which is not part of
+  // MMX), and there is no optimized YV24 routine, so YV24 uses the C code.
+  const bool use_simd = supports_mmx() && supports_sse() && yuv_type != YV24;
+  // For every type other than YV24, an odd pic_x means the first pixel of
+  // each row is converted on its own before the bulk of the row.
+  const bool leading_odd_pixel = yuv_type != YV24 && pic_x % 2 != 0;
+  const int bulk_width = leading_odd_pixel ? pic_width - 1 : pic_width;
+
+  for (int row = 0; row < pic_height; ++row) {
+    const int src_row = pic_y + row;
+    const int chroma_offset =
+        (src_row >> chroma_y_shift) * uv_pitch + (pic_x >> chroma_x_shift);
+    uint8* dest = rgb_buf + row * rgb_pitch;
+    const uint8* y_src = y_buf + src_row * y_pitch + pic_x;
+    const uint8* u_src = u_buf + chroma_offset;
+    const uint8* v_src = v_buf + chroma_offset;
+
+    if (leading_odd_pixel) {
+      // Convert the single odd pixel with the C routine, then step every
+      // pointer past it so the fast routines handle the remainder.
+      FastConvertYUVToRGB32Row_C(y_src, u_src, v_src, dest, 1,
+                                 chroma_x_shift);
+      ++y_src;
+      ++u_src;
+      ++v_src;
+      dest += 4;
+    }
+
+    if (use_simd) {
+      FastConvertYUVToRGB32Row(y_src, u_src, v_src, dest, bulk_width);
+    } else {
+      FastConvertYUVToRGB32Row_C(y_src, u_src, v_src, dest, bulk_width,
+                                 chroma_x_shift);
+    }
+  }
+
+  // MMX used for FastConvertYUVToRGB32Row requires emms instruction.
+  if (use_simd) {
+    EMMS();
+  }
+}
+
+// Blends two rows; the C version does 8 pixels at a time to mimic MMX code.
+static void FilterRows_C(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
+                         int source_width, int source_y_fraction) {
+  // Weights are fractions of 256: the second row gets source_y_fraction and
+  // the first row gets the remainder.
+  const int weight1 = source_y_fraction;
+  const int weight0 = 256 - weight1;
+  const int kPixelsPerStep = 8;
+  uint8* const end = ybuf + source_width;
+  // do/while: a full group of 8 is always processed, so up to 7 bytes past
+  // `end` (8 bytes when source_width <= 0) are read and written.
+  do {
+    for (int i = 0; i < kPixelsPerStep; ++i) {
+      ybuf[i] = (y0_ptr[i] * weight0 + y1_ptr[i] * weight1) >> 8;
+    }
+    y0_ptr += kPixelsPerStep;
+    y1_ptr += kPixelsPerStep;
+    ybuf += kPixelsPerStep;
+  } while (ybuf < end);
+}
+
+#ifdef MOZILLA_MAY_SUPPORT_MMX
+void FilterRows_MMX(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
+                    int source_width, int source_y_fraction);  // MMX variant; not defined in this file
+#endif  // MOZILLA_MAY_SUPPORT_MMX
+
+#ifdef MOZILLA_MAY_SUPPORT_SSE2
+void FilterRows_SSE2(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
+                     int source_width, int source_y_fraction);  // SSE2 variant; not defined in this file
+#endif  // MOZILLA_MAY_SUPPORT_SSE2
+
+static inline void FilterRows(uint8* ybuf, const uint8* y0_ptr,
+                              const uint8* y1_ptr, int source_width,
+                              int source_y_fraction) {
+  // Runtime dispatch: SSE2 is tried first, then MMX, then the portable C code.
+#ifdef MOZILLA_MAY_SUPPORT_SSE2
+  if (mozilla::supports_sse2()) {
+    return FilterRows_SSE2(ybuf, y0_ptr, y1_ptr, source_width, source_y_fraction);
+  }
+#endif  // MOZILLA_MAY_SUPPORT_SSE2
+
+#ifdef MOZILLA_MAY_SUPPORT_MMX
+  if (mozilla::supports_mmx()) {
+    return FilterRows_MMX(ybuf, y0_ptr, y1_ptr, source_width, source_y_fraction);
+  }
+#endif  // MOZILLA_MAY_SUPPORT_MMX
+
+  // Neither SIMD variant is compiled in or supported by this CPU.
+  FilterRows_C(ybuf, y0_ptr, y1_ptr, source_width, source_y_fraction);
+}
+
+
+// Scale a frame of YUV to 32 bit ARGB (never rotated: ROTATE_0 is always used).
+void ScaleYCbCrToRGB32(const uint8* y_buf,
+                       const uint8* u_buf,
+                       const uint8* v_buf,
+                       uint8* rgb_buf,
+                       int source_width,
+                       int source_height,
+                       int width,
+                       int height,
+                       int y_pitch,
+                       int uv_pitch,
+                       int rgb_pitch,
+                       YUVType yuv_type,
+                       YUVColorSpace yuv_color_space,
+                       ScaleFilter filter) {
+  bool use_deprecated =
+      StaticPrefs::gfx_ycbcr_accurate_conversion() ||
+#if defined(XP_WIN) && defined(_M_X64)
+      // libyuv does not support SIMD scaling on win 64bit. See Bug 1295927.
+      supports_sse3() ||
+#endif
+      (supports_mmx() && supports_sse() && !supports_sse3());
+  // The deprecated function only support BT601 (see Bug 1210357), and it reads
+  // the chroma planes unconditionally, which Y8 (no chroma) may not provide.
+  if (yuv_color_space != YUVColorSpace::BT601 || yuv_type == Y8) {
+    use_deprecated = false;
+  }
+  if (use_deprecated) {
+    ScaleYCbCrToRGB32_deprecated(y_buf, u_buf, v_buf,
+                                 rgb_buf,
+                                 source_width, source_height,
+                                 width, height,
+                                 y_pitch, uv_pitch,
+                                 rgb_pitch,
+                                 yuv_type,
+                                 ROTATE_0,
+                                 filter);
+    return;
+  }
+
+  // Note: this path always uses bilinear filtering; `filter` is not consulted.
+  DebugOnly<int> err =
+    libyuv::YUVToARGBScale(y_buf, y_pitch,
+                           u_buf, uv_pitch,
+                           v_buf, uv_pitch,
+                           FourCCFromYUVType(yuv_type),
+                           yuv_color_space,
+                           source_width, source_height,
+                           rgb_buf, rgb_pitch,
+                           width, height,
+                           libyuv::kFilterBilinear);
+  MOZ_ASSERT(!err);
+}
+
+// Scale a frame of YUV to 32 bit ARGB, optionally rotating or mirroring it.
+void ScaleYCbCrToRGB32_deprecated(const uint8* y_buf,
+                                  const uint8* u_buf,
+                                  const uint8* v_buf,
+                                  uint8* rgb_buf,
+                                  int source_width,
+                                  int source_height,
+                                  int width,
+                                  int height,
+                                  int y_pitch,
+                                  int uv_pitch,
+                                  int rgb_pitch,
+                                  YUVType yuv_type,
+                                  Rotate view_rotate,
+                                  ScaleFilter filter) {
+  bool has_mmx = supports_mmx();  // decides whether EMMS() runs at the end
+
+  // 4096 allows 3 buffers to fit in 12k.
+  // Helps performance on CPU with 16K L1 cache.
+  // Large enough for 3840x2160 and 30" displays which are 2560x1600.
+  const int kFilterBufferSize = 4096;
+  // Disable filtering if the source is too wide (to avoid buffer overflows).
+  // This should never happen to regular users: they don't have monitors
+  // wider than 4096 pixels.
+  // TODO(fbarchard): Allow rotated videos to filter.
+  if (source_width > kFilterBufferSize || view_rotate)  // any rotation/mirroring also disables it
+    filter = FILTER_NONE;
+
+  unsigned int y_shift = yuv_type == YV12 ? 1 : 0;  // only YV12 halves chroma rows
+  // Diagram showing origin and direction of source sampling.
+  // ->0   4<-
+  // 7       3
+  //
+  // 6       5
+  // ->1   2<-
+  // Rotations that start at right side of image.
+  if ((view_rotate == ROTATE_180) ||
+      (view_rotate == ROTATE_270) ||
+      (view_rotate == MIRROR_ROTATE_0) ||
+      (view_rotate == MIRROR_ROTATE_90)) {
+    y_buf += source_width - 1;
+    u_buf += source_width / 2 - 1;
+    v_buf += source_width / 2 - 1;
+    source_width = -source_width;  // negative width makes source_dx negative below
+  }
+  // Rotations that start at bottom of image.
+  if ((view_rotate == ROTATE_90) ||
+      (view_rotate == ROTATE_180) ||
+      (view_rotate == MIRROR_ROTATE_90) ||
+      (view_rotate == MIRROR_ROTATE_180)) {
+    y_buf += (source_height - 1) * y_pitch;
+    u_buf += ((source_height >> y_shift) - 1) * uv_pitch;
+    v_buf += ((source_height >> y_shift) - 1) * uv_pitch;
+    source_height = -source_height;  // negative height makes the vertical step negative
+  }
+
+  // Handle zero sized destination (also avoids dividing by zero below).
+  if (width == 0 || height == 0)
+    return;
+  int source_dx = source_width * kFractionMax / width;    // 16.16 horizontal step per output pixel
+  int source_dy = source_height * kFractionMax / height;  // 16.16 vertical step per output row
+  int source_dx_uv = source_dx;
+
+  if ((view_rotate == ROTATE_90) ||
+      (view_rotate == ROTATE_270)) {
+    // Swap the roles of the axes: destination and source sizes are exchanged.
+    int tmp = height;
+    height = width;
+    width = tmp;
+    tmp = source_height;
+    source_height = source_width;
+    source_width = tmp;
+    int original_dx = source_dx;
+    int original_dy = source_dy;
+    // Horizontal steps now advance by whole source rows (pitch-sized strides).
+    source_dx = ((original_dy >> kFractionBits) * y_pitch) << kFractionBits;
+    source_dx_uv = ((original_dy >> kFractionBits) * uv_pitch) << kFractionBits;
+    source_dy = original_dx;
+    if (view_rotate == ROTATE_90) {
+      y_pitch = -1;
+      uv_pitch = -1;
+      source_height = -source_height;
+    } else {
+      y_pitch = 1;
+      uv_pitch = 1;
+    }
+  }
+
+  // Need padding because FilterRows() will write 1 to 16 extra pixels
+  // after the end for SSE2 version.
+  uint8 yuvbuf[16 + kFilterBufferSize * 3 + 16];
+  uint8* ybuf =
+      reinterpret_cast<uint8*>(reinterpret_cast<uintptr_t>(yuvbuf + 15) & ~15);  // round up to 16-byte alignment
+  uint8* ubuf = ybuf + kFilterBufferSize;
+  uint8* vbuf = ubuf + kFilterBufferSize;
+  // TODO(fbarchard): Fixed point math is off by 1 on negatives.
+  int yscale_fixed = (source_height << kFractionBits) / height;
+
+  // TODO(fbarchard): Split this into separate function for better efficiency.
+  for (int y = 0; y < height; ++y) {
+    uint8* dest_pixel = rgb_buf + y * rgb_pitch;
+    int source_y_subpixel = (y * yscale_fixed);
+    if (yscale_fixed >= (kFractionMax * 2)) {
+      source_y_subpixel += kFractionMax / 2;  // For 1/2 or less, center filter.
+    }
+    int source_y = source_y_subpixel >> kFractionBits;
+
+    const uint8* y0_ptr = y_buf + source_y * y_pitch;
+    const uint8* y1_ptr = y0_ptr + y_pitch;
+
+    const uint8* u0_ptr = u_buf + (source_y >> y_shift) * uv_pitch;
+    const uint8* u1_ptr = u0_ptr + uv_pitch;
+    const uint8* v0_ptr = v_buf + (source_y >> y_shift) * uv_pitch;
+    const uint8* v1_ptr = v0_ptr + uv_pitch;
+
+    // vertical scaler uses 16.8 fixed point
+    int source_y_fraction = (source_y_subpixel & kFractionMask) >> 8;
+    int source_uv_fraction =
+        ((source_y_subpixel >> y_shift) & kFractionMask) >> 8;
+
+    const uint8* y_ptr = y0_ptr;
+    const uint8* u_ptr = u0_ptr;
+    const uint8* v_ptr = v0_ptr;
+    // Apply vertical filtering if necessary.
+    // TODO(fbarchard): Remove memcpy when not necessary.
+    if (filter & mozilla::gfx::FILTER_BILINEAR_V) {
+      if (yscale_fixed != kFractionMax &&
+          source_y_fraction && ((source_y + 1) < source_height)) {
+        FilterRows(ybuf, y0_ptr, y1_ptr, source_width, source_y_fraction);
+      } else {
+        memcpy(ybuf, y0_ptr, source_width);
+      }
+      y_ptr = ybuf;
+      ybuf[source_width] = ybuf[source_width-1];  // NOTE(review): aliases ubuf[0] when source_width == 4096
+      int uv_source_width = (source_width + 1) / 2;
+      if (yscale_fixed != kFractionMax &&
+          source_uv_fraction &&
+          (((source_y >> y_shift) + 1) < (source_height >> y_shift))) {
+        FilterRows(ubuf, u0_ptr, u1_ptr, uv_source_width, source_uv_fraction);
+        FilterRows(vbuf, v0_ptr, v1_ptr, uv_source_width, source_uv_fraction);
+      } else {
+        memcpy(ubuf, u0_ptr, uv_source_width);
+        memcpy(vbuf, v0_ptr, uv_source_width);
+      }
+      u_ptr = ubuf;
+      v_ptr = vbuf;
+      ubuf[uv_source_width] = ubuf[uv_source_width - 1];  // duplicate the last chroma sample one slot past the row
+      vbuf[uv_source_width] = vbuf[uv_source_width - 1];
+    }
+    if (source_dx == kFractionMax) {  // Not scaled
+      FastConvertYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
+                               dest_pixel, width);
+    } else if (filter & FILTER_BILINEAR_H) {
+        LinearScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
+                                 dest_pixel, width, source_dx);
+    } else {
+// Specialized scalers and rotation (32-bit MSVC builds only; see the #if).
+#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_MSC_VER) && defined(_M_IX86) && !defined(__clang__)
+      if(mozilla::supports_sse()) {
+        if (width == (source_width * 2)) {
+          DoubleYUVToRGB32Row_SSE(y_ptr, u_ptr, v_ptr,
+                                  dest_pixel, width);
+        } else if ((source_dx & kFractionMask) == 0) {
+          // Scaling by integer scale factor. ie half.
+          ConvertYUVToRGB32Row_SSE(y_ptr, u_ptr, v_ptr,
+                                   dest_pixel, width,
+                                   source_dx >> kFractionBits);
+        } else if (source_dx_uv == source_dx) {  // Not rotated.
+          ScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
+                             dest_pixel, width, source_dx);
+        } else {
+          RotateConvertYUVToRGB32Row_SSE(y_ptr, u_ptr, v_ptr,
+                                         dest_pixel, width,
+                                         source_dx >> kFractionBits,
+                                         source_dx_uv >> kFractionBits);
+        }
+      }
+      else {
+        ScaleYUVToRGB32Row_C(y_ptr, u_ptr, v_ptr,
+                             dest_pixel, width, source_dx);
+      }
+#else
+      (void)source_dx_uv;  // only used by the branch above; silences unused warnings
+      ScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
+                         dest_pixel, width, source_dx);
+#endif
+    }
+  }
+  // MMX used for FastConvertYUVToRGB32Row and FilterRows requires emms.
+  if (has_mmx)
+    EMMS();
+}
+void ConvertI420AlphaToARGB32(const uint8* y_buf,
+                              const uint8* u_buf,
+                              const uint8* v_buf,
+                              const uint8* a_buf,
+                              uint8* argb_buf,
+                              int pic_width,
+                              int pic_height,
+                              int ya_pitch,
+                              int uv_pitch,
+                              int argb_pitch) {
+  // The downstream graphics stack expects an attenuated input, hence why the
+  // attenuation flag is switched on.
+  const int kAttenuate = 1;
+
+  // The alpha plane is addressed with the luma stride (ya_pitch), while the
+  // U and V planes share uv_pitch.
+  DebugOnly<int> status = libyuv::I420AlphaToARGB(
+      y_buf, ya_pitch, u_buf, uv_pitch, v_buf, uv_pitch, a_buf, ya_pitch,
+      argb_buf, argb_pitch, pic_width, pic_height, kAttenuate);
+  MOZ_ASSERT(!status);
+}
+
+void ARGBAttenuate(const uint8_t* src_argb, int src_stride_argb,
+                   uint8_t* dst_argb, int dst_stride_argb, int width,
+                   int height) {
+  DebugOnly<int> status = libyuv::ARGBAttenuate(  // forwards every argument unchanged
+      src_argb, src_stride_argb, dst_argb, dst_stride_argb, width, height);
+  MOZ_ASSERT(!status);  // a non-zero result is treated as failure
+}
+
+} // namespace gfx
+} // namespace mozilla
diff --git a/gfx/ycbcr/yuv_convert.h b/gfx/ycbcr/yuv_convert.h
new file mode 100644
index 0000000000..6d2915d037
--- /dev/null
+++ b/gfx/ycbcr/yuv_convert.h
@@ -0,0 +1,131 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// clang-format off
+
+#ifndef MEDIA_BASE_YUV_CONVERT_H_
+#define MEDIA_BASE_YUV_CONVERT_H_
+
+#include "chromium_types.h"
+#include "mozilla/gfx/Types.h"
+
+namespace mozilla {
+
+namespace gfx {
+
+// Type of YUV surface.
+// The values of these enums matter as they are used to shift vertical indices.
+enum YUVType {
+  YV12 = 0,           // YV12 is half width and half height chroma channels.
+  YV16 = 1,           // YV16 is half width and full height chroma channels.
+  YV24 = 2,           // YV24 is full width and full height chroma channels.
+  Y8 = 3              // Y8 is monochrome: no chroma channels.
+};
+
+// Mirror means flip the image horizontally, as in looking in a mirror.
+// Rotate happens after mirroring.
+enum Rotate {
+  ROTATE_0,           // Rotation off (value 0, so it tests false in conditions).
+  ROTATE_90,          // Rotate clockwise.
+  ROTATE_180,         // Rotate upside down.
+  ROTATE_270,         // Rotate counter clockwise.
+  MIRROR_ROTATE_0,    // Mirror horizontally.
+  MIRROR_ROTATE_90,   // Mirror then Rotate clockwise.
+  MIRROR_ROTATE_180,  // Mirror vertically.
+  MIRROR_ROTATE_270   // Transpose.
+};
+
+// Filter affects how scaling looks. Values are bit flags tested with '&'.
+enum ScaleFilter {
+  FILTER_NONE = 0,        // No filter (point sampled).
+  FILTER_BILINEAR_H = 1,  // Bilinear horizontal filter.
+  FILTER_BILINEAR_V = 2,  // Bilinear vertical filter.
+  FILTER_BILINEAR = 3     // Bilinear filter (FILTER_BILINEAR_H | FILTER_BILINEAR_V).
+};
+
+YUVType TypeFromSize(int ywidth, int yheight, int cbcrwidth, int cbcrheight);  // crashes if no YUVType matches the sizes
+
+// Convert a frame of YUV to 32 bit ARGB.
+// Pass in the YUVType (YV12, YV16, YV24 or Y8) matching the source format.
+void ConvertYCbCrToRGB32(const uint8* yplane,
+                         const uint8* uplane,
+                         const uint8* vplane,
+                         uint8* rgbframe,
+                         int pic_x,
+                         int pic_y,
+                         int pic_width,
+                         int pic_height,
+                         int ystride,
+                         int uvstride,
+                         int rgbstride,
+                         YUVType yuv_type,
+                         YUVColorSpace yuv_color_space);
+
+void ConvertYCbCrToRGB32_deprecated(const uint8* yplane,
+                                    const uint8* uplane,
+                                    const uint8* vplane,
+                                    uint8* rgbframe,
+                                    int pic_x,
+                                    int pic_y,
+                                    int pic_width,
+                                    int pic_height,
+                                    int ystride,
+                                    int uvstride,
+                                    int rgbstride,
+                                    YUVType yuv_type);  // no color-space parameter: BT601 only
+
+// Scale a frame of YUV to 32 bit ARGB.
+// Never rotates or mirrors; ScaleYCbCrToRGB32_deprecated takes a Rotate value.
+void ScaleYCbCrToRGB32(const uint8* yplane,
+                       const uint8* uplane,
+                       const uint8* vplane,
+                       uint8* rgbframe,
+                       int source_width,
+                       int source_height,
+                       int width,
+                       int height,
+                       int ystride,
+                       int uvstride,
+                       int rgbstride,
+                       YUVType yuv_type,
+                       YUVColorSpace yuv_color_space,
+                       ScaleFilter filter);
+
+void ScaleYCbCrToRGB32_deprecated(const uint8* yplane,
+                                  const uint8* uplane,
+                                  const uint8* vplane,
+                                  uint8* rgbframe,
+                                  int source_width,
+                                  int source_height,
+                                  int width,
+                                  int height,
+                                  int ystride,
+                                  int uvstride,
+                                  int rgbstride,
+                                  YUVType yuv_type,
+                                  Rotate view_rotate,  // any value other than ROTATE_0 disables filtering
+                                  ScaleFilter filter);
+
+void ConvertI420AlphaToARGB32(const uint8* yplane,
+                              const uint8* uplane,
+                              const uint8* vplane,
+                              const uint8* aplane,
+                              uint8* argbframe,
+                              int pic_width,
+                              int pic_height,
+                              int yastride,  // stride shared by the Y and alpha planes
+                              int uvstride,  // stride shared by the U and V planes
+                              int argbstride);
+
+void ARGBAttenuate(const uint8_t* src_argb,  // forwards to libyuv::ARGBAttenuate
+                   int src_stride_argb,
+                   uint8_t* dst_argb,
+                   int dst_stride_argb,
+                   int width,
+                   int height);
+
+} // namespace gfx
+} // namespace mozilla
+
+#endif  // MEDIA_BASE_YUV_CONVERT_H_
diff --git a/gfx/ycbcr/yuv_convert_arm.cpp b/gfx/ycbcr/yuv_convert_arm.cpp
new file mode 100644
index 0000000000..081343b0b1
--- /dev/null
+++ b/gfx/ycbcr/yuv_convert_arm.cpp
@@ -0,0 +1,232 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// contributor Siarhei Siamashka <siarhei.siamashka@gmail.com>
+
+#include "yuv_convert.h"
+#include "ycbcr_to_rgb565.h"
+
+
+
+#ifdef HAVE_YCBCR_TO_RGB565
+
+namespace mozilla {
+
+namespace gfx {
+
+#  if defined(MOZILLA_MAY_SUPPORT_NEON)
+#  if defined(__clang__)
+void __attribute((noinline))
+#  else
+void __attribute((noinline,optimize("-fomit-frame-pointer")))
+#  endif
+    yuv42x_to_rgb565_row_neon(uint16 *dst,     /* RGB565 output; two bytes stored per pixel */
+                              const uint8 *y,  /* luma; one byte consumed per pixel */
+                              const uint8 *u,  /* chroma; one byte per 2 pixels (1 for a lone pixel) */
+                              const uint8 *v,  /* chroma; consumed in step with u */
+                              int n,           /* pixel count, including the odd leading pixel */
+                              int oddflag)     /* nonzero: convert a single pixel before the main loop */
+{
+    static __attribute__((aligned(16))) uint16 acc_r[8] = { /* initial red accumulator (loaded into q10) */
+        22840, 22840, 22840, 22840, 22840, 22840, 22840, 22840,
+    };
+    static __attribute__((aligned(16))) uint16 acc_g[8] = { /* initial green accumulator (loaded into q11) */
+        17312, 17312, 17312, 17312, 17312, 17312, 17312, 17312,
+    };
+    static __attribute__((aligned(16))) uint16 acc_b[8] = { /* initial blue accumulator (loaded into q12) */
+        28832, 28832, 28832, 28832, 28832, 28832, 28832, 28832,
+    };
+    /*
+     * Registers:
+     * q0, q1 : d0, d1, d2, d3  - initial loading of YUV data, then converted RGB
+     * q2     : d4, d5          - V >> 1 scratch, then odd green / odd blue
+     * q3     : d6, d7          - temporary storage and the packed RGB565 result
+     *
+     * q4, q5 : d8-d11 - clipping constants (#15, #20, #31, #36); q6, q7 unused
+     *
+     * q8, q9 : d16, d17, d18, d19  - expanded Y data (even, odd) * 149
+     * q10    : d20, d21            - red accumulator
+     * q11    : d22, d23            - green accumulator
+     * q12    : d24, d25            - blue accumulator
+     * q13    : d26, d27            - constants #16 (d26, not referenced by the macro) and #149 (d27)
+     * q14, q15 : d28-d31           - constants #204, #50, #104, #154
+     */
+    asm volatile (
+".fpu neon\n"
+/* Allow building on targets that do not support NEON, and force the object
+ * file's target architecture so that it does not bump the final binary's. */
+".arch armv7-a\n"
+".object_arch armv4t\n"
+".macro convert_macroblock size\n"
+/* load up to 16 source pixels (size Y bytes; size/2 U and V bytes, or 1 each when size == 1) */
+	".if \\size == 16\n"
+	    "pld [%[y], #64]\n"
+	    "pld [%[u], #64]\n"
+	    "pld [%[v], #64]\n"
+	    "vld1.8 {d1}, [%[y]]!\n"
+	    "vld1.8 {d3}, [%[y]]!\n"
+	    "vld1.8 {d0}, [%[u]]!\n"
+	    "vld1.8 {d2}, [%[v]]!\n"
+	".elseif \\size == 8\n"
+	    "vld1.8 {d1}, [%[y]]!\n"
+	    "vld1.8 {d0[0]}, [%[u]]!\n"
+	    "vld1.8 {d0[1]}, [%[u]]!\n"
+	    "vld1.8 {d0[2]}, [%[u]]!\n"
+	    "vld1.8 {d0[3]}, [%[u]]!\n"
+	    "vld1.8 {d2[0]}, [%[v]]!\n"
+	    "vld1.8 {d2[1]}, [%[v]]!\n"
+	    "vld1.8 {d2[2]}, [%[v]]!\n"
+	    "vld1.8 {d2[3]}, [%[v]]!\n"
+	".elseif \\size == 4\n"
+	    "vld1.8 {d1[0]}, [%[y]]!\n"
+	    "vld1.8 {d1[1]}, [%[y]]!\n"
+	    "vld1.8 {d1[2]}, [%[y]]!\n"
+	    "vld1.8 {d1[3]}, [%[y]]!\n"
+	    "vld1.8 {d0[0]}, [%[u]]!\n"
+	    "vld1.8 {d0[1]}, [%[u]]!\n"
+	    "vld1.8 {d2[0]}, [%[v]]!\n"
+	    "vld1.8 {d2[1]}, [%[v]]!\n"
+	".elseif \\size == 2\n"
+	    "vld1.8 {d1[0]}, [%[y]]!\n"
+	    "vld1.8 {d1[1]}, [%[y]]!\n"
+	    "vld1.8 {d0[0]}, [%[u]]!\n"
+	    "vld1.8 {d2[0]}, [%[v]]!\n"
+	".elseif \\size == 1\n"
+	    "vld1.8 {d1[0]}, [%[y]]!\n"
+	    "vld1.8 {d0[0]}, [%[u]]!\n"
+	    "vld1.8 {d2[0]}, [%[v]]!\n"
+	".else\n"
+	    ".error \"unsupported macroblock size\"\n"
+	".endif\n"
+
+        /* d1 - Y data (first 8 bytes) */
+        /* d3 - Y data (next 8 bytes) */
+        /* d0 - U data, d2 - V data */
+
+	/* split even and odd Y color components */
+	"vuzp.8      d1, d3\n"                       /* d1 - evenY, d3 - oddY */
+	/* clip upper and lower boundaries: q0 = {U, evenY}, q1 = {V, oddY}; q4 = {#15, #20}, q5 = {#31, #36} */
+	"vqadd.u8    q0, q0, q4\n"
+	"vqadd.u8    q1, q1, q4\n"
+	"vqsub.u8    q0, q0, q5\n"
+	"vqsub.u8    q1, q1, q5\n"
+
+	"vshr.u8     d4, d2, #1\n"                   /* d4 = V >> 1 */
+
+	"vmull.u8    q8, d1, d27\n"                  /* q8 = evenY * 149 */
+	"vmull.u8    q9, d3, d27\n"                  /* q9 = oddY * 149 */
+
+	"vld1.16     {d20, d21}, [%[acc_r], :128]\n" /* q10 - initialize accumulator for red */
+	"vsubw.u8    q10, q10, d4\n"                 /* red acc -= (V >> 1) */
+	"vmlsl.u8    q10, d2, d28\n"                 /* red acc -= V * 204 */
+	"vld1.16     {d22, d23}, [%[acc_g], :128]\n" /* q11 - initialize accumulator for green */
+	"vmlsl.u8    q11, d2, d30\n"                 /* green acc -= V * 104 */
+	"vmlsl.u8    q11, d0, d29\n"                 /* green acc -= U * 50 */
+	"vld1.16     {d24, d25}, [%[acc_b], :128]\n" /* q12 - initialize accumulator for blue */
+	"vmlsl.u8    q12, d0, d30\n"                 /* blue acc -= U * 104 */
+	"vmlsl.u8    q12, d0, d31\n"                 /* blue acc -= U * 154 */
+
+	"vhsub.s16   q3, q8, q10\n"                  /* calculate even red components */
+	"vhsub.s16   q10, q9, q10\n"                 /* calculate odd red components */
+	"vqshrun.s16 d0, q3, #6\n"                   /* right shift, narrow and saturate even red components */
+	"vqshrun.s16 d3, q10, #6\n"                  /* right shift, narrow and saturate odd red components */
+
+	"vhadd.s16   q3, q8, q11\n"                  /* calculate even green components */
+	"vhadd.s16   q11, q9, q11\n"                 /* calculate odd green components */
+	"vqshrun.s16 d1, q3, #6\n"                   /* right shift, narrow and saturate even green components */
+	"vqshrun.s16 d4, q11, #6\n"                  /* right shift, narrow and saturate odd green components */
+
+	"vhsub.s16   q3, q8, q12\n"                  /* calculate even blue components */
+	"vhsub.s16   q12, q9, q12\n"                 /* calculate odd blue components */
+	"vqshrun.s16 d2, q3, #6\n"                   /* right shift, narrow and saturate even blue components */
+	"vqshrun.s16 d5, q12, #6\n"                  /* right shift, narrow and saturate odd blue components */
+
+	"vzip.8      d0, d3\n"                       /* join even and odd red components */
+	"vzip.8      d1, d4\n"                       /* join even and odd green components */
+	"vzip.8      d2, d5\n"                       /* join even and odd blue components */
+
+	"vshll.u8    q3, d0, #8\n\t"                 /* pack the first 8 pixels: red into the top byte of each halfword */
+	"vshll.u8    q8, d1, #8\n\t"
+	"vshll.u8    q9, d2, #8\n\t"
+	"vsri.u16    q3, q8, #5\t\n"                 /* insert green below the top 5 bits */
+	"vsri.u16    q3, q9, #11\t\n"                /* insert blue below the top 11 bits */
+	/* store pixel data to memory */
+	".if \\size == 16\n"
+	"    vst1.16 {d6, d7}, [%[dst]]!\n"
+	"    vshll.u8    q3, d3, #8\n\t"             /* pack the second 8 pixels the same way from d3/d4/d5 */
+	"    vshll.u8    q8, d4, #8\n\t"
+	"    vshll.u8    q9, d5, #8\n\t"
+	"    vsri.u16    q3, q8, #5\t\n"
+	"    vsri.u16    q3, q9, #11\t\n"
+	"    vst1.16 {d6, d7}, [%[dst]]!\n"
+	".elseif \\size == 8\n"
+	"    vst1.16 {d6, d7}, [%[dst]]!\n"
+	".elseif \\size == 4\n"
+	"    vst1.16 {d6}, [%[dst]]!\n"
+	".elseif \\size == 2\n"
+	"    vst1.16 {d6[0]}, [%[dst]]!\n"
+	"    vst1.16 {d6[1]}, [%[dst]]!\n"
+	".elseif \\size == 1\n"
+	"    vst1.16 {d6[0]}, [%[dst]]!\n"
+	".endif\n"
+	".endm\n"
+
+	"vmov.u8     d8, #15\n" /* add this to U/V to saturate upper boundary */
+	"vmov.u8     d9, #20\n" /* add this to Y to saturate upper boundary */
+	"vmov.u8     d10, #31\n" /* sub this from U/V to saturate lower boundary */
+	"vmov.u8     d11, #36\n" /* sub this from Y to saturate lower boundary */
+
+	"vmov.u8     d26, #16\n"
+	"vmov.u8     d27, #149\n"
+	"vmov.u8     d28, #204\n"
+	"vmov.u8     d29, #50\n"
+	"vmov.u8     d30, #104\n"
+	"vmov.u8     d31, #154\n"
+
+	"cmp         %[oddflag], #0\n"  /* oddflag set: convert one pixel on its own first */
+	"beq         1f\n"
+	"convert_macroblock 1\n"
+	"sub         %[n], %[n], #1\n"
+    "1:\n"                          /* target of the "beq 1f" above */
+	"subs        %[n], %[n], #16\n"
+	"blt         2f\n"              /* fewer than 16 pixels left: go straight to the tail */
+    "1:\n"                          /* head of the 16-pixel loop, target of "bge 1b" */
+	"convert_macroblock 16\n"
+	"subs        %[n], %[n], #16\n"
+	"bge         1b\n"
+    "2:\n"                          /* n is now (remaining - 16); its low 4 bits are the remaining count */
+	"tst         %[n], #8\n"
+	"beq         3f\n"
+	"convert_macroblock 8\n"
+    "3:\n"
+	"tst         %[n], #4\n"
+	"beq         4f\n"
+	"convert_macroblock 4\n"
+    "4:\n"
+	"tst         %[n], #2\n"
+	"beq         5f\n"
+	"convert_macroblock 2\n"
+    "5:\n"
+	"tst         %[n], #1\n"
+	"beq         6f\n"
+	"convert_macroblock 1\n"
+    "6:\n"
+	".purgem convert_macroblock\n"
+	: [y] "+&r" (y), [u] "+&r" (u), [v] "+&r" (v), [dst] "+&r" (dst), [n] "+&r" (n)
+	: [acc_r] "r" (&acc_r[0]), [acc_g] "r" (&acc_g[0]), [acc_b] "r" (&acc_b[0]),
+	  [oddflag] "r" (oddflag)
+	: "cc", "memory",
+	  "d0",  "d1",  "d2",  "d3",  "d4",  "d5",  "d6",  "d7",
+	  "d8",  "d9",  "d10", "d11", /* "d12", "d13", "d14", "d15", */
+	  "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23",
+	  "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31"
+    );
+}
+#  endif // MOZILLA_MAY_SUPPORT_NEON
+
+} // namespace gfx
+
+} // namespace mozilla
+
+#endif // HAVE_YCBCR_TO_RGB565
diff --git a/gfx/ycbcr/yuv_convert_mmx.cpp b/gfx/ycbcr/yuv_convert_mmx.cpp
new file mode 100644
index 0000000000..b5353e5008
--- /dev/null
+++ b/gfx/ycbcr/yuv_convert_mmx.cpp
@@ -0,0 +1,45 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include <mmintrin.h>
+#include "yuv_row.h"
+
+namespace mozilla {
+namespace gfx {
+
+// FilterRows_MMX blends two image rows by linear interpolation, computing
+// (y0 * (256 - fraction) + y1 * fraction) >> 8 in 16-bit lanes, 8 bytes per pass.
+void FilterRows_MMX(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
+                    int source_width, int source_y_fraction) {
+  const __m64 kZero = _mm_setzero_si64();
+  const __m64 bottom_weight = _mm_set1_pi16(source_y_fraction);
+  const __m64 top_weight = _mm_set1_pi16(256 - source_y_fraction);
+
+  const __m64* top_row = reinterpret_cast<const __m64*>(y0_ptr);
+  const __m64* bottom_row = reinterpret_cast<const __m64*>(y1_ptr);
+  __m64* out = reinterpret_cast<__m64*>(ybuf);
+  __m64* const out_end = reinterpret_cast<__m64*>(ybuf + source_width);
+
+  // The body always runs at least once and writes whole 8-byte groups.
+  do {
+    const __m64 top = *top_row++;
+    const __m64 bottom = *bottom_row++;
+
+    // Widen the low four bytes of each row to 16 bits, weight and sum them.
+    __m64 low = _mm_add_pi16(
+        _mm_mullo_pi16(_mm_unpacklo_pi8(top, kZero), top_weight),
+        _mm_mullo_pi16(_mm_unpacklo_pi8(bottom, kZero), bottom_weight));
+    // Same for the high four bytes.
+    __m64 high = _mm_add_pi16(
+        _mm_mullo_pi16(_mm_unpackhi_pi8(top, kZero), top_weight),
+        _mm_mullo_pi16(_mm_unpackhi_pi8(bottom, kZero), bottom_weight));
+    // Drop the 8 fractional bits and narrow back to unsigned bytes.
+    low = _mm_srli_pi16(low, 8);
+    high = _mm_srli_pi16(high, 8);
+    *out++ = _mm_packs_pu16(low, high);
+  } while (out < out_end);
+}
+
+} // namespace gfx
+} // namespace mozilla
diff --git a/gfx/ycbcr/yuv_convert_sse2.cpp b/gfx/ycbcr/yuv_convert_sse2.cpp
new file mode 100644
index 0000000000..25fe20639d
--- /dev/null
+++ b/gfx/ycbcr/yuv_convert_sse2.cpp
@@ -0,0 +1,47 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include <emmintrin.h>
+#include "yuv_row.h"
+
+namespace mozilla {
+namespace gfx {
+
+// FilterRows_SSE2 blends two image rows by linear interpolation, computing
+// (y0 * (256 - fraction) + y1 * fraction) >> 8 in 16-bit lanes, 16 bytes per pass.
+void FilterRows_SSE2(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
+                     int source_width, int source_y_fraction) {
+  const __m128i kZero = _mm_setzero_si128();
+  const __m128i bottom_weight = _mm_set1_epi16(source_y_fraction);
+  const __m128i top_weight = _mm_set1_epi16(256 - source_y_fraction);
+
+  const __m128i* top_row = reinterpret_cast<const __m128i*>(y0_ptr);
+  const __m128i* bottom_row = reinterpret_cast<const __m128i*>(y1_ptr);
+  __m128i* out = reinterpret_cast<__m128i*>(ybuf);
+  __m128i* const out_end = reinterpret_cast<__m128i*>(ybuf + source_width);
+
+  // The body always runs at least once and writes whole 16-byte groups.
+  do {
+    // Both source rows are read with unaligned loads.
+    const __m128i top = _mm_loadu_si128(top_row++);
+    const __m128i bottom = _mm_loadu_si128(bottom_row++);
+
+    // Widen the low eight bytes of each row to 16 bits, weight and sum them.
+    __m128i low = _mm_add_epi16(
+        _mm_mullo_epi16(_mm_unpacklo_epi8(top, kZero), top_weight),
+        _mm_mullo_epi16(_mm_unpacklo_epi8(bottom, kZero), bottom_weight));
+    // Same for the high eight bytes.
+    __m128i high = _mm_add_epi16(
+        _mm_mullo_epi16(_mm_unpackhi_epi8(top, kZero), top_weight),
+        _mm_mullo_epi16(_mm_unpackhi_epi8(bottom, kZero), bottom_weight));
+    // Drop the 8 fractional bits and narrow back to unsigned bytes.
+    low = _mm_srli_epi16(low, 8);
+    high = _mm_srli_epi16(high, 8);
+    // Stored through a plain __m128i dereference (no unaligned-store intrinsic).
+    *out++ = _mm_packus_epi16(low, high);
+  } while (out < out_end);
+}
+
+} // namespace gfx
+} // namespace mozilla
diff --git a/gfx/ycbcr/yuv_row.h b/gfx/ycbcr/yuv_row.h
new file mode 100644
index 0000000000..f6fa139127
--- /dev/null
+++ b/gfx/ycbcr/yuv_row.h
@@ -0,0 +1,154 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// yuv_row internal functions to handle YUV conversion and scaling to RGB.
+// These functions are used from both yuv_convert.cc and yuv_scale.cc.
+
+// TODO(fbarchard): Write function that can handle rotation and scaling.
+
+#ifndef MEDIA_BASE_YUV_ROW_H_
+#define MEDIA_BASE_YUV_ROW_H_
+
+#include "chromium_types.h"
+
+extern "C" {
+// Can only do 1x.
+// This is the second fastest of the scalers.
+void FastConvertYUVToRGB32Row(const uint8* y_buf,
+                              const uint8* u_buf,
+                              const uint8* v_buf,
+                              uint8* rgb_buf,
+                              int width);
+
+void FastConvertYUVToRGB32Row_C(const uint8* y_buf,
+                                const uint8* u_buf,
+                                const uint8* v_buf,
+                                uint8* rgb_buf,
+                                int width,
+                                unsigned int x_shift);  // yuv_row_c.cpp reads chroma at index x >> x_shift
+
+void FastConvertYUVToRGB32Row(const uint8* y_buf,  // NOTE(review): exact repeat of the first prototype in this block
+                              const uint8* u_buf,
+                              const uint8* v_buf,
+                              uint8* rgb_buf,
+                              int width);
+
+// Can do 1x, half size or any scale down by an integer amount.
+// Step can be negative (mirroring, rotate 180).
+// This is the third fastest of the scalers.
+// Only defined on Windows x86-32.
+void ConvertYUVToRGB32Row_SSE(const uint8* y_buf,
+                              const uint8* u_buf,
+                              const uint8* v_buf,
+                              uint8* rgb_buf,
+                              int width,
+                              int step);
+
+// Rotate is like Convert, but applies different step to Y versus U and V.
+// This allows rotation by 90 or 270, by stepping by stride.
+// This is the fourth fastest of the scalers.
+// Only defined on Windows x86-32.
+void RotateConvertYUVToRGB32Row_SSE(const uint8* y_buf,
+                                    const uint8* u_buf,
+                                    const uint8* v_buf,
+                                    uint8* rgb_buf,
+                                    int width,
+                                    int ystep,
+                                    int uvstep);
+
+// Doubler does 4 pixels at a time.  Each pixel is replicated.
+// This is the fastest of the scalers.
+// Only defined on Windows x86-32.
+void DoubleYUVToRGB32Row_SSE(const uint8* y_buf,
+                             const uint8* u_buf,
+                             const uint8* v_buf,
+                             uint8* rgb_buf,
+                             int width);
+
+// Handles arbitrary scaling up or down.
+// Mirroring is supported, but not 90 or 270 degree rotation.
+// Chroma is under sampled every 2 pixels for performance.
+void ScaleYUVToRGB32Row(const uint8* y_buf,
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* rgb_buf,
+                        int width,
+                        int source_dx);
+
+void ScaleYUVToRGB32Row(const uint8* y_buf,  // NOTE(review): exact repeat of the prototype directly above
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* rgb_buf,
+                        int width,
+                        int source_dx);
+
+void ScaleYUVToRGB32Row_C(const uint8* y_buf,
+                          const uint8* u_buf,
+                          const uint8* v_buf,
+                          uint8* rgb_buf,
+                          int width,
+                          int source_dx);
+
+// Handles arbitrary scaling up or down with bilinear filtering.
+// Mirroring is supported, but not 90 or 270 degree rotation.
+// Chroma is under sampled every 2 pixels for performance.
+// This is the slowest of the scalers.
+void LinearScaleYUVToRGB32Row(const uint8* y_buf,
+                              const uint8* u_buf,
+                              const uint8* v_buf,
+                              uint8* rgb_buf,
+                              int width,
+                              int source_dx);
+
+void LinearScaleYUVToRGB32Row(const uint8* y_buf,  // NOTE(review): exact repeat of the prototype directly above
+                              const uint8* u_buf,
+                              const uint8* v_buf,
+                              uint8* rgb_buf,
+                              int width,
+                              int source_dx);
+
+void LinearScaleYUVToRGB32Row_C(const uint8* y_buf,
+                                const uint8* u_buf,
+                                const uint8* v_buf,
+                                uint8* rgb_buf,
+                                int width,
+                                int source_dx);
+
+
+#if defined(_MSC_VER) && !defined(__CLR_VER) && !defined(__clang__)
+#if defined(VISUALC_HAS_AVX2)
+#define SIMD_ALIGNED(var) __declspec(align(32)) var
+#else
+#define SIMD_ALIGNED(var) __declspec(align(16)) var
+#endif
+#elif defined(__GNUC__) || defined(__clang__)
+// Caveat: GCC 4.2 to 4.7 have a known issue using vectors with const.
+#if defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2)
+#define SIMD_ALIGNED(var) var __attribute__((aligned(32)))
+#else
+#define SIMD_ALIGNED(var) var __attribute__((aligned(16)))
+#endif
+#else
+#define SIMD_ALIGNED(var) var
+#endif
+
+extern SIMD_ALIGNED(const int16 kCoefficientsRgbY[768][4]);  // yuv_row_c.cpp indexes rows [y], [256 + u] and [512 + v]
+
+// x64 uses MMX2 (SSE) so emms is not required.
+// Warning C4799: function has no EMMS instruction.
+// EMMS() is slow and should be called by the calling function once per image.
+#if defined(ARCH_CPU_X86) && !defined(ARCH_CPU_X86_64)
+#if defined(_MSC_VER)
+#define EMMS() __asm emms
+#pragma warning(disable: 4799)
+#else
+#define EMMS() asm("emms")
+#endif
+#else
+#define EMMS() ((void)0)
+#endif
+
+}  // extern "C"
+
+#endif  // MEDIA_BASE_YUV_ROW_H_
diff --git a/gfx/ycbcr/yuv_row_arm.s b/gfx/ycbcr/yuv_row_arm.s
new file mode 100644
index 0000000000..6a6c81beeb
--- /dev/null
+++ b/gfx/ycbcr/yuv_row_arm.s
@@ -0,0 +1,304 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+    .arch   armv7-a
+    .fpu    neon
+/* Allow building on targets that do not support NEON, and force the object
+ * file's target architecture so that it does not bump the final binary's. */
+    .object_arch armv4t
+    .text
+    .align
+
+    .balign 64
+YCbCr42xToRGB565_DITHER03_CONSTS_NEON:  @ 16-byte entry; the row function picks an entry at dither*16 from here
+    .short -14240          @ bias_R pair (the row function broadcasts it from D30[0])
+    .short -14240+384
+    .short   8672          @ bias_G pair (D30[1])
+    .short   8672+192
+    .short -17696          @ bias_B pair (D31[0])
+    .short -17696+384
+    .byte 102              @ These four bytes land in D31[1], which the row function never reads;
+    .byte  25              @  it builds 102/25/52/129 with VMOV.I8 immediates instead.
+    .byte  52
+    .byte 129
+YCbCr42xToRGB565_DITHER12_CONSTS_NEON:  @ same layout as the first entry
+    .short -14240+128
+    .short -14240+256
+    .short   8672+64
+    .short   8672+128
+    .short -17696+128
+    .short -17696+256
+    .byte 102
+    .byte  25
+    .byte  52
+    .byte 129
+YCbCr42xToRGB565_DITHER21_CONSTS_NEON:  @ same layout as the first entry
+    .short -14240+256
+    .short -14240+128
+    .short   8672+128
+    .short   8672+64
+    .short -17696+256
+    .short -17696+128
+    .byte 102
+    .byte  25
+    .byte  52
+    .byte 129
+YCbCr42xToRGB565_DITHER30_CONSTS_NEON:  @ same layout as the first entry
+    .short -14240+384
+    .short -14240
+    .short   8672+192
+    .short   8672
+    .short -17696+384
+    .short -17696
+    .byte 102
+    .byte  25
+    .byte  52
+    .byte 129
+
+@ void ScaleYCbCr42xToRGB565_BilinearY_Row_NEON(
+@  yuv2rgb565_row_scale_bilinear_ctx *ctx, int dither);
+@
+@ ctx = {
+@   uint16_t *rgb_row;       /*r0*/
+@   const uint8_t *y_row;    /*r1*/
+@   const uint8_t *u_row;    /*r2*/
+@   const uint8_t *v_row;    /*r3*/
+@   int y_yweight;           /*r4*/
+@   int y_pitch;             /*r5*/
+@   int width;               /*r6*/
+@   int source_x0_q16;       /*r7*/
+@   int source_dx_q16;       /*r8*/
+@   int source_uv_xoffs_q16; /*r9*/
+@ };
+    .global ScaleYCbCr42xToRGB565_BilinearY_Row_NEON
+    .type   ScaleYCbCr42xToRGB565_BilinearY_Row_NEON, %function
+    .balign 64
+    .fnstart
+ScaleYCbCr42xToRGB565_BilinearY_Row_NEON:
+    STMFD       r13!,{r4-r9,r14}       @ 7 words.
+    ADR         r14,YCbCr42xToRGB565_DITHER03_CONSTS_NEON
+    VPUSH       {Q4-Q7}                @ 16 words.
+    ADD         r14,r14,r1, LSL #4     @ Select the dither table to use (16 bytes per table)
+    LDMIA       r0, {r0-r9}            @ Load every ctx field into the register named above
+    @ Set up image index registers.
+    ADD         r12,r8, r8
+    VMOV.I32    D16,#0         @ Q8 = < 2| 2| 0| 0>*source_dx_q16
+    VDUP.32     D17,r12
+    ADD         r12,r12,r12
+    VTRN.32     D16,D17        @ Q8 = < 2| 0| 2| 0>*source_dx_q16
+    VDUP.32     D19,r12        @ Q9 = < 4| 4| ?| ?>*source_dx_q16
+    ADD         r12,r12,r12
+    VDUP.32     Q0, r7         @ Q0 = < 1| 1| 1| 1>*source_x0_q16
+    VADD.I32    D17,D17,D19    @ Q8 = < 6| 4| 2| 0>*source_dx_q16
+    CMP         r8, #0                 @ If source_dx_q16 is negative...
+    VDUP.32     Q9, r12        @ Q9 = < 8| 8| 8| 8>*source_dx_q16
+    ADDLT       r7, r7, r8, LSL #4     @ Make r7 point to the end of the block
+    VADD.I32    Q0, Q0, Q8     @ Q0 = < 6| 4| 2| 0>*source_dx_q16+source_x0_q16
+    SUBLT       r7, r7, r8             @ (i.e., the lowest address we'll use)
+    VADD.I32    Q1, Q0, Q9     @ Q1 = <14|12|10| 8>*source_dx_q16+source_x0_q16
+    VDUP.I32    Q9, r8         @ Q9 = < 1| 1| 1| 1>*source_dx_q16
+    VADD.I32    Q2, Q0, Q9     @ Q2 = < 7| 5| 3| 1>*source_dx_q16+source_x0_q16
+    VADD.I32    Q3, Q1, Q9     @ Q3 = <15|13|11| 9>*source_dx_q16+source_x0_q16
+    VLD1.64     {D30,D31},[r14,:128]   @ Load some constants
+    VMOV.I8     D28,#52
+    VMOV.I8     D29,#129
+    @ The basic idea here is to do aligned loads of a block of data and then
+    @  index into it using VTBL to extract the data from the source X
+    @  coordinate corresponding to each destination pixel.
+    @ This is significantly less code and significantly fewer cycles than doing
+    @  a series of single-lane loads, but it means that the X step between
+    @  pixels must be limited to 2.0 or less, otherwise we couldn't guarantee
+    @  that we could read 8 pixels from a single aligned 32-byte block of data.
+    @ Q0...Q3 contain the 16.16 fixed-point X coordinates of each pixel,
+    @  separated into even pixels and odd pixels to make extracting offsets and
+    @  weights easier.
+    @ We then pull out two bytes from the middle of each coordinate: the top
+    @  byte corresponds to the integer part of the X coordinate, and the bottom
+    @  byte corresponds to the weight to use for bilinear blending.
+    @ These are separated out into different registers with VTRN.
+    @ Then by subtracting the integer X coordinate of the first pixel in the
+    @  data block we loaded, we produce an index register suitable for use by
+    @  VTBL.
+s42xbily_neon_loop:
+    @ Load the Y' data.
+    MOV         r12,r7, ASR #16
+    VRSHRN.S32  D16,Q0, #8
+    AND         r12,r12,#~15   @ Read 16-byte aligned blocks
+    VDUP.I8     D20,r12
+    ADD         r12,r1, r12    @ r12 = y_row+(source_x&~15)
+    VRSHRN.S32  D17,Q1, #8
+    PLD         [r12,#64]
+    VLD1.64     {D8, D9, D10,D11},[r12,:128],r5        @ Load Y' top row
+    ADD         r14,r7, r8, LSL #3
+    VRSHRN.S32  D18,Q2, #8
+    MOV         r14,r14,ASR #16
+    VRSHRN.S32  D19,Q3, #8
+    AND         r14,r14,#~15   @ Read 16-byte aligned blocks
+    VLD1.64     {D12,D13,D14,D15},[r12,:128]           @ Load Y' bottom row
+    PLD         [r12,#64]
+    VDUP.I8     D21,r14
+    ADD         r14,r1, r14    @ r14 = y_row+(source_x&~15)
+    VMOV.I8     Q13,#1
+    PLD         [r14,#64]
+    VTRN.8      Q8, Q9         @ Q8  = <wFwEwDwCwBwAw9w8w7w6w5w4w3w2w1w0>
+                               @ Q9  = <xFxExDxCxBxAx9x8x7x6x5x4x3x2x1x0>
+    VSUB.S8     Q9, Q9, Q10    @ Make offsets relative to the data we loaded.
+    @ First 8 Y' pixels
+    VTBL.8      D20,{D8, D9, D10,D11},D18      @ Index top row at source_x
+    VTBL.8      D24,{D12,D13,D14,D15},D18      @ Index bottom row at source_x
+    VADD.S8     Q13,Q9, Q13                    @ Add 1 to source_x
+    VTBL.8      D22,{D8, D9, D10,D11},D26      @ Index top row at source_x+1
+    VTBL.8      D26,{D12,D13,D14,D15},D26      @ Index bottom row at source_x+1
+    @ Next 8 Y' pixels
+    VLD1.64     {D8, D9, D10,D11},[r14,:128],r5        @ Load Y' top row
+    VLD1.64     {D12,D13,D14,D15},[r14,:128]           @ Load Y' bottom row
+    PLD         [r14,#64]
+    VTBL.8      D21,{D8, D9, D10,D11},D19      @ Index top row at source_x
+    VTBL.8      D25,{D12,D13,D14,D15},D19      @ Index bottom row at source_x
+    VTBL.8      D23,{D8, D9, D10,D11},D27      @ Index top row at source_x+1
+    VTBL.8      D27,{D12,D13,D14,D15},D27      @ Index bottom row at source_x+1
+    @ Blend Y'.
+    VDUP.I16    Q9, r4         @ Load the y weights.
+    VSUBL.U8    Q4, D24,D20    @ Q5:Q4 = c-a
+    VSUBL.U8    Q5, D25,D21
+    VSUBL.U8    Q6, D26,D22    @ Q7:Q6 = d-b
+    VSUBL.U8    Q7, D27,D23
+    VMUL.S16    Q4, Q4, Q9     @ Q5:Q4 = (c-a)*yweight
+    VMUL.S16    Q5, Q5, Q9
+    VMUL.S16    Q6, Q6, Q9     @ Q7:Q6 = (d-b)*yweight
+    VMUL.S16    Q7, Q7, Q9
+    VMOVL.U8    Q12,D16        @ Promote the x weights to 16 bits.
+    VMOVL.U8    Q13,D17        @ Sadly, there's no VMULW.
+    VRSHRN.S16  D8, Q4, #8     @ Q4 = (c-a)*yweight+128>>8
+    VRSHRN.S16  D9, Q5, #8
+    VRSHRN.S16  D12,Q6, #8     @ Q6 = (d-b)*yweight+128>>8
+    VRSHRN.S16  D13,Q7, #8
+    VADD.I8     Q10,Q10,Q4     @ Q10 = a+((c-a)*yweight+128>>8)
+    VADD.I8     Q11,Q11,Q6     @ Q11 = b+((d-b)*yweight+128>>8)
+    VSUBL.U8    Q4, D22,D20    @ Q5:Q4 = b-a
+    VSUBL.U8    Q5, D23,D21
+    VMUL.S16    Q4, Q4, Q12    @ Q5:Q4 = (b-a)*xweight
+    VMUL.S16    Q5, Q5, Q13
+    VRSHRN.S16  D8, Q4, #8     @ Q4 = (b-a)*xweight+128>>8
+    ADD         r12,r7, r9
+    VRSHRN.S16  D9, Q5, #8
+    MOV         r12,r12,ASR #17
+    VADD.I8     Q8, Q10,Q4     @ Q8 = a+((b-a)*xweight+128>>8)
+    @ Start extracting the chroma x coordinates, and load Cb and Cr.
+    AND         r12,r12,#~15   @ Read 16-byte aligned blocks
+    VDUP.I32    Q9, r9         @ Q9 = source_uv_xoffs_q16 x 4
+    ADD         r14,r2, r12
+    VADD.I32    Q10,Q0, Q9
+    VLD1.64     {D8, D9, D10,D11},[r14,:128]   @ Load Cb
+    PLD         [r14,#64]
+    VADD.I32    Q11,Q1, Q9
+    ADD         r14,r3, r12
+    VADD.I32    Q12,Q2, Q9
+    VLD1.64     {D12,D13,D14,D15},[r14,:128]   @ Load Cr
+    PLD         [r14,#64]
+    VADD.I32    Q13,Q3, Q9
+    VRSHRN.S32  D20,Q10,#9     @ Q10 = <xEwExCwCxAwAx8w8x6w6x4w4x2w2x0w0>
+    VRSHRN.S32  D21,Q11,#9
+    VDUP.I8     Q9, r12
+    VRSHRN.S32  D22,Q12,#9     @ Q11 = <xFwFxDwDxBwBx9w9x7w7x5w5x3w3x1w1>
+    VRSHRN.S32  D23,Q13,#9
+    @ We don't actually need the x weights, but we get them for free.
+    @ Free ALU slot
+    VTRN.8      Q10,Q11        @ Q10 = <wFwEwDwCwBwAw9w8w7w6w5w4w3w2w1w0>
+    @ Free ALU slot            @ Q11 = <xFxExDxCxBxAx9x8x7x6x5x4x3x2x1x0>
+    VSUB.S8     Q11,Q11,Q9     @ Make offsets relative to the data we loaded.
+    VTBL.8      D18,{D8, D9, D10,D11},D22      @ Index Cb at source_x
+    VMOV.I8     D24,#74
+    VTBL.8      D19,{D8, D9, D10,D11},D23
+    VMOV.I8     D26,#102
+    VTBL.8      D20,{D12,D13,D14,D15},D22      @ Index Cr at source_x
+    VMOV.I8     D27,#25
+    VTBL.8      D21,{D12,D13,D14,D15},D23
+    @ We now have Y' in Q8, Cb in Q9, and Cr in Q10
+    @ We use VDUP to expand constants, because it's a permute instruction, so
+    @  it can dual issue on the A8.
+    SUBS        r6, r6, #16    @ width -= 16
+    VMULL.U8    Q4, D16,D24    @  Q5:Q4  = Y'*74
+    VDUP.32     Q6, D30[1]     @  Q7:Q6  = bias_G
+    VMULL.U8    Q5, D17,D24
+    VDUP.32     Q7, D30[1]
+    VMLSL.U8    Q6, D18,D27    @  Q7:Q6  = -25*Cb+bias_G
+    VDUP.32     Q11,D30[0]     @ Q12:Q11 = bias_R
+    VMLSL.U8    Q7, D19,D27
+    VDUP.32     Q12,D30[0]
+    VMLAL.U8    Q11,D20,D26    @ Q12:Q11 = 102*Cr+bias_R
+    VDUP.32     Q8, D31[0]     @ Q13:Q8  = bias_B
+    VMLAL.U8    Q12,D21,D26
+    VDUP.32     Q13,D31[0]
+    VMLAL.U8    Q8, D18,D29    @ Q13:Q8  = 129*Cb+bias_B
+    VMLAL.U8    Q13,D19,D29
+    VMLSL.U8    Q6, D20,D28    @  Q7:Q6  = -25*Cb-52*Cr+bias_G
+    VMLSL.U8    Q7, D21,D28
+    VADD.S16    Q11,Q4, Q11    @ Q12:Q11 = 74*Y'+102*Cr+bias_R
+    VADD.S16    Q12,Q5, Q12
+    VQADD.S16   Q8, Q4, Q8     @ Q13:Q8  = 74*Y'+129*Cb+bias_B
+    VQADD.S16   Q13,Q5, Q13
+    VADD.S16    Q6, Q4, Q6     @  Q7:Q6  = 74*Y'-25*Cb-52*Cr+bias_G
+    VADD.S16    Q7, Q5, Q7
+    @ Push each value to the top of its word and saturate it.
+    VQSHLU.S16 Q11,Q11,#2
+    VQSHLU.S16 Q12,Q12,#2
+    VQSHLU.S16 Q6, Q6, #2
+    VQSHLU.S16 Q7, Q7, #2
+    VQSHLU.S16 Q8, Q8, #2
+    VQSHLU.S16 Q13,Q13,#2
+    @ Merge G and B into R.
+    VSRI.U16   Q11,Q6, #5
+    VSRI.U16   Q12,Q7, #5
+    VSRI.U16   Q11,Q8, #11
+    MOV         r14,r8, LSL #4 @ r14 = 16*source_dx_q16, the per-iteration x advance
+    VSRI.U16   Q12,Q13,#11
+    BLT s42xbily_neon_tail     @ Flags are still those of the SUBS above: fewer than 16 pixels were left
+    VDUP.I32    Q13,r14
+    @ Store the result.
+    VST1.16     {D22,D23,D24,D25},[r0]!
+    BEQ s42xbily_neon_done     @ Exactly 16 pixels were left: nothing more to do
+    @ Advance the x coordinates.
+    VADD.I32    Q0, Q0, Q13
+    VADD.I32    Q1, Q1, Q13
+    ADD         r7, r14
+    VADD.I32    Q2, Q2, Q13
+    VADD.I32    Q3, Q3, Q13
+    B s42xbily_neon_loop
+s42xbily_neon_tail:
+    @ We have between 1 and 15 pixels left to write.
+    @ -r6 == the number of pixels we need to skip writing.
+    @ Adjust r0 to point to the last one we need to write, because we're going
+    @  to write them in reverse order.
+    ADD         r0, r0, r6, LSL #1
+    MOV         r14,#-2
+    ADD         r0, r0, #30
+    @ Skip past the ones we don't need to write.
+    SUB         PC, PC, r6, LSL #2     @ PC reads as this instruction+8 (the first VST1); add 4 bytes per skipped pixel
+    ORR         r0, r0, r0             @ No-op filler for the slot between the SUB and the first VST1
+    VST1.16     {D25[3]},[r0,:16],r14
+    VST1.16     {D25[2]},[r0,:16],r14
+    VST1.16     {D25[1]},[r0,:16],r14
+    VST1.16     {D25[0]},[r0,:16],r14
+    VST1.16     {D24[3]},[r0,:16],r14
+    VST1.16     {D24[2]},[r0,:16],r14
+    VST1.16     {D24[1]},[r0,:16],r14
+    VST1.16     {D24[0]},[r0,:16],r14
+    VST1.16     {D23[3]},[r0,:16],r14
+    VST1.16     {D23[2]},[r0,:16],r14
+    VST1.16     {D23[1]},[r0,:16],r14
+    VST1.16     {D23[0]},[r0,:16],r14
+    VST1.16     {D22[3]},[r0,:16],r14
+    VST1.16     {D22[2]},[r0,:16],r14
+    VST1.16     {D22[1]},[r0,:16],r14
+    VST1.16     {D22[0]},[r0,:16]
+s42xbily_neon_done:
+    VPOP        {Q4-Q7}                @ 16 words.
+    LDMFD       r13!,{r4-r9,PC}        @ 7 words.
+    .fnend
+    .size ScaleYCbCr42xToRGB565_BilinearY_Row_NEON, .-ScaleYCbCr42xToRGB565_BilinearY_Row_NEON
+
+#if defined(__ELF__)&&defined(__linux__)
+    .section .note.GNU-stack,"",%progbits
+#endif
diff --git a/gfx/ycbcr/yuv_row_c.cpp b/gfx/ycbcr/yuv_row_c.cpp
new file mode 100644
index 0000000000..d327f854ee
--- /dev/null
+++ b/gfx/ycbcr/yuv_row_c.cpp
@@ -0,0 +1,133 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "yuv_row.h"
+
+#define DCHECK(a)
+
+extern "C" {
+
+// C reference code that mimics the YUV assembly.
+#define packuswb(x) ((x) < 0 ? 0 : ((x) > 255 ? 255 : (x)))  // Clamps x to [0, 255]; x is evaluated more than once.
+#define paddsw(x, y) (((x) + (y)) < -32768 ? -32768 : /* saturating add */ \
+    (((x) + (y)) > 32767 ? 32767 : ((x) + (y))))  // x + y clamped to [-32768, 32767].
+
+// Converts one (y, u, v) sample to a 32-bit pixel and stores it at |rgb_buf|.
+// kCoefficientsRgbY rows are indexed by y, 256 + u and 512 + v; each row holds
+// the b, g, r and a contributions as 16-bit values with 6 fractional bits.
+static inline void YuvPixel(uint8 y, uint8 u, uint8 v, uint8* rgb_buf) {
+  const int16* y_row = kCoefficientsRgbY[y];
+  const int16* u_row = kCoefficientsRgbY[256 + u];
+  const int16* v_row = kCoefficientsRgbY[512 + v];
+
+  // Sum u + v first and then add y, saturating to the int16 range after each
+  // step, so the result matches the paddsw sequence of the assembly versions.
+  int b = paddsw(u_row[0], v_row[0]);
+  int g = paddsw(u_row[1], v_row[1]);
+  int r = paddsw(u_row[2], v_row[2]);
+  int a = paddsw(u_row[3], v_row[3]);
+  b = paddsw(b, y_row[0]) >> 6;  // Drop the fractional bits.
+  g = paddsw(g, y_row[1]) >> 6;
+  r = paddsw(r, y_row[2]) >> 6;
+  a = paddsw(a, y_row[3]) >> 6;
+
+  // Clamp every channel to 8 bits, then widen to uint32 before shifting:
+  // shifting the int value 255 left by 24 does not fit in a signed int.
+  const uint32 blue = static_cast<uint32>(packuswb(b));
+  const uint32 green = static_cast<uint32>(packuswb(g));
+  const uint32 red = static_cast<uint32>(packuswb(r));
+  const uint32 alpha = static_cast<uint32>(packuswb(a));
+
+  // Stored as one native-endian word: a in bits 31-24, r, g, b in bits 7-0.
+  *reinterpret_cast<uint32*>(rgb_buf) =
+      blue | (green << 8) | (red << 16) | (alpha << 24);
+}
+
+// Converts |width| pixels of one row to 32-bit output, two pixels per pass.
+// Chroma is read at index (x >> x_shift); when x_shift is 0 the odd pixel of
+// each pair reads its own chroma sample, otherwise it reuses the even one's.
+void FastConvertYUVToRGB32Row_C(const uint8* y_buf, const uint8* u_buf,
+                                const uint8* v_buf, uint8* rgb_buf,
+                                int width, unsigned int x_shift) {
+  uint8* out = rgb_buf;
+  for (int even = 0; even < width; even += 2, out += 8) {
+    const int chroma = even >> x_shift;
+    uint8 u = u_buf[chroma];
+    uint8 v = v_buf[chroma];
+    YuvPixel(y_buf[even], u, v, out);
+
+    const int odd = even + 1;
+    if (odd >= width) continue;  // Odd width: the last pair has one pixel.
+    if (x_shift == 0) {
+      u = u_buf[odd];
+      v = v_buf[odd];
+    }
+    YuvPixel(y_buf[odd], u, v, out + 4);
+  }
+}
+
+// 16.16 fixed point is used.  A shift by 16 isolates the integer.
+// A shift by 17 is used to further subsample the chrominance channels.
+// & 0xffff isolates the fixed point fraction, which the linear scaler uses
+// as its blend weight, for 1/65536 pixel accurate interpolation.
+// Point-samples a source row into |width| output pixels.  The source position
+// advances by |source_dx| per output pixel; the luma index is position >> 16
+// and the chroma index position >> 17, shared by both pixels of a pair.
+void ScaleYUVToRGB32Row_C(const uint8* y_buf, const uint8* u_buf,
+                          const uint8* v_buf, uint8* rgb_buf,
+                          int width, int source_dx) {
+  int src_pos = 0;
+  uint8* out = rgb_buf;
+  for (int col = 0; col < width; col += 2, out += 8) {
+    const int chroma = src_pos >> 17;
+    const int u = u_buf[chroma];
+    const int v = v_buf[chroma];
+    YuvPixel(y_buf[src_pos >> 16], u, v, out);
+    src_pos += source_dx;
+
+    if (col + 1 < width) {
+      YuvPixel(y_buf[src_pos >> 16], u, v, out + 4);
+      src_pos += source_dx;
+    }
+  }
+}
+
+// Blends two neighbouring samples with the 16-bit weight |frac|: |hi| is
+// weighted by frac and |lo| by (frac ^ 65535).
+static inline int BlendSamples(int lo, int hi, int frac) {
+  return (frac * hi + (frac ^ 65535) * lo) >> 16;
+}
+
+// Scales a source row into |width| output pixels, linearly interpolating each
+// luma and chroma value between the sample at the current 16.16 position and
+// the sample after it.  Chroma is computed once per pair of output pixels.
+void LinearScaleYUVToRGB32Row_C(const uint8* y_buf, const uint8* u_buf,
+                                const uint8* v_buf, uint8* rgb_buf,
+                                int width, int source_dx) {
+  // Start half a source pixel in when source_dx is 2.0 or more.
+  int src_pos = (source_dx >= 0x20000) ? 32768 : 0;
+  uint8* out = rgb_buf;
+  for (int col = 0; col < width; col += 2, out += 8) {
+    // Chroma advances at half the luma rate: index x >> 17, weight from x >> 1.
+    const int chroma = src_pos >> 17;
+    const int chroma_frac = (src_pos >> 1) & 65535;
+    const int u = BlendSamples(u_buf[chroma], u_buf[chroma + 1], chroma_frac);
+    const int v = BlendSamples(v_buf[chroma], v_buf[chroma + 1], chroma_frac);
+
+    int luma = src_pos >> 16;
+    int y = BlendSamples(y_buf[luma], y_buf[luma + 1], src_pos & 65535);
+    YuvPixel(y, u, v, out);
+    src_pos += source_dx;
+
+    if (col + 1 < width) {
+      luma = src_pos >> 16;
+      y = BlendSamples(y_buf[luma], y_buf[luma + 1], src_pos & 65535);
+      YuvPixel(y, u, v, out + 4);
+      src_pos += source_dx;
+    }
+  }
+}
+
+}  // extern "C"
+
diff --git a/gfx/ycbcr/yuv_row_other.cpp b/gfx/ycbcr/yuv_row_other.cpp
new file mode 100644
index 0000000000..c351139f90
--- /dev/null
+++ b/gfx/ycbcr/yuv_row_other.cpp
@@ -0,0 +1,34 @@
+// Copyright (c) 2009 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "yuv_row.h"
+
+extern "C" {
+// Portable entry point: forwards to the C reference converter with x_shift 1.
+void FastConvertYUVToRGB32Row(const uint8* y_buf, const uint8* u_buf,
+                              const uint8* v_buf, uint8* rgb_buf,
+                              int width) {
+  const unsigned int x_shift = 1;
+  FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, x_shift);
+}
+
+// Portable entry point: point-sampled scaling is delegated unchanged to the
+// C reference implementation.
+void ScaleYUVToRGB32Row(const uint8* y_buf, const uint8* u_buf,
+                        const uint8* v_buf, uint8* rgb_buf,
+                        int width, int source_dx) {
+  ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf,
+                       width, source_dx);
+}
+
+// Portable entry point: interpolated scaling is delegated unchanged to the
+// C reference implementation.
+void LinearScaleYUVToRGB32Row(const uint8* y_buf, const uint8* u_buf,
+                              const uint8* v_buf, uint8* rgb_buf,
+                              int width, int source_dx) {
+  LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf,
+                             width, source_dx);
+}
+
+} 
diff --git a/gfx/ycbcr/yuv_row_posix.cpp b/gfx/ycbcr/yuv_row_posix.cpp
new file mode 100644
index 0000000000..30f1b6e3ba
--- /dev/null
+++ b/gfx/ycbcr/yuv_row_posix.cpp
@@ -0,0 +1,914 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "yuv_row.h"
+#include "mozilla/SSE.h"
+
+#define DCHECK(a)
+
+extern "C" {
+
+#if defined(ARCH_CPU_X86_64)
+
+// We don't need CPUID guards here, since x86-64 implies SSE2.
+
+// AMD64 ABI uses register parameters.
+void FastConvertYUVToRGB32Row(const uint8* y_buf,  // rdi
+                              const uint8* u_buf,  // rsi
+                              const uint8* v_buf,  // rdx
+                              uint8* rgb_buf,      // rcx
+                              int width) {         // r8
+  asm volatile(  // Converts two pixels per loop pass, then one trailing pixel if width is odd.
+  "jmp    1f\n"  // Enter at the loop test so that width < 2 skips the pair loop.
+"0:"
+  "movzb  (%[u_buf]),%%r10\n"  // r10 = *u_buf
+  "add    $0x1,%[u_buf]\n"
+  "movzb  (%[v_buf]),%%r11\n"  // r11 = *v_buf
+  "add    $0x1,%[v_buf]\n"
+  "movq   2048(%[kCoefficientsRgbY],%%r10,8),%%xmm0\n"  // U row: 8 bytes per row, U rows start at byte 2048.
+  "movzb  (%[y_buf]),%%r10\n"  // r10 = y[0]
+  "movq   4096(%[kCoefficientsRgbY],%%r11,8),%%xmm1\n"  // V row: V rows start at byte 4096.
+  "movzb  0x1(%[y_buf]),%%r11\n"  // r11 = y[1]
+  "paddsw %%xmm1,%%xmm0\n"  // xmm0 = U + V terms (saturating), shared by both pixels.
+  "movq   (%[kCoefficientsRgbY],%%r10,8),%%xmm2\n"  // Y row for pixel 0.
+  "add    $0x2,%[y_buf]\n"
+  "movq   (%[kCoefficientsRgbY],%%r11,8),%%xmm3\n"  // Y row for pixel 1.
+  "paddsw %%xmm0,%%xmm2\n"  // Pixel 0 = Y0 + UV.
+  "paddsw %%xmm0,%%xmm3\n"  // Pixel 1 = Y1 + UV.
+  "shufps $0x44,%%xmm3,%%xmm2\n"  // xmm2 = low 64 bits of xmm2, then low 64 bits of xmm3.
+  "psraw  $0x6,%%xmm2\n"  // Drop the 6 fractional bits.
+  "packuswb %%xmm2,%%xmm2\n"  // Clamp each 16-bit lane to an unsigned byte.
+  "movq   %%xmm2,0x0(%[rgb_buf])\n"  // Store 8 bytes: two output pixels.
+  "add    $0x8,%[rgb_buf]\n"
+"1:"
+  "sub    $0x2,%[width]\n"
+  "jns    0b\n"  // Loop while at least two pixels remained before the subtraction.
+
+"2:"
+  "add    $0x1,%[width]\n"  // width is -2 or -1 here; -1 means one pixel is left.
+  "js     3f\n"
+
+  "movzb  (%[u_buf]),%%r10\n"  // Trailing pixel: same computation for a single sample.
+  "movq   2048(%[kCoefficientsRgbY],%%r10,8),%%xmm0\n"
+  "movzb  (%[v_buf]),%%r10\n"
+  "movq   4096(%[kCoefficientsRgbY],%%r10,8),%%xmm1\n"
+  "paddsw %%xmm1,%%xmm0\n"
+  "movzb  (%[y_buf]),%%r10\n"
+  "movq   (%[kCoefficientsRgbY],%%r10,8),%%xmm1\n"
+  "paddsw %%xmm0,%%xmm1\n"
+  "psraw  $0x6,%%xmm1\n"
+  "packuswb %%xmm1,%%xmm1\n"
+  "movd   %%xmm1,0x0(%[rgb_buf])\n"  // Store 4 bytes: one output pixel.
+"3:"
+  : [y_buf] "+r"(y_buf),  // All five parameters are modified by the asm, hence "+r".
+    [u_buf] "+r"(u_buf),
+    [v_buf] "+r"(v_buf),
+    [rgb_buf] "+r"(rgb_buf),
+    [width] "+r"(width)
+  : [kCoefficientsRgbY] "r" (kCoefficientsRgbY)
+  : "cc", "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3"
+);
+}
+
+void ScaleYUVToRGB32Row(const uint8* y_buf,  // rdi
+                        const uint8* u_buf,  // rsi
+                        const uint8* v_buf,  // rdx
+                        uint8* rgb_buf,      // rcx
+                        int width,           // r8
+                        int source_dx) {     // r9
+  asm volatile(  // Point-sampled scaling: two pixels per loop pass, then one trailing pixel.
+  "xor    %%r11,%%r11\n"  // r11 = x = 0, the 16.16 fixed-point source position.
+  "sub    $0x2,%[width]\n"
+  "js     1f\n"  // Fewer than two pixels: skip the pair loop.
+
+"0:"
+  "mov    %%r11,%%r10\n"
+  "sar    $0x11,%%r10\n"  // r10 = x >> 17: chroma index for this pair.
+  "movzb  (%[u_buf],%%r10,1),%%rax\n"
+  "movq   2048(%[kCoefficientsRgbY],%%rax,8),%%xmm0\n"  // U row (byte offset 2048).
+  "movzb  (%[v_buf],%%r10,1),%%rax\n"
+  "movq   4096(%[kCoefficientsRgbY],%%rax,8),%%xmm1\n"  // V row (byte offset 4096).
+  "lea    (%%r11,%[source_dx]),%%r10\n"  // r10 = x + source_dx: position of the second pixel.
+  "sar    $0x10,%%r11\n"  // r11 = x >> 16: luma index of the first pixel.
+  "movzb  (%[y_buf],%%r11,1),%%rax\n"
+  "paddsw %%xmm1,%%xmm0\n"  // xmm0 = U + V terms, shared by both pixels.
+  "movq   (%[kCoefficientsRgbY],%%rax,8),%%xmm1\n"
+  "lea    (%%r10,%[source_dx]),%%r11\n"  // r11 = position of the next pair.
+  "sar    $0x10,%%r10\n"  // r10 = luma index of the second pixel.
+  "movzb  (%[y_buf],%%r10,1),%%rax\n"
+  "movq   (%[kCoefficientsRgbY],%%rax,8),%%xmm2\n"
+  "paddsw %%xmm0,%%xmm1\n"  // Pixel 0 = Y0 + UV.
+  "paddsw %%xmm0,%%xmm2\n"  // Pixel 1 = Y1 + UV.
+  "shufps $0x44,%%xmm2,%%xmm1\n"  // xmm1 = low 64 bits of xmm1, then low 64 bits of xmm2.
+  "psraw  $0x6,%%xmm1\n"  // Drop the 6 fractional bits.
+  "packuswb %%xmm1,%%xmm1\n"  // Clamp each 16-bit lane to an unsigned byte.
+  "movq   %%xmm1,0x0(%[rgb_buf])\n"  // Store two output pixels.
+  "add    $0x8,%[rgb_buf]\n"
+  "sub    $0x2,%[width]\n"
+  "jns    0b\n"
+
+"1:"
+  "add    $0x1,%[width]\n"  // width is -2 or -1 here; -1 means one pixel is left.
+  "js     2f\n"
+
+  "mov    %%r11,%%r10\n"  // Trailing pixel: same lookup for a single sample.
+  "sar    $0x11,%%r10\n"
+  "movzb  (%[u_buf],%%r10,1),%%rax\n"
+  "movq   2048(%[kCoefficientsRgbY],%%rax,8),%%xmm0\n"
+  "movzb  (%[v_buf],%%r10,1),%%rax\n"
+  "movq   4096(%[kCoefficientsRgbY],%%rax,8),%%xmm1\n"
+  "paddsw %%xmm1,%%xmm0\n"
+  "sar    $0x10,%%r11\n"
+  "movzb  (%[y_buf],%%r11,1),%%rax\n"
+  "movq   (%[kCoefficientsRgbY],%%rax,8),%%xmm1\n"
+  "paddsw %%xmm0,%%xmm1\n"
+  "psraw  $0x6,%%xmm1\n"
+  "packuswb %%xmm1,%%xmm1\n"
+  "movd   %%xmm1,0x0(%[rgb_buf])\n"  // Store one output pixel.
+
+"2:"
+  : [rgb_buf] "+r"(rgb_buf),  // Only rgb_buf and width are modified; the source pointers are indexed.
+    [width] "+r"(width)
+  : [y_buf] "r"(y_buf),
+    [u_buf] "r"(u_buf),
+    [v_buf] "r"(v_buf),
+    [kCoefficientsRgbY] "r" (kCoefficientsRgbY),
+    [source_dx] "r"(static_cast<long>(source_dx))  // Widened so it can be used in 64-bit lea.
+  : "cc", "memory", "r10", "r11", "rax", "xmm0", "xmm1", "xmm2"
+);
+}
+
+void LinearScaleYUVToRGB32Row(const uint8* y_buf,
+                              const uint8* u_buf,
+                              const uint8* v_buf,
+                              uint8* rgb_buf,
+                              int width,
+                              int source_dx) {
+  asm volatile(  // Interpolated scaling: two pixels per loop pass, then one trailing pixel.
+  "xor    %%r11,%%r11\n"   // x = 0
+  "sub    $0x2,%[width]\n"
+  "js     2f\n"  // Fewer than two pixels: skip the loop (x stays 0 on this path).
+  "cmp    $0x20000,%[source_dx]\n"   // if source_dx >= 2.0
+  "jl     0f\n"
+  "mov    $0x8000,%%r11\n" // x = 0.5 for 1/2 or less
+"0:"
+
+"1:"
+  "mov    %%r11,%%r10\n"
+  "sar    $0x11,%%r10\n"  // r10 = x >> 17: chroma index for this pair.
+
+  "movzb  (%[u_buf], %%r10, 1), %%r13 \n"  // r13 = u[i]
+  "movzb  1(%[u_buf], %%r10, 1), %%r14 \n"  // r14 = u[i + 1]
+  "mov    %%r11, %%rax \n"
+  "and    $0x1fffe, %%rax \n"  // rax = chroma blend weight taken from x (17-bit scale).
+  "imul   %%rax, %%r14 \n"  // u[i + 1] * weight
+  "xor    $0x1fffe, %%rax \n"  // rax = complementary weight.
+  "imul   %%rax, %%r13 \n"  // u[i] * complementary weight
+  "add    %%r14, %%r13 \n"
+  "shr    $17, %%r13 \n"  // r13 = interpolated u.
+  "movq   2048(%[kCoefficientsRgbY],%%r13,8), %%xmm0\n"  // U row (byte offset 2048).
+
+  "movzb  (%[v_buf], %%r10, 1), %%r13 \n"  // Same interpolation for v.
+  "movzb  1(%[v_buf], %%r10, 1), %%r14 \n"
+  "mov    %%r11, %%rax \n"
+  "and    $0x1fffe, %%rax \n"
+  "imul   %%rax, %%r14 \n"
+  "xor    $0x1fffe, %%rax \n"
+  "imul   %%rax, %%r13 \n"
+  "add    %%r14, %%r13 \n"
+  "shr    $17, %%r13 \n"
+  "movq   4096(%[kCoefficientsRgbY],%%r13,8), %%xmm1\n"  // V row (byte offset 4096).
+
+  "mov    %%r11, %%rax \n"  // rax = x, kept for the luma fraction.
+  "lea    (%%r11,%[source_dx]),%%r10\n"  // r10 = x + source_dx: position of the second pixel.
+  "sar    $0x10,%%r11\n"  // r11 = x >> 16: luma index of the first pixel.
+  "paddsw %%xmm1,%%xmm0\n"  // xmm0 = U + V terms, shared by both pixels.
+
+  "movzb  (%[y_buf], %%r11, 1), %%r13 \n"  // r13 = y[i]
+  "movzb  1(%[y_buf], %%r11, 1), %%r14 \n"  // r14 = y[i + 1]
+  "and    $0xffff, %%rax \n"  // rax = 16-bit luma blend weight.
+  "imul   %%rax, %%r14 \n"
+  "xor    $0xffff, %%rax \n"
+  "imul   %%rax, %%r13 \n"
+  "add    %%r14, %%r13 \n"
+  "shr    $16, %%r13 \n"  // r13 = interpolated luma of the first pixel.
+  "movq   (%[kCoefficientsRgbY],%%r13,8),%%xmm1\n"
+
+  "mov    %%r10, %%rax \n"
+  "lea    (%%r10,%[source_dx]),%%r11\n"  // r11 = position of the next pair.
+  "sar    $0x10,%%r10\n"  // r10 = luma index of the second pixel.
+
+  "movzb  (%[y_buf],%%r10,1), %%r13 \n"  // Same luma interpolation for the second pixel.
+  "movzb  1(%[y_buf],%%r10,1), %%r14 \n"
+  "and    $0xffff, %%rax \n"
+  "imul   %%rax, %%r14 \n"
+  "xor    $0xffff, %%rax \n"
+  "imul   %%rax, %%r13 \n"
+  "add    %%r14, %%r13 \n"
+  "shr    $16, %%r13 \n"
+  "movq   (%[kCoefficientsRgbY],%%r13,8),%%xmm2\n"
+
+  "paddsw %%xmm0,%%xmm1\n"  // Pixel 0 = Y0 + UV.
+  "paddsw %%xmm0,%%xmm2\n"  // Pixel 1 = Y1 + UV.
+  "shufps $0x44,%%xmm2,%%xmm1\n"  // xmm1 = low 64 bits of xmm1, then low 64 bits of xmm2.
+  "psraw  $0x6,%%xmm1\n"  // Drop the 6 fractional bits.
+  "packuswb %%xmm1,%%xmm1\n"  // Clamp each 16-bit lane to an unsigned byte.
+  "movq   %%xmm1,0x0(%[rgb_buf])\n"  // Store two output pixels.
+  "add    $0x8,%[rgb_buf]\n"
+  "sub    $0x2,%[width]\n"
+  "jns    1b\n"
+
+"2:"
+  "add    $0x1,%[width]\n"  // width is -2 or -1 here; -1 means one pixel is left.
+  "js     3f\n"
+
+  "mov    %%r11,%%r10\n"  // Trailing pixel: samples are read directly, without interpolation.
+  "sar    $0x11,%%r10\n"
+
+  "movzb  (%[u_buf],%%r10,1), %%r13 \n"
+  "movq   2048(%[kCoefficientsRgbY],%%r13,8),%%xmm0\n"
+
+  "movzb  (%[v_buf],%%r10,1), %%r13 \n"
+  "movq   4096(%[kCoefficientsRgbY],%%r13,8),%%xmm1\n"
+
+  "paddsw %%xmm1,%%xmm0\n"
+  "sar    $0x10,%%r11\n"
+
+  "movzb  (%[y_buf],%%r11,1), %%r13 \n"
+  "movq   (%[kCoefficientsRgbY],%%r13,8),%%xmm1\n"
+
+  "paddsw %%xmm0,%%xmm1\n"
+  "psraw  $0x6,%%xmm1\n"
+  "packuswb %%xmm1,%%xmm1\n"
+  "movd   %%xmm1,0x0(%[rgb_buf])\n"  // Store one output pixel.
+
+"3:"
+  : [rgb_buf] "+r"(rgb_buf),  // Only rgb_buf and width are modified; the source pointers are indexed.
+    [width] "+r"(width)
+  : [y_buf] "r"(y_buf),
+    [u_buf] "r"(u_buf),
+    [v_buf] "r"(v_buf),
+    [kCoefficientsRgbY] "r" (kCoefficientsRgbY),
+    [source_dx] "r"(static_cast<long>(source_dx))  // Widened so it can be used in 64-bit lea/cmp.
+  : "cc", "memory", "r10", "r11", "r13", "r14", "rax", "xmm0", "xmm1", "xmm2"
+);
+}
+
+#elif defined(MOZILLA_MAY_SUPPORT_SSE) && defined(ARCH_CPU_X86_32) && !defined(__PIC__)
+
+// PIC version is slower because fewer registers are available, so
+// non-PIC is used on platforms where it is possible.
+void FastConvertYUVToRGB32Row_SSE(const uint8* y_buf,  // Defined by the file-scope asm below.
+                                  const uint8* u_buf,
+                                  const uint8* v_buf,
+                                  uint8* rgb_buf,
+                                  int width);
+  asm(  // NOTE(review): uses %mm registers and never executes emms; presumably the caller does - confirm.
+  ".text\n"
+  ".global FastConvertYUVToRGB32Row_SSE\n"
+  ".type FastConvertYUVToRGB32Row_SSE, @function\n"
+"FastConvertYUVToRGB32Row_SSE:\n"
+  "pusha\n"  // Saves 8 registers (32 bytes), so the first stack argument is at 0x24(%esp).
+  "mov    0x24(%esp),%edx\n"  // edx = y_buf
+  "mov    0x28(%esp),%edi\n"  // edi = u_buf
+  "mov    0x2c(%esp),%esi\n"  // esi = v_buf
+  "mov    0x30(%esp),%ebp\n"  // ebp = rgb_buf
+  "mov    0x34(%esp),%ecx\n"  // ecx = width
+  "jmp    1f\n"  // Enter at the loop test so that width < 2 skips the pair loop.
+
+"0:"
+  "movzbl (%edi),%eax\n"  // eax = *u_buf
+  "add    $0x1,%edi\n"
+  "movzbl (%esi),%ebx\n"  // ebx = *v_buf
+  "add    $0x1,%esi\n"
+  "movq   kCoefficientsRgbY+2048(,%eax,8),%mm0\n"  // U row: 8 bytes per row, U rows start at byte 2048.
+  "movzbl (%edx),%eax\n"  // eax = y[0]
+  "paddsw kCoefficientsRgbY+4096(,%ebx,8),%mm0\n"  // mm0 = U + V terms (V rows start at byte 4096).
+  "movzbl 0x1(%edx),%ebx\n"  // ebx = y[1]
+  "movq   kCoefficientsRgbY(,%eax,8),%mm1\n"
+  "add    $0x2,%edx\n"
+  "movq   kCoefficientsRgbY(,%ebx,8),%mm2\n"
+  "paddsw %mm0,%mm1\n"  // Pixel 0 = Y0 + UV.
+  "paddsw %mm0,%mm2\n"  // Pixel 1 = Y1 + UV.
+  "psraw  $0x6,%mm1\n"  // Drop the 6 fractional bits.
+  "psraw  $0x6,%mm2\n"
+  "packuswb %mm2,%mm1\n"  // Clamp to bytes: pixel 0 in the low half, pixel 1 in the high half.
+  "movntq %mm1,0x0(%ebp)\n"  // Non-temporal 8-byte store of two pixels.
+  "add    $0x8,%ebp\n"
+"1:"
+  "sub    $0x2,%ecx\n"
+  "jns    0b\n"  // Loop while at least two pixels remained before the subtraction.
+
+  "and    $0x1,%ecx\n"  // ecx is -2 or -1 here; bit 0 is set only when one pixel is left.
+  "je     2f\n"
+
+  "movzbl (%edi),%eax\n"  // Trailing pixel: same computation for a single sample.
+  "movq   kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
+  "movzbl (%esi),%eax\n"
+  "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n"
+  "movzbl (%edx),%eax\n"
+  "movq   kCoefficientsRgbY(,%eax,8),%mm1\n"
+  "paddsw %mm0,%mm1\n"
+  "psraw  $0x6,%mm1\n"
+  "packuswb %mm1,%mm1\n"
+  "movd   %mm1,0x0(%ebp)\n"  // Store one output pixel.
+"2:"
+  "popa\n"
+  "ret\n"
+#if !defined(XP_MACOSX)
+  ".previous\n"
+#endif
+);
+
+// Runtime dispatch: use the assembly row converter when the CPU reports SSE
+// support, otherwise the C reference code with an x_shift of 1.
+void FastConvertYUVToRGB32Row(const uint8* y_buf, const uint8* u_buf,
+                              const uint8* v_buf, uint8* rgb_buf,
+                              int width)
+{
+  if (!mozilla::supports_sse()) {
+    FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);
+    return;
+  }
+
+  FastConvertYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width);
+}
+
+
+void ScaleYUVToRGB32Row_SSE(const uint8* y_buf,  // Defined by the file-scope asm below.
+                            const uint8* u_buf,
+                            const uint8* v_buf,
+                            uint8* rgb_buf,
+                            int width,
+                            int source_dx);
+  asm(  // NOTE(review): uses %mm registers and never executes emms; presumably the caller does - confirm.
+  ".text\n"
+  ".global ScaleYUVToRGB32Row_SSE\n"
+  ".type ScaleYUVToRGB32Row_SSE, @function\n"
+"ScaleYUVToRGB32Row_SSE:\n"
+  "pusha\n"  // Saves 8 registers (32 bytes), so the first stack argument is at 0x24(%esp).
+  "mov    0x24(%esp),%edx\n"  // edx = y_buf
+  "mov    0x28(%esp),%edi\n"  // edi = u_buf
+  "mov    0x2c(%esp),%esi\n"  // esi = v_buf
+  "mov    0x30(%esp),%ebp\n"  // ebp = rgb_buf
+  "mov    0x34(%esp),%ecx\n"  // ecx = width
+  "xor    %ebx,%ebx\n"  // ebx = x = 0, the 16.16 fixed-point source position.
+  "jmp    1f\n"
+
+"0:"
+  "mov    %ebx,%eax\n"
+  "sar    $0x11,%eax\n"  // eax = x >> 17: chroma index.
+  "movzbl (%edi,%eax,1),%eax\n"
+  "movq   kCoefficientsRgbY+2048(,%eax,8),%mm0\n"  // U row (byte offset 2048).
+  "mov    %ebx,%eax\n"
+  "sar    $0x11,%eax\n"
+  "movzbl (%esi,%eax,1),%eax\n"
+  "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n"  // mm0 = U + V terms (V rows at byte offset 4096).
+  "mov    %ebx,%eax\n"
+  "add    0x38(%esp),%ebx\n"  // x += source_dx (sixth argument, read from the stack).
+  "sar    $0x10,%eax\n"  // eax = luma index of the first pixel.
+  "movzbl (%edx,%eax,1),%eax\n"
+  "movq   kCoefficientsRgbY(,%eax,8),%mm1\n"
+  "mov    %ebx,%eax\n"
+  "add    0x38(%esp),%ebx\n"  // x += source_dx
+  "sar    $0x10,%eax\n"  // eax = luma index of the second pixel.
+  "movzbl (%edx,%eax,1),%eax\n"
+  "movq   kCoefficientsRgbY(,%eax,8),%mm2\n"
+  "paddsw %mm0,%mm1\n"  // Pixel 0 = Y0 + UV.
+  "paddsw %mm0,%mm2\n"  // Pixel 1 = Y1 + UV.
+  "psraw  $0x6,%mm1\n"  // Drop the 6 fractional bits.
+  "psraw  $0x6,%mm2\n"
+  "packuswb %mm2,%mm1\n"  // Clamp to bytes: pixel 0 in the low half, pixel 1 in the high half.
+  "movntq %mm1,0x0(%ebp)\n"  // Non-temporal 8-byte store of two pixels.
+  "add    $0x8,%ebp\n"
+"1:"
+  "sub    $0x2,%ecx\n"
+  "jns    0b\n"  // Loop while at least two pixels remained before the subtraction.
+
+  "and    $0x1,%ecx\n"  // ecx is -2 or -1 here; bit 0 is set only when one pixel is left.
+  "je     2f\n"
+
+  "mov    %ebx,%eax\n"  // Trailing pixel: same lookup for a single sample.
+  "sar    $0x11,%eax\n"
+  "movzbl (%edi,%eax,1),%eax\n"
+  "movq   kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
+  "mov    %ebx,%eax\n"
+  "sar    $0x11,%eax\n"
+  "movzbl (%esi,%eax,1),%eax\n"
+  "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n"
+  "mov    %ebx,%eax\n"
+  "sar    $0x10,%eax\n"
+  "movzbl (%edx,%eax,1),%eax\n"
+  "movq   kCoefficientsRgbY(,%eax,8),%mm1\n"
+  "paddsw %mm0,%mm1\n"
+  "psraw  $0x6,%mm1\n"
+  "packuswb %mm1,%mm1\n"
+  "movd   %mm1,0x0(%ebp)\n"  // Store one output pixel.
+
+"2:"
+  "popa\n"
+  "ret\n"
+#if !defined(XP_MACOSX)
+  ".previous\n"
+#endif
+);
+
+// Runtime dispatch for point-sampled scaling: the assembly version when the
+// CPU reports SSE support, the C reference code otherwise.
+void ScaleYUVToRGB32Row(const uint8* y_buf, const uint8* u_buf,
+                        const uint8* v_buf, uint8* rgb_buf,
+                        int width, int source_dx)
+{
+  const bool use_sse = mozilla::supports_sse();
+  if (!use_sse) {
+    ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf,
+                         width, source_dx);
+    return;
+  }
+
+  ScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf,
+                         width, source_dx);
+}
+
+void LinearScaleYUVToRGB32Row_SSE(const uint8* y_buf,  // Defined by the file-scope asm below.
+                                  const uint8* u_buf,
+                                  const uint8* v_buf,
+                                  uint8* rgb_buf,
+                                  int width,
+                                  int source_dx);
+  asm(  // NOTE(review): uses %mm registers and never executes emms; presumably the caller does - confirm.
+  ".text\n"
+  ".global LinearScaleYUVToRGB32Row_SSE\n"
+  ".type LinearScaleYUVToRGB32Row_SSE, @function\n"
+"LinearScaleYUVToRGB32Row_SSE:\n"
+  "pusha\n"  // Saves 8 registers (32 bytes), so the first stack argument is at 0x24(%esp).
+  "mov    0x24(%esp),%edx\n"  // edx = y_buf
+  "mov    0x28(%esp),%edi\n"  // edi = u_buf
+  "mov    0x30(%esp),%ebp\n"  // ebp = rgb_buf (v_buf is reloaded from 0x2c(%esp) inside the loop).
+
+  // source_width = width * source_dx + ebx
+  "mov    0x34(%esp), %ecx\n"
+  "imull  0x38(%esp), %ecx\n"  // NOTE(review): ebx is not actually added here, despite the comment above.
+  "mov    %ecx, 0x34(%esp)\n"  // Overwrites the width argument slot with the loop limit.
+
+  "mov    0x38(%esp), %ecx\n"
+  "xor    %ebx,%ebx\n"     // x = 0
+  "cmp    $0x20000,%ecx\n" // if source_dx >= 2.0
+  "jl     1f\n"
+  "mov    $0x8000,%ebx\n"  // x = 0.5 for 1/2 or less
+  "jmp    1f\n"
+
+"0:"
+  "mov    %ebx,%eax\n"
+  "sar    $0x11,%eax\n"  // eax = x >> 17: chroma index.
+
+  "movzbl (%edi,%eax,1),%ecx\n"  // ecx = u[i]
+  "movzbl 1(%edi,%eax,1),%esi\n"  // esi = u[i + 1]
+  "mov    %ebx,%eax\n"
+  "andl   $0x1fffe, %eax \n"  // eax = chroma blend weight taken from x (17-bit scale).
+  "imul   %eax, %esi \n"
+  "xorl   $0x1fffe, %eax \n"  // eax = complementary weight.
+  "imul   %eax, %ecx \n"
+  "addl   %esi, %ecx \n"
+  "shrl   $17, %ecx \n"  // ecx = interpolated u.
+  "movq   kCoefficientsRgbY+2048(,%ecx,8),%mm0\n"  // U row (byte offset 2048).
+
+  "mov    0x2c(%esp),%esi\n"  // esi = v_buf; esi doubles as scratch, so reload each pass.
+  "mov    %ebx,%eax\n"
+  "sar    $0x11,%eax\n"
+
+  "movzbl (%esi,%eax,1),%ecx\n"  // Same interpolation for v.
+  "movzbl 1(%esi,%eax,1),%esi\n"
+  "mov    %ebx,%eax\n"
+  "andl   $0x1fffe, %eax \n"
+  "imul   %eax, %esi \n"
+  "xorl   $0x1fffe, %eax \n"
+  "imul   %eax, %ecx \n"
+  "addl   %esi, %ecx \n"
+  "shrl   $17, %ecx \n"
+  "paddsw kCoefficientsRgbY+4096(,%ecx,8),%mm0\n"  // mm0 = U + V terms (V rows at byte offset 4096).
+
+  "mov    %ebx,%eax\n"
+  "sar    $0x10,%eax\n"  // eax = x >> 16: luma index of the first pixel.
+  "movzbl (%edx,%eax,1),%ecx\n"  // ecx = y[i]
+  "movzbl 1(%edx,%eax,1),%esi\n"  // esi = y[i + 1]
+  "mov    %ebx,%eax\n"
+  "add    0x38(%esp),%ebx\n"  // x += source_dx
+  "andl   $0xffff, %eax \n"  // eax = 16-bit luma blend weight.
+  "imul   %eax, %esi \n"
+  "xorl   $0xffff, %eax \n"
+  "imul   %eax, %ecx \n"
+  "addl   %esi, %ecx \n"
+  "shrl   $16, %ecx \n"  // ecx = interpolated luma of the first pixel.
+  "movq   kCoefficientsRgbY(,%ecx,8),%mm1\n"
+
+  "cmp    0x34(%esp), %ebx\n"
+  "jge    2f\n"  // x reached the limit: only one pixel is left, finish at label 2.
+
+  "mov    %ebx,%eax\n"  // Same luma interpolation for the second pixel.
+  "sar    $0x10,%eax\n"
+  "movzbl (%edx,%eax,1),%ecx\n"
+  "movzbl 1(%edx,%eax,1),%esi\n"
+  "mov    %ebx,%eax\n"
+  "add    0x38(%esp),%ebx\n"  // x += source_dx
+  "andl   $0xffff, %eax \n"
+  "imul   %eax, %esi \n"
+  "xorl   $0xffff, %eax \n"
+  "imul   %eax, %ecx \n"
+  "addl   %esi, %ecx \n"
+  "shrl   $16, %ecx \n"
+  "movq   kCoefficientsRgbY(,%ecx,8),%mm2\n"
+
+  "paddsw %mm0,%mm1\n"  // Pixel 0 = Y0 + UV.
+  "paddsw %mm0,%mm2\n"  // Pixel 1 = Y1 + UV.
+  "psraw  $0x6,%mm1\n"  // Drop the 6 fractional bits.
+  "psraw  $0x6,%mm2\n"
+  "packuswb %mm2,%mm1\n"  // Clamp to bytes: pixel 0 in the low half, pixel 1 in the high half.
+  "movntq %mm1,0x0(%ebp)\n"  // Non-temporal 8-byte store of two pixels.
+  "add    $0x8,%ebp\n"
+
+"1:"
+  "cmp    0x34(%esp), %ebx\n"
+  "jl     0b\n"  // Loop while x < width * source_dx.
+  "popa\n"
+  "ret\n"
+
+"2:"
+  "paddsw %mm0, %mm1\n"  // Single trailing pixel: Y0 + UV.
+  "psraw $6, %mm1\n"
+  "packuswb %mm1, %mm1\n"
+  "movd %mm1, (%ebp)\n"  // Store one output pixel.
+  "popa\n"
+  "ret\n"
+#if !defined(XP_MACOSX)
+  ".previous\n"
+#endif
+);
+
+// Runtime dispatch for interpolated scaling: the assembly version when the
+// CPU reports SSE support, the C reference code otherwise.
+void LinearScaleYUVToRGB32Row(const uint8* y_buf, const uint8* u_buf,
+                              const uint8* v_buf, uint8* rgb_buf,
+                              int width, int source_dx)
+{
+  const bool use_sse = mozilla::supports_sse();
+  if (!use_sse) {
+    LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf,
+                               width, source_dx);
+    return;
+  }
+
+  LinearScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf,
+                               width, source_dx);
+}
+
+#elif defined(MOZILLA_MAY_SUPPORT_SSE) && defined(ARCH_CPU_X86_32) && defined(__PIC__)
+
+void PICConvertYUVToRGB32Row_SSE(const uint8* y_buf,  // Defined by the file-scope asm below.
+                                 const uint8* u_buf,
+                                 const uint8* v_buf,
+                                 uint8* rgb_buf,
+                                 int width,
+                                 const int16 *kCoefficientsRgbY);  // Table address passed explicitly.
+
+  asm(  // NOTE(review): uses %mm registers and never executes emms; presumably the caller does - confirm.
+  ".text\n"
+#if defined(XP_MACOSX)
+"_PICConvertYUVToRGB32Row_SSE:\n"
+#else
+"PICConvertYUVToRGB32Row_SSE:\n"
+#endif
+  "pusha\n"  // Saves 8 registers (32 bytes), so the first stack argument is at 0x24(%esp).
+  "mov    0x24(%esp),%edx\n"  // edx = y_buf
+  "mov    0x28(%esp),%edi\n"  // edi = u_buf
+  "mov    0x2c(%esp),%esi\n"  // esi = v_buf
+  "mov    0x30(%esp),%ebp\n"  // ebp = rgb_buf
+  "mov    0x38(%esp),%ecx\n"  // ecx = coefficient table (sixth argument); width stays on the stack.
+
+  "jmp    1f\n"  // Enter at the loop test so that width < 2 skips the pair loop.
+
+"0:"
+  "movzbl (%edi),%eax\n"  // eax = *u_buf
+  "add    $0x1,%edi\n"
+  "movzbl (%esi),%ebx\n"  // ebx = *v_buf
+  "add    $0x1,%esi\n"
+  "movq   2048(%ecx,%eax,8),%mm0\n"  // U row: 8 bytes per row, U rows start at byte 2048.
+  "movzbl (%edx),%eax\n"  // eax = y[0]
+  "paddsw 4096(%ecx,%ebx,8),%mm0\n"  // mm0 = U + V terms (V rows start at byte 4096).
+  "movzbl 0x1(%edx),%ebx\n"  // ebx = y[1]
+  "movq   0(%ecx,%eax,8),%mm1\n"
+  "add    $0x2,%edx\n"
+  "movq   0(%ecx,%ebx,8),%mm2\n"
+  "paddsw %mm0,%mm1\n"  // Pixel 0 = Y0 + UV.
+  "paddsw %mm0,%mm2\n"  // Pixel 1 = Y1 + UV.
+  "psraw  $0x6,%mm1\n"  // Drop the 6 fractional bits.
+  "psraw  $0x6,%mm2\n"
+  "packuswb %mm2,%mm1\n"  // Clamp to bytes: pixel 0 in the low half, pixel 1 in the high half.
+  "movntq %mm1,0x0(%ebp)\n"  // Non-temporal 8-byte store of two pixels.
+  "add    $0x8,%ebp\n"
+"1:"
+  "subl   $0x2,0x34(%esp)\n"  // width -= 2, updated in its argument slot on the stack.
+  "jns    0b\n"
+
+  "andl   $0x1,0x34(%esp)\n"  // width is -2 or -1 here; bit 0 is set only when one pixel is left.
+  "je     2f\n"
+
+  "movzbl (%edi),%eax\n"  // Trailing pixel: same computation for a single sample.
+  "movq   2048(%ecx,%eax,8),%mm0\n"
+  "movzbl (%esi),%eax\n"
+  "paddsw 4096(%ecx,%eax,8),%mm0\n"
+  "movzbl (%edx),%eax\n"
+  "movq   0(%ecx,%eax,8),%mm1\n"
+  "paddsw %mm0,%mm1\n"
+  "psraw  $0x6,%mm1\n"
+  "packuswb %mm1,%mm1\n"
+  "movd   %mm1,0x0(%ebp)\n"  // Store one output pixel.
+"2:"
+  "popa\n"
+  "ret\n"
+#if !defined(XP_MACOSX)
+  ".previous\n"
+#endif
+);
+
+// Runtime dispatch for position-independent builds: the assembly routine is
+// handed the coefficient table's address as an explicit extra argument.
+void FastConvertYUVToRGB32Row(const uint8* y_buf, const uint8* u_buf,
+                              const uint8* v_buf, uint8* rgb_buf,
+                              int width)
+{
+  if (!mozilla::supports_sse()) {
+    FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);
+    return;
+  }
+
+  PICConvertYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width,
+                              &kCoefficientsRgbY[0][0]);
+}
+
+void PICScaleYUVToRGB32Row_SSE(const uint8* y_buf,  // Defined by the file-scope asm below.
+                               const uint8* u_buf,
+                               const uint8* v_buf,
+                               uint8* rgb_buf,
+                               int width,
+                               int source_dx,
+                               const int16 *kCoefficientsRgbY);  // Table address passed explicitly.
+
+  asm(  // NOTE(review): uses %mm registers and never executes emms; presumably the caller does - confirm.
+  ".text\n"
+#if defined(XP_MACOSX)
+"_PICScaleYUVToRGB32Row_SSE:\n"
+#else
+"PICScaleYUVToRGB32Row_SSE:\n"
+#endif
+  "pusha\n"  // Saves 8 registers (32 bytes), so the first stack argument is at 0x24(%esp).
+  "mov    0x24(%esp),%edx\n"  // edx = y_buf
+  "mov    0x28(%esp),%edi\n"  // edi = u_buf
+  "mov    0x2c(%esp),%esi\n"  // esi = v_buf
+  "mov    0x30(%esp),%ebp\n"  // ebp = rgb_buf
+  "mov    0x3c(%esp),%ecx\n"  // ecx = coefficient table (seventh argument).
+  "xor    %ebx,%ebx\n"  // ebx = x = 0, the 16.16 fixed-point source position.
+  "jmp    1f\n"
+
+"0:"
+  "mov    %ebx,%eax\n"
+  "sar    $0x11,%eax\n"  // eax = x >> 17: chroma index.
+  "movzbl (%edi,%eax,1),%eax\n"
+  "movq   2048(%ecx,%eax,8),%mm0\n"  // U row (byte offset 2048).
+  "mov    %ebx,%eax\n"
+  "sar    $0x11,%eax\n"
+  "movzbl (%esi,%eax,1),%eax\n"
+  "paddsw 4096(%ecx,%eax,8),%mm0\n"  // mm0 = U + V terms (V rows at byte offset 4096).
+  "mov    %ebx,%eax\n"
+  "add    0x38(%esp),%ebx\n"  // x += source_dx (sixth argument, read from the stack).
+  "sar    $0x10,%eax\n"  // eax = luma index of the first pixel.
+  "movzbl (%edx,%eax,1),%eax\n"
+  "movq   0(%ecx,%eax,8),%mm1\n"
+  "mov    %ebx,%eax\n"
+  "add    0x38(%esp),%ebx\n"  // x += source_dx
+  "sar    $0x10,%eax\n"  // eax = luma index of the second pixel.
+  "movzbl (%edx,%eax,1),%eax\n"
+  "movq   0(%ecx,%eax,8),%mm2\n"
+  "paddsw %mm0,%mm1\n"  // Pixel 0 = Y0 + UV.
+  "paddsw %mm0,%mm2\n"  // Pixel 1 = Y1 + UV.
+  "psraw  $0x6,%mm1\n"  // Drop the 6 fractional bits.
+  "psraw  $0x6,%mm2\n"
+  "packuswb %mm2,%mm1\n"  // Clamp to bytes: pixel 0 in the low half, pixel 1 in the high half.
+  "movntq %mm1,0x0(%ebp)\n"  // Non-temporal 8-byte store of two pixels.
+  "add    $0x8,%ebp\n"
+"1:"
+  "subl   $0x2,0x34(%esp)\n"  // width -= 2, updated in its argument slot on the stack.
+  "jns    0b\n"
+
+  "andl   $0x1,0x34(%esp)\n"  // width is -2 or -1 here; bit 0 is set only when one pixel is left.
+  "je     2f\n"
+
+  "mov    %ebx,%eax\n"  // Trailing pixel: same lookup for a single sample.
+  "sar    $0x11,%eax\n"
+  "movzbl (%edi,%eax,1),%eax\n"
+  "movq   2048(%ecx,%eax,8),%mm0\n"
+  "mov    %ebx,%eax\n"
+  "sar    $0x11,%eax\n"
+  "movzbl (%esi,%eax,1),%eax\n"
+  "paddsw 4096(%ecx,%eax,8),%mm0\n"
+  "mov    %ebx,%eax\n"
+  "sar    $0x10,%eax\n"
+  "movzbl (%edx,%eax,1),%eax\n"
+  "movq   0(%ecx,%eax,8),%mm1\n"
+  "paddsw %mm0,%mm1\n"
+  "psraw  $0x6,%mm1\n"
+  "packuswb %mm1,%mm1\n"
+  "movd   %mm1,0x0(%ebp)\n"  // Store one output pixel.
+
+"2:"
+  "popa\n"
+  "ret\n"
+#if !defined(XP_MACOSX)
+  ".previous\n"
+#endif
+);
+
+// Runtime dispatch for position-independent builds: the assembly routine is
+// handed the coefficient table's address as an explicit extra argument.
+void ScaleYUVToRGB32Row(const uint8* y_buf, const uint8* u_buf,
+                        const uint8* v_buf, uint8* rgb_buf,
+                        int width, int source_dx)
+{
+  if (!mozilla::supports_sse()) {
+    ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
+    return;
+  }
+
+  const int16* table = &kCoefficientsRgbY[0][0];
+  PICScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width, source_dx,
+                            table);
+}
+
+void PICLinearScaleYUVToRGB32Row_SSE(const uint8* y_buf,  // Defined by the file-scope asm below.
+                                     const uint8* u_buf,
+                                     const uint8* v_buf,
+                                     uint8* rgb_buf,
+                                     int width,
+                                     int source_dx,
+                                     const int16 *kCoefficientsRgbY);  // Table address passed explicitly.
+
+  asm(  // NOTE(review): uses %mm registers and never executes emms; presumably the caller does - confirm.
+  ".text\n"
+#if defined(XP_MACOSX)
+"_PICLinearScaleYUVToRGB32Row_SSE:\n"
+#else
+"PICLinearScaleYUVToRGB32Row_SSE:\n"
+#endif
+  "pusha\n"  // Saves 8 registers (32 bytes), so the first stack argument is at 0x24(%esp).
+  "mov    0x24(%esp),%edx\n"  // edx = y_buf
+  "mov    0x30(%esp),%ebp\n"  // ebp = rgb_buf
+  "mov    0x34(%esp),%ecx\n"  // Redundant: ecx is loaded again below before it is used.
+  "mov    0x3c(%esp),%edi\n"  // edi = coefficient table (seventh argument).
+  "xor    %ebx,%ebx\n"  // Redundant: ebx is cleared again below.
+
+  // source_width = width * source_dx + ebx
+  "mov    0x34(%esp), %ecx\n"
+  "imull  0x38(%esp), %ecx\n"  // NOTE(review): ebx is not actually added here, despite the comment above.
+  "mov    %ecx, 0x34(%esp)\n"  // Overwrites the width argument slot with the loop limit.
+
+  "mov    0x38(%esp), %ecx\n"
+  "xor    %ebx,%ebx\n"     // x = 0
+  "cmp    $0x20000,%ecx\n" // if source_dx >= 2.0
+  "jl     1f\n"
+  "mov    $0x8000,%ebx\n"  // x = 0.5 for 1/2 or less
+  "jmp    1f\n"
+
+"0:"
+  "mov    0x28(%esp),%esi\n"  // esi = u_buf; esi doubles as scratch, so reload each pass.
+  "mov    %ebx,%eax\n"
+  "sar    $0x11,%eax\n"  // eax = x >> 17: chroma index.
+
+  "movzbl (%esi,%eax,1),%ecx\n"  // ecx = u[i]
+  "movzbl 1(%esi,%eax,1),%esi\n"  // esi = u[i + 1]
+  "mov    %ebx,%eax\n"
+  "andl   $0x1fffe, %eax \n"  // eax = chroma blend weight taken from x (17-bit scale).
+  "imul   %eax, %esi \n"
+  "xorl   $0x1fffe, %eax \n"  // eax = complementary weight.
+  "imul   %eax, %ecx \n"
+  "addl   %esi, %ecx \n"
+  "shrl   $17, %ecx \n"  // ecx = interpolated u.
+  "movq   2048(%edi,%ecx,8),%mm0\n"  // U row (byte offset 2048).
+
+  "mov    0x2c(%esp),%esi\n"  // esi = v_buf, reloaded for the same reason.
+  "mov    %ebx,%eax\n"
+  "sar    $0x11,%eax\n"
+
+  "movzbl (%esi,%eax,1),%ecx\n"  // Same interpolation for v.
+  "movzbl 1(%esi,%eax,1),%esi\n"
+  "mov    %ebx,%eax\n"
+  "andl   $0x1fffe, %eax \n"
+  "imul   %eax, %esi \n"
+  "xorl   $0x1fffe, %eax \n"
+  "imul   %eax, %ecx \n"
+  "addl   %esi, %ecx \n"
+  "shrl   $17, %ecx \n"
+  "paddsw 4096(%edi,%ecx,8),%mm0\n"  // mm0 = U + V terms (V rows at byte offset 4096).
+
+  "mov    %ebx,%eax\n"
+  "sar    $0x10,%eax\n"  // eax = x >> 16: luma index of the first pixel.
+  "movzbl (%edx,%eax,1),%ecx\n"  // ecx = y[i]
+  "movzbl 1(%edx,%eax,1),%esi\n"  // esi = y[i + 1]
+  "mov    %ebx,%eax\n"
+  "add    0x38(%esp),%ebx\n"  // x += source_dx
+  "andl   $0xffff, %eax \n"  // eax = 16-bit luma blend weight.
+  "imul   %eax, %esi \n"
+  "xorl   $0xffff, %eax \n"
+  "imul   %eax, %ecx \n"
+  "addl   %esi, %ecx \n"
+  "shrl   $16, %ecx \n"  // ecx = interpolated luma of the first pixel.
+  "movq   (%edi,%ecx,8),%mm1\n"
+
+  "cmp    0x34(%esp), %ebx\n"
+  "jge    2f\n"  // x reached the limit: only one pixel is left, finish at label 2.
+
+  "mov    %ebx,%eax\n"  // Same luma interpolation for the second pixel.
+  "sar    $0x10,%eax\n"
+  "movzbl (%edx,%eax,1),%ecx\n"
+  "movzbl 1(%edx,%eax,1),%esi\n"
+  "mov    %ebx,%eax\n"
+  "add    0x38(%esp),%ebx\n"  // x += source_dx
+  "andl   $0xffff, %eax \n"
+  "imul   %eax, %esi \n"
+  "xorl   $0xffff, %eax \n"
+  "imul   %eax, %ecx \n"
+  "addl   %esi, %ecx \n"
+  "shrl   $16, %ecx \n"
+  "movq   (%edi,%ecx,8),%mm2\n"
+
+  "paddsw %mm0,%mm1\n"  // Pixel 0 = Y0 + UV.
+  "paddsw %mm0,%mm2\n"  // Pixel 1 = Y1 + UV.
+  "psraw  $0x6,%mm1\n"  // Drop the 6 fractional bits.
+  "psraw  $0x6,%mm2\n"
+  "packuswb %mm2,%mm1\n"  // Clamp to bytes: pixel 0 in the low half, pixel 1 in the high half.
+  "movntq %mm1,0x0(%ebp)\n"  // Non-temporal 8-byte store of two pixels.
+  "add    $0x8,%ebp\n"
+
+"1:"
+  "cmp    %ebx, 0x34(%esp)\n"
+  "jg     0b\n"  // Loop while width * source_dx > x.
+  "popa\n"
+  "ret\n"
+
+"2:"
+  "paddsw %mm0, %mm1\n"  // Single trailing pixel: Y0 + UV.
+  "psraw $6, %mm1\n"
+  "packuswb %mm1, %mm1\n"
+  "movd %mm1, (%ebp)\n"  // Store one output pixel.
+  "popa\n"
+  "ret\n"
+#if !defined(XP_MACOSX)
+  ".previous\n"
+#endif
+);
+
+
+// Runtime dispatch for position-independent builds: the assembly routine is
+// handed the coefficient table's address as an explicit extra argument.
+void LinearScaleYUVToRGB32Row(const uint8* y_buf, const uint8* u_buf,
+                              const uint8* v_buf, uint8* rgb_buf,
+                              int width, int source_dx)
+{
+  if (!mozilla::supports_sse()) {
+    LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
+    return;
+  }
+
+  const int16* table = &kCoefficientsRgbY[0][0];
+  PICLinearScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width,
+                                  source_dx, table);
+}
+#else
+// Fallback when no assembly branch above applies: C code with x_shift 1.
+void FastConvertYUVToRGB32Row(const uint8* y_buf, const uint8* u_buf,
+                              const uint8* v_buf, uint8* rgb_buf,
+                              int width) {
+  const unsigned int x_shift = 1;
+  FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, x_shift);
+}
+
+// Fallback when no assembly branch above applies: point-sampled scaling is
+// forwarded unchanged to the C reference implementation.
+void ScaleYUVToRGB32Row(const uint8* y_buf, const uint8* u_buf,
+                        const uint8* v_buf, uint8* rgb_buf,
+                        int width, int source_dx) {
+  ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf,
+                       width, source_dx);
+}
+
+// Fallback when no assembly branch above applies: interpolated scaling is
+// forwarded unchanged to the C reference implementation.
+void LinearScaleYUVToRGB32Row(const uint8* y_buf, const uint8* u_buf,
+                              const uint8* v_buf, uint8* rgb_buf,
+                              int width, int source_dx) {
+  LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf,
+                             width, source_dx);
+}
+#endif
+
+}
diff --git a/gfx/ycbcr/yuv_row_table.cpp b/gfx/ycbcr/yuv_row_table.cpp
new file mode 100644
index 0000000000..c531b60c21
--- /dev/null
+++ b/gfx/ycbcr/yuv_row_table.cpp
@@ -0,0 +1,233 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "yuv_row.h"
+
+extern "C" {
+
+#define RGBY(i) { /* Luma row for Y value i: the same term, scaled by 64, in the first three lanes. */ \
+  static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
+  static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
+  static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
+  0 /* fourth lane: no luma contribution */ \
+}
+
+#define RGBU(i) { /* Chroma U row for value i: centered on 128, scaled by 64. */ \
+  static_cast<int16>(2.018 * 64 * (i - 128) + 0.5), \
+  static_cast<int16>(-0.391 * 64 * (i - 128) + 0.5), \
+  0, \
+  static_cast<int16>(256 * 64 - 1) /* 16383: becomes 255 after the row converters shift right by 6; presumably the alpha byte (confirm) */ \
+}
+
+#define RGBV(i) { /* Chroma V row for value i: centered on 128, scaled by 64. */ \
+  0, \
+  static_cast<int16>(-0.813 * 64 * (i - 128) + 0.5), \
+  static_cast<int16>(1.596 * 64 * (i - 128) + 0.5), \
+  0 /* fourth lane: no V contribution */ \
+}
+
+SIMD_ALIGNED(const int16 kCoefficientsRgbY[256 * 3][4]) = {  // 768 rows of four int16: 256 Y rows, then 256 U rows, then 256 V rows.
+  RGBY(0x00), RGBY(0x01), RGBY(0x02), RGBY(0x03),  // Luma Y table: rows 0..255.
+  RGBY(0x04), RGBY(0x05), RGBY(0x06), RGBY(0x07),
+  RGBY(0x08), RGBY(0x09), RGBY(0x0A), RGBY(0x0B),
+  RGBY(0x0C), RGBY(0x0D), RGBY(0x0E), RGBY(0x0F),
+  RGBY(0x10), RGBY(0x11), RGBY(0x12), RGBY(0x13),
+  RGBY(0x14), RGBY(0x15), RGBY(0x16), RGBY(0x17),
+  RGBY(0x18), RGBY(0x19), RGBY(0x1A), RGBY(0x1B),
+  RGBY(0x1C), RGBY(0x1D), RGBY(0x1E), RGBY(0x1F),
+  RGBY(0x20), RGBY(0x21), RGBY(0x22), RGBY(0x23),
+  RGBY(0x24), RGBY(0x25), RGBY(0x26), RGBY(0x27),
+  RGBY(0x28), RGBY(0x29), RGBY(0x2A), RGBY(0x2B),
+  RGBY(0x2C), RGBY(0x2D), RGBY(0x2E), RGBY(0x2F),
+  RGBY(0x30), RGBY(0x31), RGBY(0x32), RGBY(0x33),
+  RGBY(0x34), RGBY(0x35), RGBY(0x36), RGBY(0x37),
+  RGBY(0x38), RGBY(0x39), RGBY(0x3A), RGBY(0x3B),
+  RGBY(0x3C), RGBY(0x3D), RGBY(0x3E), RGBY(0x3F),
+  RGBY(0x40), RGBY(0x41), RGBY(0x42), RGBY(0x43),
+  RGBY(0x44), RGBY(0x45), RGBY(0x46), RGBY(0x47),
+  RGBY(0x48), RGBY(0x49), RGBY(0x4A), RGBY(0x4B),
+  RGBY(0x4C), RGBY(0x4D), RGBY(0x4E), RGBY(0x4F),
+  RGBY(0x50), RGBY(0x51), RGBY(0x52), RGBY(0x53),
+  RGBY(0x54), RGBY(0x55), RGBY(0x56), RGBY(0x57),
+  RGBY(0x58), RGBY(0x59), RGBY(0x5A), RGBY(0x5B),
+  RGBY(0x5C), RGBY(0x5D), RGBY(0x5E), RGBY(0x5F),
+  RGBY(0x60), RGBY(0x61), RGBY(0x62), RGBY(0x63),
+  RGBY(0x64), RGBY(0x65), RGBY(0x66), RGBY(0x67),
+  RGBY(0x68), RGBY(0x69), RGBY(0x6A), RGBY(0x6B),
+  RGBY(0x6C), RGBY(0x6D), RGBY(0x6E), RGBY(0x6F),
+  RGBY(0x70), RGBY(0x71), RGBY(0x72), RGBY(0x73),
+  RGBY(0x74), RGBY(0x75), RGBY(0x76), RGBY(0x77),
+  RGBY(0x78), RGBY(0x79), RGBY(0x7A), RGBY(0x7B),
+  RGBY(0x7C), RGBY(0x7D), RGBY(0x7E), RGBY(0x7F),
+  RGBY(0x80), RGBY(0x81), RGBY(0x82), RGBY(0x83),
+  RGBY(0x84), RGBY(0x85), RGBY(0x86), RGBY(0x87),
+  RGBY(0x88), RGBY(0x89), RGBY(0x8A), RGBY(0x8B),
+  RGBY(0x8C), RGBY(0x8D), RGBY(0x8E), RGBY(0x8F),
+  RGBY(0x90), RGBY(0x91), RGBY(0x92), RGBY(0x93),
+  RGBY(0x94), RGBY(0x95), RGBY(0x96), RGBY(0x97),
+  RGBY(0x98), RGBY(0x99), RGBY(0x9A), RGBY(0x9B),
+  RGBY(0x9C), RGBY(0x9D), RGBY(0x9E), RGBY(0x9F),
+  RGBY(0xA0), RGBY(0xA1), RGBY(0xA2), RGBY(0xA3),
+  RGBY(0xA4), RGBY(0xA5), RGBY(0xA6), RGBY(0xA7),
+  RGBY(0xA8), RGBY(0xA9), RGBY(0xAA), RGBY(0xAB),
+  RGBY(0xAC), RGBY(0xAD), RGBY(0xAE), RGBY(0xAF),
+  RGBY(0xB0), RGBY(0xB1), RGBY(0xB2), RGBY(0xB3),
+  RGBY(0xB4), RGBY(0xB5), RGBY(0xB6), RGBY(0xB7),
+  RGBY(0xB8), RGBY(0xB9), RGBY(0xBA), RGBY(0xBB),
+  RGBY(0xBC), RGBY(0xBD), RGBY(0xBE), RGBY(0xBF),
+  RGBY(0xC0), RGBY(0xC1), RGBY(0xC2), RGBY(0xC3),
+  RGBY(0xC4), RGBY(0xC5), RGBY(0xC6), RGBY(0xC7),
+  RGBY(0xC8), RGBY(0xC9), RGBY(0xCA), RGBY(0xCB),
+  RGBY(0xCC), RGBY(0xCD), RGBY(0xCE), RGBY(0xCF),
+  RGBY(0xD0), RGBY(0xD1), RGBY(0xD2), RGBY(0xD3),
+  RGBY(0xD4), RGBY(0xD5), RGBY(0xD6), RGBY(0xD7),
+  RGBY(0xD8), RGBY(0xD9), RGBY(0xDA), RGBY(0xDB),
+  RGBY(0xDC), RGBY(0xDD), RGBY(0xDE), RGBY(0xDF),
+  RGBY(0xE0), RGBY(0xE1), RGBY(0xE2), RGBY(0xE3),
+  RGBY(0xE4), RGBY(0xE5), RGBY(0xE6), RGBY(0xE7),
+  RGBY(0xE8), RGBY(0xE9), RGBY(0xEA), RGBY(0xEB),
+  RGBY(0xEC), RGBY(0xED), RGBY(0xEE), RGBY(0xEF),
+  RGBY(0xF0), RGBY(0xF1), RGBY(0xF2), RGBY(0xF3),
+  RGBY(0xF4), RGBY(0xF5), RGBY(0xF6), RGBY(0xF7),
+  RGBY(0xF8), RGBY(0xF9), RGBY(0xFA), RGBY(0xFB),
+  RGBY(0xFC), RGBY(0xFD), RGBY(0xFE), RGBY(0xFF),
+
+  // Chroma U table: rows 256..511 (byte offset 2048 from the table start).
+  RGBU(0x00), RGBU(0x01), RGBU(0x02), RGBU(0x03),
+  RGBU(0x04), RGBU(0x05), RGBU(0x06), RGBU(0x07),
+  RGBU(0x08), RGBU(0x09), RGBU(0x0A), RGBU(0x0B),
+  RGBU(0x0C), RGBU(0x0D), RGBU(0x0E), RGBU(0x0F),
+  RGBU(0x10), RGBU(0x11), RGBU(0x12), RGBU(0x13),
+  RGBU(0x14), RGBU(0x15), RGBU(0x16), RGBU(0x17),
+  RGBU(0x18), RGBU(0x19), RGBU(0x1A), RGBU(0x1B),
+  RGBU(0x1C), RGBU(0x1D), RGBU(0x1E), RGBU(0x1F),
+  RGBU(0x20), RGBU(0x21), RGBU(0x22), RGBU(0x23),
+  RGBU(0x24), RGBU(0x25), RGBU(0x26), RGBU(0x27),
+  RGBU(0x28), RGBU(0x29), RGBU(0x2A), RGBU(0x2B),
+  RGBU(0x2C), RGBU(0x2D), RGBU(0x2E), RGBU(0x2F),
+  RGBU(0x30), RGBU(0x31), RGBU(0x32), RGBU(0x33),
+  RGBU(0x34), RGBU(0x35), RGBU(0x36), RGBU(0x37),
+  RGBU(0x38), RGBU(0x39), RGBU(0x3A), RGBU(0x3B),
+  RGBU(0x3C), RGBU(0x3D), RGBU(0x3E), RGBU(0x3F),
+  RGBU(0x40), RGBU(0x41), RGBU(0x42), RGBU(0x43),
+  RGBU(0x44), RGBU(0x45), RGBU(0x46), RGBU(0x47),
+  RGBU(0x48), RGBU(0x49), RGBU(0x4A), RGBU(0x4B),
+  RGBU(0x4C), RGBU(0x4D), RGBU(0x4E), RGBU(0x4F),
+  RGBU(0x50), RGBU(0x51), RGBU(0x52), RGBU(0x53),
+  RGBU(0x54), RGBU(0x55), RGBU(0x56), RGBU(0x57),
+  RGBU(0x58), RGBU(0x59), RGBU(0x5A), RGBU(0x5B),
+  RGBU(0x5C), RGBU(0x5D), RGBU(0x5E), RGBU(0x5F),
+  RGBU(0x60), RGBU(0x61), RGBU(0x62), RGBU(0x63),
+  RGBU(0x64), RGBU(0x65), RGBU(0x66), RGBU(0x67),
+  RGBU(0x68), RGBU(0x69), RGBU(0x6A), RGBU(0x6B),
+  RGBU(0x6C), RGBU(0x6D), RGBU(0x6E), RGBU(0x6F),
+  RGBU(0x70), RGBU(0x71), RGBU(0x72), RGBU(0x73),
+  RGBU(0x74), RGBU(0x75), RGBU(0x76), RGBU(0x77),
+  RGBU(0x78), RGBU(0x79), RGBU(0x7A), RGBU(0x7B),
+  RGBU(0x7C), RGBU(0x7D), RGBU(0x7E), RGBU(0x7F),
+  RGBU(0x80), RGBU(0x81), RGBU(0x82), RGBU(0x83),
+  RGBU(0x84), RGBU(0x85), RGBU(0x86), RGBU(0x87),
+  RGBU(0x88), RGBU(0x89), RGBU(0x8A), RGBU(0x8B),
+  RGBU(0x8C), RGBU(0x8D), RGBU(0x8E), RGBU(0x8F),
+  RGBU(0x90), RGBU(0x91), RGBU(0x92), RGBU(0x93),
+  RGBU(0x94), RGBU(0x95), RGBU(0x96), RGBU(0x97),
+  RGBU(0x98), RGBU(0x99), RGBU(0x9A), RGBU(0x9B),
+  RGBU(0x9C), RGBU(0x9D), RGBU(0x9E), RGBU(0x9F),
+  RGBU(0xA0), RGBU(0xA1), RGBU(0xA2), RGBU(0xA3),
+  RGBU(0xA4), RGBU(0xA5), RGBU(0xA6), RGBU(0xA7),
+  RGBU(0xA8), RGBU(0xA9), RGBU(0xAA), RGBU(0xAB),
+  RGBU(0xAC), RGBU(0xAD), RGBU(0xAE), RGBU(0xAF),
+  RGBU(0xB0), RGBU(0xB1), RGBU(0xB2), RGBU(0xB3),
+  RGBU(0xB4), RGBU(0xB5), RGBU(0xB6), RGBU(0xB7),
+  RGBU(0xB8), RGBU(0xB9), RGBU(0xBA), RGBU(0xBB),
+  RGBU(0xBC), RGBU(0xBD), RGBU(0xBE), RGBU(0xBF),
+  RGBU(0xC0), RGBU(0xC1), RGBU(0xC2), RGBU(0xC3),
+  RGBU(0xC4), RGBU(0xC5), RGBU(0xC6), RGBU(0xC7),
+  RGBU(0xC8), RGBU(0xC9), RGBU(0xCA), RGBU(0xCB),
+  RGBU(0xCC), RGBU(0xCD), RGBU(0xCE), RGBU(0xCF),
+  RGBU(0xD0), RGBU(0xD1), RGBU(0xD2), RGBU(0xD3),
+  RGBU(0xD4), RGBU(0xD5), RGBU(0xD6), RGBU(0xD7),
+  RGBU(0xD8), RGBU(0xD9), RGBU(0xDA), RGBU(0xDB),
+  RGBU(0xDC), RGBU(0xDD), RGBU(0xDE), RGBU(0xDF),
+  RGBU(0xE0), RGBU(0xE1), RGBU(0xE2), RGBU(0xE3),
+  RGBU(0xE4), RGBU(0xE5), RGBU(0xE6), RGBU(0xE7),
+  RGBU(0xE8), RGBU(0xE9), RGBU(0xEA), RGBU(0xEB),
+  RGBU(0xEC), RGBU(0xED), RGBU(0xEE), RGBU(0xEF),
+  RGBU(0xF0), RGBU(0xF1), RGBU(0xF2), RGBU(0xF3),
+  RGBU(0xF4), RGBU(0xF5), RGBU(0xF6), RGBU(0xF7),
+  RGBU(0xF8), RGBU(0xF9), RGBU(0xFA), RGBU(0xFB),
+  RGBU(0xFC), RGBU(0xFD), RGBU(0xFE), RGBU(0xFF),
+
+  // Chroma V table: rows 512..767 (byte offset 4096 from the table start).
+  RGBV(0x00), RGBV(0x01), RGBV(0x02), RGBV(0x03),
+  RGBV(0x04), RGBV(0x05), RGBV(0x06), RGBV(0x07),
+  RGBV(0x08), RGBV(0x09), RGBV(0x0A), RGBV(0x0B),
+  RGBV(0x0C), RGBV(0x0D), RGBV(0x0E), RGBV(0x0F),
+  RGBV(0x10), RGBV(0x11), RGBV(0x12), RGBV(0x13),
+  RGBV(0x14), RGBV(0x15), RGBV(0x16), RGBV(0x17),
+  RGBV(0x18), RGBV(0x19), RGBV(0x1A), RGBV(0x1B),
+  RGBV(0x1C), RGBV(0x1D), RGBV(0x1E), RGBV(0x1F),
+  RGBV(0x20), RGBV(0x21), RGBV(0x22), RGBV(0x23),
+  RGBV(0x24), RGBV(0x25), RGBV(0x26), RGBV(0x27),
+  RGBV(0x28), RGBV(0x29), RGBV(0x2A), RGBV(0x2B),
+  RGBV(0x2C), RGBV(0x2D), RGBV(0x2E), RGBV(0x2F),
+  RGBV(0x30), RGBV(0x31), RGBV(0x32), RGBV(0x33),
+  RGBV(0x34), RGBV(0x35), RGBV(0x36), RGBV(0x37),
+  RGBV(0x38), RGBV(0x39), RGBV(0x3A), RGBV(0x3B),
+  RGBV(0x3C), RGBV(0x3D), RGBV(0x3E), RGBV(0x3F),
+  RGBV(0x40), RGBV(0x41), RGBV(0x42), RGBV(0x43),
+  RGBV(0x44), RGBV(0x45), RGBV(0x46), RGBV(0x47),
+  RGBV(0x48), RGBV(0x49), RGBV(0x4A), RGBV(0x4B),
+  RGBV(0x4C), RGBV(0x4D), RGBV(0x4E), RGBV(0x4F),
+  RGBV(0x50), RGBV(0x51), RGBV(0x52), RGBV(0x53),
+  RGBV(0x54), RGBV(0x55), RGBV(0x56), RGBV(0x57),
+  RGBV(0x58), RGBV(0x59), RGBV(0x5A), RGBV(0x5B),
+  RGBV(0x5C), RGBV(0x5D), RGBV(0x5E), RGBV(0x5F),
+  RGBV(0x60), RGBV(0x61), RGBV(0x62), RGBV(0x63),
+  RGBV(0x64), RGBV(0x65), RGBV(0x66), RGBV(0x67),
+  RGBV(0x68), RGBV(0x69), RGBV(0x6A), RGBV(0x6B),
+  RGBV(0x6C), RGBV(0x6D), RGBV(0x6E), RGBV(0x6F),
+  RGBV(0x70), RGBV(0x71), RGBV(0x72), RGBV(0x73),
+  RGBV(0x74), RGBV(0x75), RGBV(0x76), RGBV(0x77),
+  RGBV(0x78), RGBV(0x79), RGBV(0x7A), RGBV(0x7B),
+  RGBV(0x7C), RGBV(0x7D), RGBV(0x7E), RGBV(0x7F),
+  RGBV(0x80), RGBV(0x81), RGBV(0x82), RGBV(0x83),
+  RGBV(0x84), RGBV(0x85), RGBV(0x86), RGBV(0x87),
+  RGBV(0x88), RGBV(0x89), RGBV(0x8A), RGBV(0x8B),
+  RGBV(0x8C), RGBV(0x8D), RGBV(0x8E), RGBV(0x8F),
+  RGBV(0x90), RGBV(0x91), RGBV(0x92), RGBV(0x93),
+  RGBV(0x94), RGBV(0x95), RGBV(0x96), RGBV(0x97),
+  RGBV(0x98), RGBV(0x99), RGBV(0x9A), RGBV(0x9B),
+  RGBV(0x9C), RGBV(0x9D), RGBV(0x9E), RGBV(0x9F),
+  RGBV(0xA0), RGBV(0xA1), RGBV(0xA2), RGBV(0xA3),
+  RGBV(0xA4), RGBV(0xA5), RGBV(0xA6), RGBV(0xA7),
+  RGBV(0xA8), RGBV(0xA9), RGBV(0xAA), RGBV(0xAB),
+  RGBV(0xAC), RGBV(0xAD), RGBV(0xAE), RGBV(0xAF),
+  RGBV(0xB0), RGBV(0xB1), RGBV(0xB2), RGBV(0xB3),
+  RGBV(0xB4), RGBV(0xB5), RGBV(0xB6), RGBV(0xB7),
+  RGBV(0xB8), RGBV(0xB9), RGBV(0xBA), RGBV(0xBB),
+  RGBV(0xBC), RGBV(0xBD), RGBV(0xBE), RGBV(0xBF),
+  RGBV(0xC0), RGBV(0xC1), RGBV(0xC2), RGBV(0xC3),
+  RGBV(0xC4), RGBV(0xC5), RGBV(0xC6), RGBV(0xC7),
+  RGBV(0xC8), RGBV(0xC9), RGBV(0xCA), RGBV(0xCB),
+  RGBV(0xCC), RGBV(0xCD), RGBV(0xCE), RGBV(0xCF),
+  RGBV(0xD0), RGBV(0xD1), RGBV(0xD2), RGBV(0xD3),
+  RGBV(0xD4), RGBV(0xD5), RGBV(0xD6), RGBV(0xD7),
+  RGBV(0xD8), RGBV(0xD9), RGBV(0xDA), RGBV(0xDB),
+  RGBV(0xDC), RGBV(0xDD), RGBV(0xDE), RGBV(0xDF),
+  RGBV(0xE0), RGBV(0xE1), RGBV(0xE2), RGBV(0xE3),
+  RGBV(0xE4), RGBV(0xE5), RGBV(0xE6), RGBV(0xE7),
+  RGBV(0xE8), RGBV(0xE9), RGBV(0xEA), RGBV(0xEB),
+  RGBV(0xEC), RGBV(0xED), RGBV(0xEE), RGBV(0xEF),
+  RGBV(0xF0), RGBV(0xF1), RGBV(0xF2), RGBV(0xF3),
+  RGBV(0xF4), RGBV(0xF5), RGBV(0xF6), RGBV(0xF7),
+  RGBV(0xF8), RGBV(0xF9), RGBV(0xFA), RGBV(0xFB),
+  RGBV(0xFC), RGBV(0xFD), RGBV(0xFE), RGBV(0xFF),
+};
+
+#undef RGBY
+#undef RGBU
+#undef RGBV
+
+}  // extern "C"
diff --git a/gfx/ycbcr/yuv_row_win.cpp b/gfx/ycbcr/yuv_row_win.cpp
new file mode 100644
index 0000000000..3c77a8ee5b
--- /dev/null
+++ b/gfx/ycbcr/yuv_row_win.cpp
@@ -0,0 +1,506 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "yuv_row.h"
+#include "mozilla/SSE.h"
+
+#define kCoefficientsRgbU kCoefficientsRgbY + 2048  // used as an asm address below: byte offset of table row 256 (256 rows * 8 bytes)
+#define kCoefficientsRgbV kCoefficientsRgbY + 4096  // used as an asm address below: byte offset of table row 512
+
+extern "C" {
+
+#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
+#if defined(__clang__)
+// clang-cl has a bug where it doesn't mangle names in inline asm,
+// so the mangling is done in the preprocessor instead: the macro below adds
+// the leading underscore (a dummy extern is still declared for the parser).
+extern void* _kCoefficientsRgbY;
+#define kCoefficientsRgbY _kCoefficientsRgbY
+#endif
+
+__declspec(naked)  // naked: the asm body reads its own arguments off the stack and issues its own ret
+void FastConvertYUVToRGB32Row_SSE(const uint8* y_buf,
+                                  const uint8* u_buf,
+                                  const uint8* v_buf,
+                                  uint8* rgb_buf,
+                                  int width) {
+  __asm {
+    pushad                          // saves 32 bytes of registers, hence the "+ 32" in the argument offsets
+    mov       edx, [esp + 32 + 4]   // edx = y_buf
+    mov       edi, [esp + 32 + 8]   // edi = u_buf
+    mov       esi, [esp + 32 + 12]  // esi = v_buf
+    mov       ebp, [esp + 32 + 16]  // ebp = rgb_buf
+    mov       ecx, [esp + 32 + 20]  // ecx = width (pixels still to write)
+    jmp       convertend            // check the count before the first pass
+
+ convertloop :                      // each pass: 1 U, 1 V and 2 Y bytes in, 2 pixels (8 bytes) out
+    movzx     eax, byte ptr [edi]
+    add       edi, 1
+    movzx     ebx, byte ptr [esi]
+    add       esi, 1
+    movq      mm0, [kCoefficientsRgbU + 8 * eax]  // mm0 = U row of the table (4 x int16)
+    movzx     eax, byte ptr [edx]
+    paddsw    mm0, [kCoefficientsRgbV + 8 * ebx]  // mm0 += V row, with signed saturation
+    movzx     ebx, byte ptr [edx + 1]
+    movq      mm1, [kCoefficientsRgbY + 8 * eax]  // Y row for the first pixel
+    add       edx, 2
+    movq      mm2, [kCoefficientsRgbY + 8 * ebx]  // Y row for the second pixel
+    paddsw    mm1, mm0              // both pixels share the same U+V sum
+    paddsw    mm2, mm0
+    psraw     mm1, 6                // remove the factor of 64 baked into the table entries
+    psraw     mm2, 6
+    packuswb  mm1, mm2              // saturate to bytes: two 4-byte pixels in one quadword
+    movntq    [ebp], mm1            // non-temporal 8-byte store
+    add       ebp, 8
+ convertend :
+    sub       ecx, 2
+    jns       convertloop           // continue while at least 2 pixels were left
+
+    and       ecx, 1  // odd number of pixels?
+    jz        convertdone
+
+    movzx     eax, byte ptr [edi]   // tail: one leftover pixel
+    movq      mm0, [kCoefficientsRgbU + 8 * eax]
+    movzx     eax, byte ptr [esi]
+    paddsw    mm0, [kCoefficientsRgbV + 8 * eax]
+    movzx     eax, byte ptr [edx]
+    movq      mm1, [kCoefficientsRgbY + 8 * eax]
+    paddsw    mm1, mm0
+    psraw     mm1, 6
+    packuswb  mm1, mm1
+    movd      [ebp], mm1            // 4-byte store of the single pixel
+ convertdone :
+
+    popad
+    ret
+  }
+}
+
+__declspec(naked)  // naked: the asm body reads its own arguments off the stack and issues its own ret
+void ConvertYUVToRGB32Row_SSE(const uint8* y_buf,
+                              const uint8* u_buf,
+                              const uint8* v_buf,
+                              uint8* rgb_buf,
+                              int width,
+                              int step) {
+  __asm {
+    pushad                          // saves 32 bytes of registers, hence the "+ 32" in the argument offsets
+    mov       edx, [esp + 32 + 4]   // edx = y_buf
+    mov       edi, [esp + 32 + 8]   // edi = u_buf
+    mov       esi, [esp + 32 + 12]  // esi = v_buf
+    mov       ebp, [esp + 32 + 16]  // ebp = rgb_buf
+    mov       ecx, [esp + 32 + 20]  // ecx = width (pixels still to write)
+    mov       ebx, [esp + 32 + 24]  // ebx = step, added to every source pointer after each read
+    jmp       wend                  // check the count before the first pass
+
+ wloop :                            // each pass: 1 U, 1 V and 2 Y reads, 2 pixels (8 bytes) out
+    movzx     eax, byte ptr [edi]
+    add       edi, ebx              // U advances by step once per pass
+    movq      mm0, [kCoefficientsRgbU + 8 * eax]
+    movzx     eax, byte ptr [esi]
+    add       esi, ebx              // V advances by step once per pass
+    paddsw    mm0, [kCoefficientsRgbV + 8 * eax]
+    movzx     eax, byte ptr [edx]
+    add       edx, ebx              // Y advances by step after each of its two reads
+    movq      mm1, [kCoefficientsRgbY + 8 * eax]
+    movzx     eax, byte ptr [edx]
+    add       edx, ebx
+    movq      mm2, [kCoefficientsRgbY + 8 * eax]
+    paddsw    mm1, mm0              // both pixels share the same U+V sum
+    paddsw    mm2, mm0
+    psraw     mm1, 6                // remove the factor of 64 baked into the table entries
+    psraw     mm2, 6
+    packuswb  mm1, mm2              // saturate to bytes: two 4-byte pixels in one quadword
+    movntq    [ebp], mm1            // non-temporal 8-byte store
+    add       ebp, 8
+ wend :
+    sub       ecx, 2
+    jns       wloop                 // continue while at least 2 pixels were left
+
+    and       ecx, 1  // odd number of pixels?
+    jz        wdone
+
+    movzx     eax, byte ptr [edi]   // tail: one leftover pixel
+    movq      mm0, [kCoefficientsRgbU + 8 * eax]
+    movzx     eax, byte ptr [esi]
+    paddsw    mm0, [kCoefficientsRgbV + 8 * eax]
+    movzx     eax, byte ptr [edx]
+    movq      mm1, [kCoefficientsRgbY + 8 * eax]
+    paddsw    mm1, mm0
+    psraw     mm1, 6
+    packuswb  mm1, mm1
+    movd      [ebp], mm1            // 4-byte store of the single pixel
+ wdone :
+
+    popad
+    ret
+  }
+}
+
+__declspec(naked)  // naked: the asm body reads its own arguments off the stack and issues its own ret
+void RotateConvertYUVToRGB32Row_SSE(const uint8* y_buf,
+                                    const uint8* u_buf,
+                                    const uint8* v_buf,
+                                    uint8* rgb_buf,
+                                    int width,
+                                    int ystep,
+                                    int uvstep) {
+  __asm {
+    pushad                          // saves 32 bytes of registers, hence the "+ 32" in the argument offsets
+    mov       edx, [esp + 32 + 4]   // edx = y_buf
+    mov       edi, [esp + 32 + 8]   // edi = u_buf
+    mov       esi, [esp + 32 + 12]  // esi = v_buf
+    mov       ebp, [esp + 32 + 16]  // ebp = rgb_buf
+    mov       ecx, [esp + 32 + 20]  // ecx = width (pixels still to write)
+    jmp       wend                  // check the count before the first pass
+
+ wloop :                            // each pass: 1 U, 1 V and 2 Y reads, 2 pixels (8 bytes) out
+    movzx     eax, byte ptr [edi]
+    mov       ebx, [esp + 32 + 28]  // ebx = uvstep, reloaded from the stack every pass
+    add       edi, ebx
+    movq      mm0, [kCoefficientsRgbU + 8 * eax]
+    movzx     eax, byte ptr [esi]
+    add       esi, ebx
+    paddsw    mm0, [kCoefficientsRgbV + 8 * eax]
+    movzx     eax, byte ptr [edx]
+    mov       ebx, [esp + 32 + 24]  // ebx = ystep, replacing uvstep for the two Y reads
+    add       edx, ebx
+    movq      mm1, [kCoefficientsRgbY + 8 * eax]
+    movzx     eax, byte ptr [edx]
+    add       edx, ebx
+    movq      mm2, [kCoefficientsRgbY + 8 * eax]
+    paddsw    mm1, mm0              // both pixels share the same U+V sum
+    paddsw    mm2, mm0
+    psraw     mm1, 6                // remove the factor of 64 baked into the table entries
+    psraw     mm2, 6
+    packuswb  mm1, mm2              // saturate to bytes: two 4-byte pixels in one quadword
+    movntq    [ebp], mm1            // non-temporal 8-byte store
+    add       ebp, 8
+ wend :
+    sub       ecx, 2
+    jns       wloop                 // continue while at least 2 pixels were left
+
+    and       ecx, 1  // odd number of pixels?
+    jz        wdone
+
+    movzx     eax, byte ptr [edi]   // tail: one leftover pixel
+    movq      mm0, [kCoefficientsRgbU + 8 * eax]
+    movzx     eax, byte ptr [esi]
+    paddsw    mm0, [kCoefficientsRgbV + 8 * eax]
+    movzx     eax, byte ptr [edx]
+    movq      mm1, [kCoefficientsRgbY + 8 * eax]
+    paddsw    mm1, mm0
+    psraw     mm1, 6
+    packuswb  mm1, mm1
+    movd      [ebp], mm1            // 4-byte store of the single pixel
+ wdone :
+
+    popad
+    ret
+  }
+}
+
+__declspec(naked)  // naked: the asm body reads its own arguments off the stack and issues its own ret
+void DoubleYUVToRGB32Row_SSE(const uint8* y_buf,
+                             const uint8* u_buf,
+                             const uint8* v_buf,
+                             uint8* rgb_buf,
+                             int width) {
+  __asm {
+    pushad                          // saves 32 bytes of registers, hence the "+ 32" in the argument offsets
+    mov       edx, [esp + 32 + 4]   // edx = y_buf
+    mov       edi, [esp + 32 + 8]   // edi = u_buf
+    mov       esi, [esp + 32 + 12]  // esi = v_buf
+    mov       ebp, [esp + 32 + 16]  // ebp = rgb_buf
+    mov       ecx, [esp + 32 + 20]  // ecx = width, counted in output pixels
+    jmp       wend                  // check the count before the first pass
+
+ wloop :                            // each pass: 1 U, 1 V and 2 Y bytes in, 4 output pixels (16 bytes) out
+    movzx     eax, byte ptr [edi]
+    add       edi, 1
+    movzx     ebx, byte ptr [esi]
+    add       esi, 1
+    movq      mm0, [kCoefficientsRgbU + 8 * eax]
+    movzx     eax, byte ptr [edx]
+    paddsw    mm0, [kCoefficientsRgbV + 8 * ebx]  // mm0 = U+V sum, shared by both source pixels
+    movq      mm1, [kCoefficientsRgbY + 8 * eax]
+    paddsw    mm1, mm0
+    psraw     mm1, 6                // remove the factor of 64 baked into the table entries
+    packuswb  mm1, mm1
+    punpckldq mm1, mm1              // duplicate the low dword: the same pixel twice
+    movntq    [ebp], mm1            // first source pixel written as two output pixels
+
+    movzx     ebx, byte ptr [edx + 1]
+    add       edx, 2
+    paddsw    mm0, [kCoefficientsRgbY + 8 * ebx]  // second luma is added into the U+V sum in place
+    psraw     mm0, 6
+    packuswb  mm0, mm0
+    punpckldq mm0, mm0              // duplicate the low dword: the same pixel twice
+    movntq    [ebp+8], mm0          // second source pixel written as two output pixels
+    add       ebp, 16
+ wend :
+    sub       ecx, 4
+    jns       wloop                 // continue while at least 4 output pixels were left
+
+    add       ecx, 4                // ecx = leftover output pixels (0..3)
+    jz        wdone
+
+    movzx     eax, byte ptr [edi]   // tail: convert one pixel from the current U, V and Y bytes
+    movq      mm0, [kCoefficientsRgbU + 8 * eax]
+    movzx     eax, byte ptr [esi]
+    paddsw    mm0, [kCoefficientsRgbV + 8 * eax]
+    movzx     eax, byte ptr [edx]
+    movq      mm1, [kCoefficientsRgbY + 8 * eax]
+    paddsw    mm1, mm0
+    psraw     mm1, 6
+    packuswb  mm1, mm1
+    jmp       wend1
+
+ wloop1 :                           // store that one pixel once per leftover output pixel
+    movd      [ebp], mm1
+    add       ebp, 4
+ wend1 :
+    sub       ecx, 1
+    jns       wloop1
+ wdone :
+    popad
+    ret
+  }
+}
+
+// This version does general purpose scaling by any amount, up or down.
+// The only thing it cannot do is rotation by 90 or 270.
+// For performance the chroma is under-sampled, reducing cost of a 3x
+// 1080p scale from 8.4 ms to 5.4 ms.
+__declspec(naked)  // naked: the asm body reads its own arguments off the stack and issues its own ret
+void ScaleYUVToRGB32Row_SSE(const uint8* y_buf,
+                            const uint8* u_buf,
+                            const uint8* v_buf,
+                            uint8* rgb_buf,
+                            int width,
+                            int source_dx) {
+  __asm {
+    pushad                          // saves 32 bytes of registers, hence the "+ 32" in the argument offsets
+    mov       edx, [esp + 32 + 4]   // edx = y_buf
+    mov       edi, [esp + 32 + 8]   // edi = u_buf
+    mov       esi, [esp + 32 + 12]  // esi = v_buf
+    mov       ebp, [esp + 32 + 16]  // ebp = rgb_buf
+    mov       ecx, [esp + 32 + 20]  // ecx = width (pixels still to write)
+    xor       ebx, ebx              // x = 0: source position with 16 fractional bits
+    jmp       scaleend              // check the count before the first pass
+
+ scaleloop :                        // each pass: one U/V sample at x >> 17, two Y samples at x >> 16
+    mov       eax, ebx
+    sar       eax, 17               // chroma index is half the luma index
+    movzx     eax, byte ptr [edi + eax]
+    movq      mm0, [kCoefficientsRgbU + 8 * eax]
+    mov       eax, ebx
+    sar       eax, 17
+    movzx     eax, byte ptr [esi + eax]
+    paddsw    mm0, [kCoefficientsRgbV + 8 * eax]
+    mov       eax, ebx
+    add       ebx, [esp + 32 + 24]  // x += source_dx
+    sar       eax, 16               // luma index from the pre-increment x
+    movzx     eax, byte ptr [edx + eax]
+    movq      mm1, [kCoefficientsRgbY + 8 * eax]
+    mov       eax, ebx
+    add       ebx, [esp + 32 + 24]  // x += source_dx
+    sar       eax, 16
+    movzx     eax, byte ptr [edx + eax]
+    movq      mm2, [kCoefficientsRgbY + 8 * eax]
+    paddsw    mm1, mm0              // both pixels share the same U+V sum
+    paddsw    mm2, mm0
+    psraw     mm1, 6                // remove the factor of 64 baked into the table entries
+    psraw     mm2, 6
+    packuswb  mm1, mm2              // saturate to bytes: two 4-byte pixels in one quadword
+    movntq    [ebp], mm1            // non-temporal 8-byte store
+    add       ebp, 8
+ scaleend :
+    sub       ecx, 2
+    jns       scaleloop             // continue while at least 2 pixels were left
+
+    and       ecx, 1  // odd number of pixels?
+    jz        scaledone
+
+    mov       eax, ebx              // tail: one leftover pixel sampled at the current x
+    sar       eax, 17
+    movzx     eax, byte ptr [edi + eax]
+    movq      mm0, [kCoefficientsRgbU + 8 * eax]
+    mov       eax, ebx
+    sar       eax, 17
+    movzx     eax, byte ptr [esi + eax]
+    paddsw    mm0, [kCoefficientsRgbV + 8 * eax]
+    mov       eax, ebx
+    sar       eax, 16
+    movzx     eax, byte ptr [edx + eax]
+    movq      mm1, [kCoefficientsRgbY + 8 * eax]
+    paddsw    mm1, mm0
+    psraw     mm1, 6
+    packuswb  mm1, mm1
+    movd      [ebp], mm1            // 4-byte store of the single pixel
+
+ scaledone :
+    popad
+    ret
+  }
+}
+
+__declspec(naked)  // naked: the asm body reads its own arguments off the stack and issues its own ret
+void LinearScaleYUVToRGB32Row_SSE(const uint8* y_buf,
+                                  const uint8* u_buf,
+                                  const uint8* v_buf,
+                                  uint8* rgb_buf,
+                                  int width,
+                                  int source_dx) {
+  __asm {
+    pushad                         // saves 32 bytes of registers, hence the "+ 32" in the argument offsets
+    mov       edx, [esp + 32 + 4]  // edx = y_buf
+    mov       edi, [esp + 32 + 8]  // edi = u_buf
+                // [esp + 32 + 12] // v_buf stays on the stack; it is loaded inside the loop
+    mov       ebp, [esp + 32 + 16] // ebp = rgb_buf
+    mov       ecx, [esp + 32 + 20] // width
+    imul      ecx, [esp + 32 + 24] // source_dx
+    mov       [esp + 32 + 20], ecx // source_width = width * source_dx (overwrites the width argument slot)
+    mov       ecx, [esp + 32 + 24] // source_dx
+    xor       ebx, ebx             // x = 0
+    cmp       ecx, 0x20000         // is source_dx at least 2.0 (16 fractional bits)?
+    jl        lscaleend
+    mov       ebx, 0x8000          // x = 0.5 for 1/2 or less
+    jmp       lscaleend
+lscaleloop:
+    mov       eax, ebx
+    sar       eax, 0x11            // chroma index = x >> 17
+
+    movzx     ecx, byte ptr [edi + eax]      // ecx = U[i]
+    movzx     esi, byte ptr [edi + eax + 1]  // esi = U[i + 1]
+    mov       eax, ebx
+    and       eax, 0x1fffe         // fractional part of x at chroma resolution
+    imul      esi, eax             // weight for the next sample
+    xor       eax, 0x1fffe         // complementary weight for the current sample
+    imul      ecx, eax
+    add       ecx, esi
+    shr       ecx, 17              // blended U value
+    movq      mm0, [kCoefficientsRgbU + 8 * ecx]
+
+    mov       esi, [esp + 32 + 12] // esi = v_buf, reloaded because esi is used as scratch
+    mov       eax, ebx
+    sar       eax, 0x11
+
+    movzx     ecx, byte ptr [esi + eax]      // ecx = V[i]
+    movzx     esi, byte ptr [esi + eax + 1]  // esi = V[i + 1]
+    mov       eax, ebx
+    and       eax, 0x1fffe
+    imul      esi, eax
+    xor       eax, 0x1fffe
+    imul      ecx, eax
+    add       ecx, esi
+    shr       ecx, 17              // blended V value
+    paddsw    mm0, [kCoefficientsRgbV + 8 * ecx]  // mm0 = U+V sum for this pair of pixels
+
+    mov       eax, ebx
+    sar       eax, 0x10            // luma index = x >> 16
+    movzx     ecx, byte ptr [edx + eax]      // ecx = Y[j]
+    movzx     esi, byte ptr [1 + edx + eax]  // esi = Y[j + 1]
+    mov       eax, ebx
+    add       ebx, [esp + 32 + 24] // x += source_dx
+    and       eax, 0xffff          // fractional part of the pre-increment x
+    imul      esi, eax
+    xor       eax, 0xffff
+    imul      ecx, eax
+    add       ecx, esi
+    shr       ecx, 16              // blended Y value for the first pixel
+    movq      mm1, [kCoefficientsRgbY + 8 * ecx]
+
+    cmp       ebx, [esp + 32 + 20] // x reached source_width?
+    jge       lscalelastpixel      // then only one pixel is left to write
+
+    mov       eax, ebx
+    sar       eax, 0x10
+    movzx     ecx, byte ptr [edx + eax]
+    movzx     esi, byte ptr [edx + eax + 1]
+    mov       eax, ebx
+    add       ebx, [esp + 32 + 24] // x += source_dx
+    and       eax, 0xffff
+    imul      esi, eax
+    xor       eax, 0xffff
+    imul      ecx, eax
+    add       ecx, esi
+    shr       ecx, 16              // blended Y value for the second pixel
+    movq      mm2, [kCoefficientsRgbY + 8 * ecx]
+
+    paddsw    mm1, mm0             // both pixels share the same U+V sum
+    paddsw    mm2, mm0
+    psraw     mm1, 0x6             // remove the factor of 64 baked into the table entries
+    psraw     mm2, 0x6
+    packuswb  mm1, mm2             // saturate to bytes: two 4-byte pixels in one quadword
+    movntq    [ebp], mm1           // non-temporal 8-byte store
+    add       ebp, 0x8
+
+lscaleend:
+    cmp       ebx, [esp + 32 + 20] // loop while x < source_width
+    jl        lscaleloop
+    popad
+    ret
+
+lscalelastpixel:                   // write the single remaining pixel and return
+    paddsw    mm1, mm0
+    psraw     mm1, 6
+    packuswb  mm1, mm1
+    movd      [ebp], mm1
+    popad
+    ret
+  };
+}
+#endif // if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
+
+void FastConvertYUVToRGB32Row(const uint8* y_buf,  // Dispatcher: SSE asm routine when available, otherwise the C routine.
+                              const uint8* u_buf,
+                              const uint8* v_buf,
+                              uint8* rgb_buf,
+                              int width) {
+#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)  // same condition that guards the _SSE definitions above
+  if (mozilla::supports_sse()) {
+    FastConvertYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width);
+    return;
+  }
+#endif
+
+  FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);  // NOTE(review): the trailing 1 is an extra parameter of the _C routine; its meaning is not visible here.
+}
+
+void ScaleYUVToRGB32Row(const uint8* y_buf,  // Dispatcher: SSE asm routine when available, otherwise the C routine.
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* rgb_buf,
+                        int width,
+                        int source_dx) {
+
+#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)  // same condition that guards the _SSE definitions above
+  if (mozilla::supports_sse()) {
+    ScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
+    return;
+  }
+#endif
+
+  ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);  // fallback: same arguments, C implementation
+}
+
+void LinearScaleYUVToRGB32Row(const uint8* y_buf,  // Dispatcher: SSE asm routine when available, otherwise the C routine.
+                              const uint8* u_buf,
+                              const uint8* v_buf,
+                              uint8* rgb_buf,
+                              int width,
+                              int source_dx) {
+#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)  // same condition that guards the _SSE definitions above
+  if (mozilla::supports_sse()) {
+    LinearScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width,
+                                 source_dx);
+    return;
+  }
+#endif
+
+  LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);  // fallback: same arguments, C implementation
+}
+
+} // extern "C"
diff --git a/gfx/ycbcr/yuv_row_win64.cpp b/gfx/ycbcr/yuv_row_win64.cpp
new file mode 100644
index 0000000000..6a34f840a5
--- /dev/null
+++ b/gfx/ycbcr/yuv_row_win64.cpp
@@ -0,0 +1,205 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "yuv_row.h"
+
+extern "C" {
+
+// The x64 compiler supports neither MMX nor inline assembly, so use SSE2
+// intrinsics.  The U and V tables start 2048 and 4096 bytes past kCoefficientsRgbY.
+#define kCoefficientsRgbU (reinterpret_cast<const uint8*>(kCoefficientsRgbY) + 2048)
+#define kCoefficientsRgbV (reinterpret_cast<const uint8*>(kCoefficientsRgbY) + 4096)
+
+#include <emmintrin.h>
+
+// SSE2 row converter. For each pixel it adds (with signed saturation) the
+// 8-byte table entries selected by the Y, U and V bytes, shifts each 16-bit
+// lane right by 6 and packs the result to 4 bytes. One U/V sample pair is
+// consumed for every two Y samples.
+static void FastConvertYUVToRGB32Row_SSE2(const uint8* y_buf,
+                                          const uint8* u_buf,
+                                          const uint8* v_buf,
+                                          uint8* rgb_buf,
+                                          int width) {
+  const uint8* const y_table = reinterpret_cast<const uint8*>(kCoefficientsRgbY);
+
+  // Main loop: two output pixels (8 bytes) per iteration.
+  for (; width >= 2; width -= 2, rgb_buf += 8) {
+    const __m128i u_term = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(kCoefficientsRgbU + 8 * *u_buf++));
+    const __m128i v_term = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(kCoefficientsRgbV + 8 * *v_buf++));
+    const __m128i chroma = _mm_adds_epi16(u_term, v_term);
+
+    const __m128i luma0 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(y_table + 8 * *y_buf++));
+    const __m128i luma1 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(y_table + 8 * *y_buf++));
+
+    // Pixel 0 goes in the low 64 bits and pixel 1 in the high 64 bits so that
+    // both are shifted and packed together.
+    const __m128i sums = _mm_unpacklo_epi64(_mm_adds_epi16(luma0, chroma),
+                                            _mm_adds_epi16(luma1, chroma));
+    const __m128i scaled = _mm_srai_epi16(sums, 6);
+    _mm_storel_epi64(reinterpret_cast<__m128i*>(rgb_buf), _mm_packus_epi16(scaled, scaled));
+  }
+
+  // Remaining single pixel, if any.
+  if (width) {
+    const __m128i u_term = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(kCoefficientsRgbU + 8 * *u_buf));
+    const __m128i v_term = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(kCoefficientsRgbV + 8 * *v_buf));
+    const __m128i luma = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(y_table + 8 * *y_buf));
+    const __m128i sum = _mm_adds_epi16(luma, _mm_adds_epi16(u_term, v_term));
+    const __m128i scaled = _mm_srai_epi16(sum, 6);
+    *reinterpret_cast<uint32*>(rgb_buf) = _mm_cvtsi128_si32(_mm_packus_epi16(scaled, scaled));
+  }
+}
+
+// SSE2 row scaler/converter using nearest-sample lookup. The source position
+// is 16.16 fixed point and advances by |source_dx| per output pixel; Y is read
+// at position >> 16 and U/V at position >> 17. Each output pair shares the U/V
+// sampled at the position of its first pixel.
+static void ScaleYUVToRGB32Row_SSE2(const uint8* y_buf,
+                                    const uint8* u_buf,
+                                    const uint8* v_buf,
+                                    uint8* rgb_buf,
+                                    int width,
+                                    int source_dx) {
+  const uint8* const y_table = reinterpret_cast<const uint8*>(kCoefficientsRgbY);
+  int src_pos = 0;
+
+  // Main loop: two output pixels (8 bytes) per iteration.
+  for (; width >= 2; width -= 2, rgb_buf += 8) {
+    // Chroma is looked up once per pair, at the first pixel's position.
+    const uint8 u_sample = u_buf[src_pos >> 17];
+    const uint8 v_sample = v_buf[src_pos >> 17];
+    const __m128i chroma = _mm_adds_epi16(
+        _mm_loadl_epi64(reinterpret_cast<const __m128i*>(kCoefficientsRgbU + 8 * u_sample)),
+        _mm_loadl_epi64(reinterpret_cast<const __m128i*>(kCoefficientsRgbV + 8 * v_sample)));
+
+    // First pixel of the pair.
+    const uint8 y_first = y_buf[src_pos >> 16];
+    src_pos += source_dx;
+    const __m128i pixel0 = _mm_adds_epi16(
+        _mm_loadl_epi64(reinterpret_cast<const __m128i*>(y_table + 8 * y_first)), chroma);
+
+    // Second pixel of the pair.
+    const uint8 y_second = y_buf[src_pos >> 16];
+    src_pos += source_dx;
+    const __m128i pixel1 = _mm_adds_epi16(
+        _mm_loadl_epi64(reinterpret_cast<const __m128i*>(y_table + 8 * y_second)), chroma);
+
+    // Low 64 bits hold pixel 0, high 64 bits pixel 1; shift and pack together.
+    const __m128i scaled = _mm_srai_epi16(_mm_unpacklo_epi64(pixel0, pixel1), 6);
+    _mm_storel_epi64(reinterpret_cast<__m128i*>(rgb_buf), _mm_packus_epi16(scaled, scaled));
+  }
+
+  // Remaining single pixel, if any.
+  if (width) {
+    const uint8 u_sample = u_buf[src_pos >> 17];
+    const uint8 v_sample = v_buf[src_pos >> 17];
+    const uint8 y_sample = y_buf[src_pos >> 16];
+    const __m128i chroma = _mm_adds_epi16(
+        _mm_loadl_epi64(reinterpret_cast<const __m128i*>(kCoefficientsRgbU + 8 * u_sample)),
+        _mm_loadl_epi64(reinterpret_cast<const __m128i*>(kCoefficientsRgbV + 8 * v_sample)));
+    const __m128i pixel = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(y_table + 8 * y_sample)), chroma);
+    const __m128i scaled = _mm_srai_epi16(pixel, 6);
+    *reinterpret_cast<uint32*>(rgb_buf) = _mm_cvtsi128_si32(_mm_packus_epi16(scaled, scaled));
+  }
+}
+
+// SSE2 row scaler/converter with linear interpolation between neighbouring
+// source samples. The source position is 16.16 fixed point and advances by
+// |source_dx| per output pixel. Y blends samples [pos >> 16] and the next one
+// by the low 16 bits of pos; U/V blend samples [pos >> 17] and the next one by
+// bits 1..16 of pos. Each output pair shares one blended U/V.
+static void LinearScaleYUVToRGB32Row_SSE2(const uint8* y_buf,
+                                          const uint8* u_buf,
+                                          const uint8* v_buf,
+                                          uint8* rgb_buf,
+                                          int width,
+                                          int source_dx) {
+  const uint8* const y_table = reinterpret_cast<const uint8*>(kCoefficientsRgbY);
+
+  // Start half a source pixel in (0x8000 in 16.16) when the step is at least
+  // two source pixels per output pixel.
+  int src_pos = (source_dx >= 0x20000) ? 32768 : 0;
+
+  // Main loop: two output pixels (8 bytes) per iteration.
+  for (; width >= 2; width -= 2, rgb_buf += 8) {
+    // Blend chroma once per pair, at the first pixel's position.
+    const int uv_index = src_pos >> 17;
+    const uint32 uv_frac = (src_pos & 0x1fffe);
+    const uint32 uv_inv = (uv_frac ^ 0x1fffe);
+    const uint8 u_lo = u_buf[uv_index], u_hi = u_buf[uv_index + 1];
+    const uint8 v_lo = v_buf[uv_index], v_hi = v_buf[uv_index + 1];
+    const uint32 u_blend = (uv_frac * u_hi + uv_inv * u_lo) >> 17;
+    const uint32 v_blend = (uv_frac * v_hi + uv_inv * v_lo) >> 17;
+    const __m128i chroma = _mm_adds_epi16(
+        _mm_loadl_epi64(reinterpret_cast<const __m128i*>(kCoefficientsRgbU + 8 * u_blend)),
+        _mm_loadl_epi64(reinterpret_cast<const __m128i*>(kCoefficientsRgbV + 8 * v_blend)));
+
+    // First pixel of the pair.
+    uint32 y_frac = (src_pos & 0xffff);
+    uint8 y_lo = y_buf[src_pos >> 16];
+    uint8 y_hi = y_buf[(src_pos >> 16) + 1];
+    uint32 y_blend = (y_frac * y_hi + (y_frac ^ 0xffff) * y_lo) >> 16;
+    src_pos += source_dx;
+    const __m128i pixel0 = _mm_adds_epi16(
+        _mm_loadl_epi64(reinterpret_cast<const __m128i*>(y_table + 8 * y_blend)), chroma);
+
+    // Second pixel of the pair, blended at the advanced position.
+    y_frac = (src_pos & 0xffff);
+    y_lo = y_buf[src_pos >> 16];
+    y_hi = y_buf[(src_pos >> 16) + 1];
+    y_blend = (y_frac * y_hi + (y_frac ^ 0xffff) * y_lo) >> 16;
+    src_pos += source_dx;
+    const __m128i pixel1 = _mm_adds_epi16(
+        _mm_loadl_epi64(reinterpret_cast<const __m128i*>(y_table + 8 * y_blend)), chroma);
+
+    // Low 64 bits hold pixel 0, high 64 bits pixel 1; shift and pack together.
+    const __m128i scaled = _mm_srai_epi16(_mm_unpacklo_epi64(pixel0, pixel1), 6);
+    _mm_storel_epi64(reinterpret_cast<__m128i*>(rgb_buf), _mm_packus_epi16(scaled, scaled));
+  }
+
+  // Remaining single pixel, if any. It is point-sampled: unlike the main loop
+  // it neither blends with nor reads the following source sample.
+  // NOTE(review): presumably this avoids reading past the end of the row;
+  // confirm against callers.
+  if (width) {
+    const uint32 u_sample = u_buf[src_pos >> 17];
+    const uint32 v_sample = v_buf[src_pos >> 17];
+    const uint32 y_sample = y_buf[src_pos >> 16];
+    const __m128i chroma = _mm_adds_epi16(
+        _mm_loadl_epi64(reinterpret_cast<const __m128i*>(kCoefficientsRgbU + 8 * u_sample)),
+        _mm_loadl_epi64(reinterpret_cast<const __m128i*>(kCoefficientsRgbV + 8 * v_sample)));
+    const __m128i pixel = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(y_table + 8 * y_sample)), chroma);
+    const __m128i scaled = _mm_srai_epi16(pixel, 6);
+    *reinterpret_cast<uint32*>(rgb_buf) = _mm_cvtsi128_si32(_mm_packus_epi16(scaled, scaled));
+  }
+}
+
+// Win64 entry point for row conversion. There is no runtime CPU check or C
+// fallback here: it always forwards to the SSE2 implementation.
+void FastConvertYUVToRGB32Row(const uint8* y_buf, const uint8* u_buf,
+                              const uint8* v_buf, uint8* rgb_buf,
+                              int width) {
+  FastConvertYUVToRGB32Row_SSE2(y_buf, u_buf, v_buf, rgb_buf, width);
+}
+
+// Win64 entry point for row scaling and conversion. There is no runtime CPU
+// check or C fallback here: it always forwards to the SSE2 implementation,
+// passing |source_dx| through unchanged.
+void ScaleYUVToRGB32Row(const uint8* y_buf, const uint8* u_buf,
+                        const uint8* v_buf, uint8* rgb_buf,
+                        int width, int source_dx) {
+  ScaleYUVToRGB32Row_SSE2(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
+}
+
+// Win64 entry point for linearly interpolated row scaling and conversion.
+// There is no runtime CPU check or C fallback here: it always forwards to the
+// SSE2 implementation, passing |source_dx| through unchanged.
+void LinearScaleYUVToRGB32Row(const uint8* y_buf, const uint8* u_buf,
+                              const uint8* v_buf, uint8* rgb_buf,
+                              int width, int source_dx) {
+  LinearScaleYUVToRGB32Row_SSE2(y_buf, u_buf, v_buf, rgb_buf, width,
+                                source_dx);
+}
+
+} // extern "C"