Diffstat (limited to 'media/libyuv/libyuv/source')
44 files changed, 53154 insertions, 0 deletions
diff --git a/media/libyuv/libyuv/source/compare.cc b/media/libyuv/libyuv/source/compare.cc new file mode 100644 index 0000000000..50e3abd055 --- /dev/null +++ b/media/libyuv/libyuv/source/compare.cc @@ -0,0 +1,429 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/compare.h" + +#include <float.h> +#include <math.h> +#ifdef _OPENMP +#include <omp.h> +#endif + +#include "libyuv/basic_types.h" +#include "libyuv/compare_row.h" +#include "libyuv/cpu_id.h" +#include "libyuv/row.h" +#include "libyuv/video_common.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// hash seed of 5381 recommended. +LIBYUV_API +uint32_t HashDjb2(const uint8_t* src, uint64_t count, uint32_t seed) { + const int kBlockSize = 1 << 15; // 32768; + int remainder; + uint32_t (*HashDjb2_SSE)(const uint8_t* src, int count, uint32_t seed) = + HashDjb2_C; +#if defined(HAS_HASHDJB2_SSE41) + if (TestCpuFlag(kCpuHasSSE41)) { + HashDjb2_SSE = HashDjb2_SSE41; + } +#endif +#if defined(HAS_HASHDJB2_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + HashDjb2_SSE = HashDjb2_AVX2; + } +#endif + + while (count >= (uint64_t)(kBlockSize)) { + seed = HashDjb2_SSE(src, kBlockSize, seed); + src += kBlockSize; + count -= kBlockSize; + } + remainder = (int)count & ~15; + if (remainder) { + seed = HashDjb2_SSE(src, remainder, seed); + src += remainder; + count -= remainder; + } + remainder = (int)count & 15; + if (remainder) { + seed = HashDjb2_C(src, remainder, seed); + } + return seed; +} + +static uint32_t ARGBDetectRow_C(const uint8_t* argb, int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + if (argb[0] != 255) { // First byte is not Alpha of 255, so not ARGB. + return FOURCC_BGRA; + } + if (argb[3] != 255) { // 4th byte is not Alpha of 255, so not BGRA. + return FOURCC_ARGB; + } + if (argb[4] != 255) { // Second pixel first byte is not Alpha of 255. + return FOURCC_BGRA; + } + if (argb[7] != 255) { // Second pixel 4th byte is not Alpha of 255. + return FOURCC_ARGB; + } + argb += 8; + } + if (width & 1) { + if (argb[0] != 255) { // First byte is not Alpha of 255, so not ARGB. + return FOURCC_BGRA; + } + if (argb[3] != 255) { // 4th byte is not Alpha of 255, so not BGRA. + return FOURCC_ARGB; + } + } + return 0; +} + +// Scan an opaque argb image and return fourcc based on alpha offset. +// Returns FOURCC_ARGB, FOURCC_BGRA, or 0 if unknown. +LIBYUV_API +uint32_t ARGBDetect(const uint8_t* argb, + int stride_argb, + int width, + int height) { + uint32_t fourcc = 0; + int h; + + // Coalesce rows. + if (stride_argb == width * 4) { + width *= height; + height = 1; + stride_argb = 0; + } + for (h = 0; h < height && fourcc == 0; ++h) { + fourcc = ARGBDetectRow_C(argb, width); + argb += stride_argb; + } + return fourcc; +} + +// NEON version accumulates in 16 bit shorts which overflow at 65536 bytes. +// So actual maximum is 1 less loop, which is 64436 - 32 bytes. 
+ +LIBYUV_API +uint64_t ComputeHammingDistance(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + const int kBlockSize = 1 << 15; // 32768; + const int kSimdSize = 64; + // SIMD for multiple of 64, and C for remainder + int remainder = count & (kBlockSize - 1) & ~(kSimdSize - 1); + uint64_t diff = 0; + int i; + uint32_t (*HammingDistance)(const uint8_t* src_a, const uint8_t* src_b, + int count) = HammingDistance_C; +#if defined(HAS_HAMMINGDISTANCE_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + HammingDistance = HammingDistance_NEON; + } +#endif +#if defined(HAS_HAMMINGDISTANCE_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + HammingDistance = HammingDistance_SSSE3; + } +#endif +#if defined(HAS_HAMMINGDISTANCE_SSE42) + if (TestCpuFlag(kCpuHasSSE42)) { + HammingDistance = HammingDistance_SSE42; + } +#endif +#if defined(HAS_HAMMINGDISTANCE_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + HammingDistance = HammingDistance_AVX2; + } +#endif +#if defined(HAS_HAMMINGDISTANCE_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + HammingDistance = HammingDistance_MSA; + } +#endif +#ifdef _OPENMP +#pragma omp parallel for reduction(+ : diff) +#endif + for (i = 0; i < (count - (kBlockSize - 1)); i += kBlockSize) { + diff += HammingDistance(src_a + i, src_b + i, kBlockSize); + } + src_a += count & ~(kBlockSize - 1); + src_b += count & ~(kBlockSize - 1); + if (remainder) { + diff += HammingDistance(src_a, src_b, remainder); + src_a += remainder; + src_b += remainder; + } + remainder = count & (kSimdSize - 1); + if (remainder) { + diff += HammingDistance_C(src_a, src_b, remainder); + } + return diff; +} + +// TODO(fbarchard): Refactor into row function. +LIBYUV_API +uint64_t ComputeSumSquareError(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + // SumSquareError returns values 0 to 65535 for each squared difference. + // Up to 65536 of those can be summed and remain within a uint32_t. + // After each block of 65536 pixels, accumulate into a uint64_t. + const int kBlockSize = 65536; + int remainder = count & (kBlockSize - 1) & ~31; + uint64_t sse = 0; + int i; + uint32_t (*SumSquareError)(const uint8_t* src_a, const uint8_t* src_b, + int count) = SumSquareError_C; +#if defined(HAS_SUMSQUAREERROR_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + SumSquareError = SumSquareError_NEON; + } +#endif +#if defined(HAS_SUMSQUAREERROR_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + // Note only used for multiples of 16 so count is not checked. + SumSquareError = SumSquareError_SSE2; + } +#endif +#if defined(HAS_SUMSQUAREERROR_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + // Note only used for multiples of 32 so count is not checked. + SumSquareError = SumSquareError_AVX2; + } +#endif +#if defined(HAS_SUMSQUAREERROR_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + SumSquareError = SumSquareError_MSA; + } +#endif +#ifdef _OPENMP +#pragma omp parallel for reduction(+ : sse) +#endif + for (i = 0; i < (count - (kBlockSize - 1)); i += kBlockSize) { + sse += SumSquareError(src_a + i, src_b + i, kBlockSize); + } + src_a += count & ~(kBlockSize - 1); + src_b += count & ~(kBlockSize - 1); + if (remainder) { + sse += SumSquareError(src_a, src_b, remainder); + src_a += remainder; + src_b += remainder; + } + remainder = count & 31; + if (remainder) { + sse += SumSquareError_C(src_a, src_b, remainder); + } + return sse; +} + +LIBYUV_API +uint64_t ComputeSumSquareErrorPlane(const uint8_t* src_a, + int stride_a, + const uint8_t* src_b, + int stride_b, + int width, + int height) { + uint64_t sse = 0; + int h; + // Coalesce rows. 
+ if (stride_a == width && stride_b == width) { + width *= height; + height = 1; + stride_a = stride_b = 0; + } + for (h = 0; h < height; ++h) { + sse += ComputeSumSquareError(src_a, src_b, width); + src_a += stride_a; + src_b += stride_b; + } + return sse; +} + +LIBYUV_API +double SumSquareErrorToPsnr(uint64_t sse, uint64_t count) { + double psnr; + if (sse > 0) { + double mse = (double)count / (double)sse; + psnr = 10.0 * log10(255.0 * 255.0 * mse); + } else { + psnr = kMaxPsnr; // Limit to prevent divide by 0 + } + + if (psnr > kMaxPsnr) { + psnr = kMaxPsnr; + } + + return psnr; +} + +LIBYUV_API +double CalcFramePsnr(const uint8_t* src_a, + int stride_a, + const uint8_t* src_b, + int stride_b, + int width, + int height) { + const uint64_t samples = (uint64_t)width * (uint64_t)height; + const uint64_t sse = ComputeSumSquareErrorPlane(src_a, stride_a, src_b, + stride_b, width, height); + return SumSquareErrorToPsnr(sse, samples); +} + +LIBYUV_API +double I420Psnr(const uint8_t* src_y_a, + int stride_y_a, + const uint8_t* src_u_a, + int stride_u_a, + const uint8_t* src_v_a, + int stride_v_a, + const uint8_t* src_y_b, + int stride_y_b, + const uint8_t* src_u_b, + int stride_u_b, + const uint8_t* src_v_b, + int stride_v_b, + int width, + int height) { + const uint64_t sse_y = ComputeSumSquareErrorPlane( + src_y_a, stride_y_a, src_y_b, stride_y_b, width, height); + const int width_uv = (width + 1) >> 1; + const int height_uv = (height + 1) >> 1; + const uint64_t sse_u = ComputeSumSquareErrorPlane( + src_u_a, stride_u_a, src_u_b, stride_u_b, width_uv, height_uv); + const uint64_t sse_v = ComputeSumSquareErrorPlane( + src_v_a, stride_v_a, src_v_b, stride_v_b, width_uv, height_uv); + const uint64_t samples = (uint64_t)width * (uint64_t)height + + 2 * ((uint64_t)width_uv * (uint64_t)height_uv); + const uint64_t sse = sse_y + sse_u + sse_v; + return SumSquareErrorToPsnr(sse, samples); +} + +static const int64_t cc1 = 26634; // (64^2*(.01*255)^2 +static const int64_t cc2 = 239708; // (64^2*(.03*255)^2 + +static double Ssim8x8_C(const uint8_t* src_a, + int stride_a, + const uint8_t* src_b, + int stride_b) { + int64_t sum_a = 0; + int64_t sum_b = 0; + int64_t sum_sq_a = 0; + int64_t sum_sq_b = 0; + int64_t sum_axb = 0; + + int i; + for (i = 0; i < 8; ++i) { + int j; + for (j = 0; j < 8; ++j) { + sum_a += src_a[j]; + sum_b += src_b[j]; + sum_sq_a += src_a[j] * src_a[j]; + sum_sq_b += src_b[j] * src_b[j]; + sum_axb += src_a[j] * src_b[j]; + } + + src_a += stride_a; + src_b += stride_b; + } + + { + const int64_t count = 64; + // scale the constants by number of pixels + const int64_t c1 = (cc1 * count * count) >> 12; + const int64_t c2 = (cc2 * count * count) >> 12; + + const int64_t sum_a_x_sum_b = sum_a * sum_b; + + const int64_t ssim_n = (2 * sum_a_x_sum_b + c1) * + (2 * count * sum_axb - 2 * sum_a_x_sum_b + c2); + + const int64_t sum_a_sq = sum_a * sum_a; + const int64_t sum_b_sq = sum_b * sum_b; + + const int64_t ssim_d = + (sum_a_sq + sum_b_sq + c1) * + (count * sum_sq_a - sum_a_sq + count * sum_sq_b - sum_b_sq + c2); + + if (ssim_d == 0.0) { + return DBL_MAX; + } + return ssim_n * 1.0 / ssim_d; + } +} + +// We are using a 8x8 moving window with starting location of each 8x8 window +// on the 4x4 pixel grid. Such arrangement allows the windows to overlap +// block boundaries to penalize blocking artifacts. 
+LIBYUV_API +double CalcFrameSsim(const uint8_t* src_a, + int stride_a, + const uint8_t* src_b, + int stride_b, + int width, + int height) { + int samples = 0; + double ssim_total = 0; + double (*Ssim8x8)(const uint8_t* src_a, int stride_a, const uint8_t* src_b, + int stride_b) = Ssim8x8_C; + + // sample point start with each 4x4 location + int i; + for (i = 0; i < height - 8; i += 4) { + int j; + for (j = 0; j < width - 8; j += 4) { + ssim_total += Ssim8x8(src_a + j, stride_a, src_b + j, stride_b); + samples++; + } + + src_a += stride_a * 4; + src_b += stride_b * 4; + } + + ssim_total /= samples; + return ssim_total; +} + +LIBYUV_API +double I420Ssim(const uint8_t* src_y_a, + int stride_y_a, + const uint8_t* src_u_a, + int stride_u_a, + const uint8_t* src_v_a, + int stride_v_a, + const uint8_t* src_y_b, + int stride_y_b, + const uint8_t* src_u_b, + int stride_u_b, + const uint8_t* src_v_b, + int stride_v_b, + int width, + int height) { + const double ssim_y = + CalcFrameSsim(src_y_a, stride_y_a, src_y_b, stride_y_b, width, height); + const int width_uv = (width + 1) >> 1; + const int height_uv = (height + 1) >> 1; + const double ssim_u = CalcFrameSsim(src_u_a, stride_u_a, src_u_b, stride_u_b, + width_uv, height_uv); + const double ssim_v = CalcFrameSsim(src_v_a, stride_v_a, src_v_b, stride_v_b, + width_uv, height_uv); + return ssim_y * 0.8 + 0.1 * (ssim_u + ssim_v); +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/media/libyuv/libyuv/source/compare_common.cc b/media/libyuv/libyuv/source/compare_common.cc new file mode 100644 index 0000000000..633466addb --- /dev/null +++ b/media/libyuv/libyuv/source/compare_common.cc @@ -0,0 +1,104 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/basic_types.h" + +#include "libyuv/compare_row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#if ORIGINAL_OPT +uint32_t HammingDistance_C1(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + uint32_t diff = 0u; + + int i; + for (i = 0; i < count; ++i) { + int x = src_a[i] ^ src_b[i]; + if (x & 1) + ++diff; + if (x & 2) + ++diff; + if (x & 4) + ++diff; + if (x & 8) + ++diff; + if (x & 16) + ++diff; + if (x & 32) + ++diff; + if (x & 64) + ++diff; + if (x & 128) + ++diff; + } + return diff; +} +#endif + +// Hakmem method for hamming distance. 
+uint32_t HammingDistance_C(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + uint32_t diff = 0u; + + int i; + for (i = 0; i < count - 3; i += 4) { + uint32_t x = *((uint32_t*)src_a) ^ *((uint32_t*)src_b); // NOLINT + uint32_t u = x - ((x >> 1) & 0x55555555); + u = ((u >> 2) & 0x33333333) + (u & 0x33333333); + diff += ((((u + (u >> 4)) & 0x0f0f0f0f) * 0x01010101) >> 24); + src_a += 4; + src_b += 4; + } + + for (; i < count; ++i) { + uint32_t x = *src_a ^ *src_b; + uint32_t u = x - ((x >> 1) & 0x55); + u = ((u >> 2) & 0x33) + (u & 0x33); + diff += (u + (u >> 4)) & 0x0f; + src_a += 1; + src_b += 1; + } + + return diff; +} + +uint32_t SumSquareError_C(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + uint32_t sse = 0u; + int i; + for (i = 0; i < count; ++i) { + int diff = src_a[i] - src_b[i]; + sse += (uint32_t)(diff * diff); + } + return sse; +} + +// hash seed of 5381 recommended. +// Internal C version of HashDjb2 with int sized count for efficiency. +uint32_t HashDjb2_C(const uint8_t* src, int count, uint32_t seed) { + uint32_t hash = seed; + int i; + for (i = 0; i < count; ++i) { + hash += (hash << 5) + src[i]; + } + return hash; +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/media/libyuv/libyuv/source/compare_gcc.cc b/media/libyuv/libyuv/source/compare_gcc.cc new file mode 100644 index 0000000000..676527c1b1 --- /dev/null +++ b/media/libyuv/libyuv/source/compare_gcc.cc @@ -0,0 +1,360 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/basic_types.h" + +#include "libyuv/compare_row.h" +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// This module is for GCC x86 and x64. +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) + +#if defined(__x86_64__) +uint32_t HammingDistance_SSE42(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + uint64_t diff = 0u; + + asm volatile( + "xor %3,%3 \n" + "xor %%r8,%%r8 \n" + "xor %%r9,%%r9 \n" + "xor %%r10,%%r10 \n" + + // Process 32 bytes per loop. + LABELALIGN + "1: \n" + "mov (%0),%%rcx \n" + "mov 0x8(%0),%%rdx \n" + "xor (%1),%%rcx \n" + "xor 0x8(%1),%%rdx \n" + "popcnt %%rcx,%%rcx \n" + "popcnt %%rdx,%%rdx \n" + "mov 0x10(%0),%%rsi \n" + "mov 0x18(%0),%%rdi \n" + "xor 0x10(%1),%%rsi \n" + "xor 0x18(%1),%%rdi \n" + "popcnt %%rsi,%%rsi \n" + "popcnt %%rdi,%%rdi \n" + "add $0x20,%0 \n" + "add $0x20,%1 \n" + "add %%rcx,%3 \n" + "add %%rdx,%%r8 \n" + "add %%rsi,%%r9 \n" + "add %%rdi,%%r10 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + + "add %%r8, %3 \n" + "add %%r9, %3 \n" + "add %%r10, %3 \n" + : "+r"(src_a), // %0 + "+r"(src_b), // %1 + "+r"(count), // %2 + "=r"(diff) // %3 + : + : "memory", "cc", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10"); + + return static_cast<uint32_t>(diff); +} +#else +uint32_t HammingDistance_SSE42(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + uint32_t diff = 0u; + + asm volatile( + // Process 16 bytes per loop. 
+ LABELALIGN + "1: \n" + "mov (%0),%%ecx \n" + "mov 0x4(%0),%%edx \n" + "xor (%1),%%ecx \n" + "xor 0x4(%1),%%edx \n" + "popcnt %%ecx,%%ecx \n" + "add %%ecx,%3 \n" + "popcnt %%edx,%%edx \n" + "add %%edx,%3 \n" + "mov 0x8(%0),%%ecx \n" + "mov 0xc(%0),%%edx \n" + "xor 0x8(%1),%%ecx \n" + "xor 0xc(%1),%%edx \n" + "popcnt %%ecx,%%ecx \n" + "add %%ecx,%3 \n" + "popcnt %%edx,%%edx \n" + "add %%edx,%3 \n" + "add $0x10,%0 \n" + "add $0x10,%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_a), // %0 + "+r"(src_b), // %1 + "+r"(count), // %2 + "+r"(diff) // %3 + : + : "memory", "cc", "ecx", "edx"); + + return diff; +} +#endif + +static const vec8 kNibbleMask = {15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15}; +static const vec8 kBitCount = {0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4}; + +uint32_t HammingDistance_SSSE3(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + uint32_t diff = 0u; + + asm volatile( + "movdqa %4,%%xmm2 \n" + "movdqa %5,%%xmm3 \n" + "pxor %%xmm0,%%xmm0 \n" + "pxor %%xmm1,%%xmm1 \n" + "sub %0,%1 \n" + + LABELALIGN + "1: \n" + "movdqa (%0),%%xmm4 \n" + "movdqa 0x10(%0), %%xmm5 \n" + "pxor (%0,%1), %%xmm4 \n" + "movdqa %%xmm4,%%xmm6 \n" + "pand %%xmm2,%%xmm6 \n" + "psrlw $0x4,%%xmm4 \n" + "movdqa %%xmm3,%%xmm7 \n" + "pshufb %%xmm6,%%xmm7 \n" + "pand %%xmm2,%%xmm4 \n" + "movdqa %%xmm3,%%xmm6 \n" + "pshufb %%xmm4,%%xmm6 \n" + "paddb %%xmm7,%%xmm6 \n" + "pxor 0x10(%0,%1),%%xmm5 \n" + "add $0x20,%0 \n" + "movdqa %%xmm5,%%xmm4 \n" + "pand %%xmm2,%%xmm5 \n" + "psrlw $0x4,%%xmm4 \n" + "movdqa %%xmm3,%%xmm7 \n" + "pshufb %%xmm5,%%xmm7 \n" + "pand %%xmm2,%%xmm4 \n" + "movdqa %%xmm3,%%xmm5 \n" + "pshufb %%xmm4,%%xmm5 \n" + "paddb %%xmm7,%%xmm5 \n" + "paddb %%xmm5,%%xmm6 \n" + "psadbw %%xmm1,%%xmm6 \n" + "paddd %%xmm6,%%xmm0 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + + "pshufd $0xaa,%%xmm0,%%xmm1 \n" + "paddd %%xmm1,%%xmm0 \n" + "movd %%xmm0, %3 \n" + : "+r"(src_a), // %0 + "+r"(src_b), // %1 + "+r"(count), // %2 + "=r"(diff) // %3 + : "m"(kNibbleMask), // %4 + "m"(kBitCount) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); + + return diff; +} + +#ifdef HAS_HAMMINGDISTANCE_AVX2 +uint32_t HammingDistance_AVX2(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + uint32_t diff = 0u; + + asm volatile( + "vbroadcastf128 %4,%%ymm2 \n" + "vbroadcastf128 %5,%%ymm3 \n" + "vpxor %%ymm0,%%ymm0,%%ymm0 \n" + "vpxor %%ymm1,%%ymm1,%%ymm1 \n" + "sub %0,%1 \n" + + LABELALIGN + "1: \n" + "vmovdqa (%0),%%ymm4 \n" + "vmovdqa 0x20(%0), %%ymm5 \n" + "vpxor (%0,%1), %%ymm4, %%ymm4 \n" + "vpand %%ymm2,%%ymm4,%%ymm6 \n" + "vpsrlw $0x4,%%ymm4,%%ymm4 \n" + "vpshufb %%ymm6,%%ymm3,%%ymm6 \n" + "vpand %%ymm2,%%ymm4,%%ymm4 \n" + "vpshufb %%ymm4,%%ymm3,%%ymm4 \n" + "vpaddb %%ymm4,%%ymm6,%%ymm6 \n" + "vpxor 0x20(%0,%1),%%ymm5,%%ymm4 \n" + "add $0x40,%0 \n" + "vpand %%ymm2,%%ymm4,%%ymm5 \n" + "vpsrlw $0x4,%%ymm4,%%ymm4 \n" + "vpshufb %%ymm5,%%ymm3,%%ymm5 \n" + "vpand %%ymm2,%%ymm4,%%ymm4 \n" + "vpshufb %%ymm4,%%ymm3,%%ymm4 \n" + "vpaddb %%ymm5,%%ymm4,%%ymm4 \n" + "vpaddb %%ymm6,%%ymm4,%%ymm4 \n" + "vpsadbw %%ymm1,%%ymm4,%%ymm4 \n" + "vpaddd %%ymm0,%%ymm4,%%ymm0 \n" + "sub $0x40,%2 \n" + "jg 1b \n" + + "vpermq $0xb1,%%ymm0,%%ymm1 \n" + "vpaddd %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xaa,%%ymm0,%%ymm1 \n" + "vpaddd %%ymm1,%%ymm0,%%ymm0 \n" + "vmovd %%xmm0, %3 \n" + "vzeroupper \n" + : "+r"(src_a), // %0 + "+r"(src_b), // %1 + "+r"(count), // %2 + "=r"(diff) // %3 + : "m"(kNibbleMask), // %4 + "m"(kBitCount) // %5 + : "memory", "cc", "xmm0", 
"xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); + + return diff; +} +#endif // HAS_HAMMINGDISTANCE_AVX2 + +uint32_t SumSquareError_SSE2(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + uint32_t sse; + asm volatile( + "pxor %%xmm0,%%xmm0 \n" + "pxor %%xmm5,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm1 \n" + "lea 0x10(%0),%0 \n" + "movdqu (%1),%%xmm2 \n" + "lea 0x10(%1),%1 \n" + "movdqa %%xmm1,%%xmm3 \n" + "psubusb %%xmm2,%%xmm1 \n" + "psubusb %%xmm3,%%xmm2 \n" + "por %%xmm2,%%xmm1 \n" + "movdqa %%xmm1,%%xmm2 \n" + "punpcklbw %%xmm5,%%xmm1 \n" + "punpckhbw %%xmm5,%%xmm2 \n" + "pmaddwd %%xmm1,%%xmm1 \n" + "pmaddwd %%xmm2,%%xmm2 \n" + "paddd %%xmm1,%%xmm0 \n" + "paddd %%xmm2,%%xmm0 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + + "pshufd $0xee,%%xmm0,%%xmm1 \n" + "paddd %%xmm1,%%xmm0 \n" + "pshufd $0x1,%%xmm0,%%xmm1 \n" + "paddd %%xmm1,%%xmm0 \n" + "movd %%xmm0,%3 \n" + + : "+r"(src_a), // %0 + "+r"(src_b), // %1 + "+r"(count), // %2 + "=g"(sse) // %3 + ::"memory", + "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); + return sse; +} + +static const uvec32 kHash16x33 = {0x92d9e201, 0, 0, 0}; // 33 ^ 16 +static const uvec32 kHashMul0 = { + 0x0c3525e1, // 33 ^ 15 + 0xa3476dc1, // 33 ^ 14 + 0x3b4039a1, // 33 ^ 13 + 0x4f5f0981, // 33 ^ 12 +}; +static const uvec32 kHashMul1 = { + 0x30f35d61, // 33 ^ 11 + 0x855cb541, // 33 ^ 10 + 0x040a9121, // 33 ^ 9 + 0x747c7101, // 33 ^ 8 +}; +static const uvec32 kHashMul2 = { + 0xec41d4e1, // 33 ^ 7 + 0x4cfa3cc1, // 33 ^ 6 + 0x025528a1, // 33 ^ 5 + 0x00121881, // 33 ^ 4 +}; +static const uvec32 kHashMul3 = { + 0x00008c61, // 33 ^ 3 + 0x00000441, // 33 ^ 2 + 0x00000021, // 33 ^ 1 + 0x00000001, // 33 ^ 0 +}; + +uint32_t HashDjb2_SSE41(const uint8_t* src, int count, uint32_t seed) { + uint32_t hash; + asm volatile( + "movd %2,%%xmm0 \n" + "pxor %%xmm7,%%xmm7 \n" + "movdqa %4,%%xmm6 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm1 \n" + "lea 0x10(%0),%0 \n" + "pmulld %%xmm6,%%xmm0 \n" + "movdqa %5,%%xmm5 \n" + "movdqa %%xmm1,%%xmm2 \n" + "punpcklbw %%xmm7,%%xmm2 \n" + "movdqa %%xmm2,%%xmm3 \n" + "punpcklwd %%xmm7,%%xmm3 \n" + "pmulld %%xmm5,%%xmm3 \n" + "movdqa %6,%%xmm5 \n" + "movdqa %%xmm2,%%xmm4 \n" + "punpckhwd %%xmm7,%%xmm4 \n" + "pmulld %%xmm5,%%xmm4 \n" + "movdqa %7,%%xmm5 \n" + "punpckhbw %%xmm7,%%xmm1 \n" + "movdqa %%xmm1,%%xmm2 \n" + "punpcklwd %%xmm7,%%xmm2 \n" + "pmulld %%xmm5,%%xmm2 \n" + "movdqa %8,%%xmm5 \n" + "punpckhwd %%xmm7,%%xmm1 \n" + "pmulld %%xmm5,%%xmm1 \n" + "paddd %%xmm4,%%xmm3 \n" + "paddd %%xmm2,%%xmm1 \n" + "paddd %%xmm3,%%xmm1 \n" + "pshufd $0xe,%%xmm1,%%xmm2 \n" + "paddd %%xmm2,%%xmm1 \n" + "pshufd $0x1,%%xmm1,%%xmm2 \n" + "paddd %%xmm2,%%xmm1 \n" + "paddd %%xmm1,%%xmm0 \n" + "sub $0x10,%1 \n" + "jg 1b \n" + "movd %%xmm0,%3 \n" + : "+r"(src), // %0 + "+r"(count), // %1 + "+rm"(seed), // %2 + "=g"(hash) // %3 + : "m"(kHash16x33), // %4 + "m"(kHashMul0), // %5 + "m"(kHashMul1), // %6 + "m"(kHashMul2), // %7 + "m"(kHashMul3) // %8 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); + return hash; +} +#endif // defined(__x86_64__) || (defined(__i386__) && !defined(__pic__))) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/media/libyuv/libyuv/source/compare_msa.cc b/media/libyuv/libyuv/source/compare_msa.cc new file mode 100644 index 0000000000..e944235f02 --- /dev/null +++ b/media/libyuv/libyuv/source/compare_msa.cc @@ -0,0 +1,97 @@ +/*
+ * Copyright 2017 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/basic_types.h"
+
+#include "libyuv/compare_row.h"
+#include "libyuv/row.h"
+
+// This module is for GCC MSA
+#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
+#include "libyuv/macros_msa.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+uint32_t HammingDistance_MSA(const uint8_t* src_a,
+ const uint8_t* src_b,
+ int count) {
+ uint32_t diff = 0u;
+ int i;
+ v16u8 src0, src1, src2, src3;
+ v2i64 vec0 = {0}, vec1 = {0};
+
+ for (i = 0; i < count; i += 32) {
+ src0 = (v16u8)__msa_ld_b((v16i8*)src_a, 0);
+ src1 = (v16u8)__msa_ld_b((v16i8*)src_a, 16);
+ src2 = (v16u8)__msa_ld_b((v16i8*)src_b, 0);
+ src3 = (v16u8)__msa_ld_b((v16i8*)src_b, 16);
+ src0 ^= src2;
+ src1 ^= src3;
+ vec0 += __msa_pcnt_d((v2i64)src0);
+ vec1 += __msa_pcnt_d((v2i64)src1);
+ src_a += 32;
+ src_b += 32;
+ }
+
+ vec0 += vec1;
+ diff = (uint32_t)__msa_copy_u_w((v4i32)vec0, 0);
+ diff += (uint32_t)__msa_copy_u_w((v4i32)vec0, 2);
+ return diff;
+}
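The MSA kernel above, like the C, SSE42, SSSE3, AVX2 and NEON variants elsewhere in this patch, is only a row function: compare.cc picks one of them at run time and ComputeHammingDistance() walks the buffers in 32 KB blocks. A minimal caller sketch, assuming only the public declaration in libyuv/compare.h (the buffer contents and main() wrapper are illustrative):

#include <cstdint>
#include <cstdio>
#include "libyuv/compare.h"

int main() {
  uint8_t a[64];
  uint8_t b[64];
  for (int i = 0; i < 64; ++i) {
    a[i] = static_cast<uint8_t>(i);
    b[i] = static_cast<uint8_t>(i ^ 1);  // illustrative: each byte differs in exactly one bit
  }
  // Dispatches to HammingDistance_MSA/NEON/SSE42/AVX2 when available, else the C kernel.
  uint64_t bits = libyuv::ComputeHammingDistance(a, b, 64);
  printf("hamming distance = %llu\n", (unsigned long long)bits);  // 64 for this data
  return 0;
}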
+
+uint32_t SumSquareError_MSA(const uint8_t* src_a,
+ const uint8_t* src_b,
+ int count) {
+ uint32_t sse = 0u;
+ int i;
+ v16u8 src0, src1, src2, src3;
+ v8i16 vec0, vec1, vec2, vec3;
+ v4i32 reg0 = {0}, reg1 = {0}, reg2 = {0}, reg3 = {0};
+ v2i64 tmp0;
+
+ for (i = 0; i < count; i += 32) {
+ src0 = (v16u8)__msa_ld_b((v16i8*)src_a, 0);
+ src1 = (v16u8)__msa_ld_b((v16i8*)src_a, 16);
+ src2 = (v16u8)__msa_ld_b((v16i8*)src_b, 0);
+ src3 = (v16u8)__msa_ld_b((v16i8*)src_b, 16);
+ vec0 = (v8i16)__msa_ilvr_b((v16i8)src2, (v16i8)src0);
+ vec1 = (v8i16)__msa_ilvl_b((v16i8)src2, (v16i8)src0);
+ vec2 = (v8i16)__msa_ilvr_b((v16i8)src3, (v16i8)src1);
+ vec3 = (v8i16)__msa_ilvl_b((v16i8)src3, (v16i8)src1);
+ vec0 = __msa_hsub_u_h((v16u8)vec0, (v16u8)vec0);
+ vec1 = __msa_hsub_u_h((v16u8)vec1, (v16u8)vec1);
+ vec2 = __msa_hsub_u_h((v16u8)vec2, (v16u8)vec2);
+ vec3 = __msa_hsub_u_h((v16u8)vec3, (v16u8)vec3);
+ reg0 = __msa_dpadd_s_w(reg0, vec0, vec0);
+ reg1 = __msa_dpadd_s_w(reg1, vec1, vec1);
+ reg2 = __msa_dpadd_s_w(reg2, vec2, vec2);
+ reg3 = __msa_dpadd_s_w(reg3, vec3, vec3);
+ src_a += 32;
+ src_b += 32;
+ }
+
+ reg0 += reg1;
+ reg2 += reg3;
+ reg0 += reg2;
+ tmp0 = __msa_hadd_s_d(reg0, reg0);
+ sse = (uint32_t)__msa_copy_u_w((v4i32)tmp0, 0);
+ sse += (uint32_t)__msa_copy_u_w((v4i32)tmp0, 2);
+ return sse;
+}
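SumSquareError_MSA feeds the same dispatch pattern: ComputeSumSquareError() in compare.cc selects a row kernel, ComputeSumSquareErrorPlane() applies it row by row, and SumSquareErrorToPsnr() converts the total with psnr = 10 * log10(255^2 * count / sse), clamped to kMaxPsnr when sse is 0. A small sketch of that call chain, assuming the declarations in libyuv/compare.h; the plane size and pixel values are illustrative:

#include <cstdint>
#include <cstdio>
#include <vector>
#include "libyuv/compare.h"

int main() {
  const int width = 64, height = 32, stride = 64;
  std::vector<uint8_t> a(static_cast<size_t>(stride) * height, 100);
  std::vector<uint8_t> b(static_cast<size_t>(stride) * height, 102);  // every sample off by 2

  uint64_t sse = libyuv::ComputeSumSquareErrorPlane(a.data(), stride, b.data(),
                                                    stride, width, height);
  // With every sample differing by 2, sse == 4 * width * height.
  double psnr = libyuv::SumSquareErrorToPsnr(
      sse, static_cast<uint64_t>(width) * height);
  printf("sse=%llu psnr=%.2f dB\n", (unsigned long long)sse, psnr);
  return 0;
}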
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
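Elsewhere in this patch (compare_gcc.cc and compare_win.cc), HashDjb2_SSE41 and HashDjb2_AVX2 consume 16 bytes per iteration using the precomputed powers of 33 in kHash16x33 and kHashMul0..kHashMul3. That block step is algebraically the same as sixteen rounds of the scalar recurrence hash = hash * 33 + byte used by HashDjb2_C, and uint32_t wraparound keeps the two forms consistent. A short self-check sketch of the equivalence (the byte pattern is illustrative):

#include <cstdint>
#include <cstdio>

int main() {
  uint8_t src[16];
  for (int i = 0; i < 16; ++i) src[i] = static_cast<uint8_t>(3 * i + 7);

  // Scalar reference: hash = hash * 33 + byte, seed 5381.
  uint32_t scalar = 5381u;
  for (int i = 0; i < 16; ++i) scalar = scalar * 33u + src[i];

  // Block form used by the SIMD kernels:
  //   hash = hash * 33^16 + sum(src[i] * 33^(15 - i))
  // where 33^16 is kHash16x33 and the other powers are kHashMul0..3.
  uint32_t pow33[17];
  pow33[0] = 1u;
  for (int i = 1; i <= 16; ++i) pow33[i] = pow33[i - 1] * 33u;
  uint32_t block = 5381u * pow33[16];
  for (int i = 0; i < 16; ++i) block += src[i] * pow33[15 - i];

  printf("scalar=%08x block=%08x\n", scalar, block);  // identical values
  return 0;
}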
diff --git a/media/libyuv/libyuv/source/compare_neon.cc b/media/libyuv/libyuv/source/compare_neon.cc new file mode 100644 index 0000000000..2a2181e0cb --- /dev/null +++ b/media/libyuv/libyuv/source/compare_neon.cc @@ -0,0 +1,96 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/basic_types.h" + +#include "libyuv/compare_row.h" +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \ + !defined(__aarch64__) + +// 256 bits at a time +// uses short accumulator which restricts count to 131 KB +uint32_t HammingDistance_NEON(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + uint32_t diff; + + asm volatile( + "vmov.u16 q4, #0 \n" // accumulator + + "1: \n" + "vld1.8 {q0, q1}, [%0]! \n" + "vld1.8 {q2, q3}, [%1]! \n" + "veor.32 q0, q0, q2 \n" + "veor.32 q1, q1, q3 \n" + "vcnt.i8 q0, q0 \n" + "vcnt.i8 q1, q1 \n" + "subs %2, %2, #32 \n" + "vadd.u8 q0, q0, q1 \n" // 16 byte counts + "vpadal.u8 q4, q0 \n" // 8 shorts + "bgt 1b \n" + + "vpaddl.u16 q0, q4 \n" // 4 ints + "vpadd.u32 d0, d0, d1 \n" + "vpadd.u32 d0, d0, d0 \n" + "vmov.32 %3, d0[0] \n" + + : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(diff) + : + : "cc", "q0", "q1", "q2", "q3", "q4"); + return diff; +} + +uint32_t SumSquareError_NEON(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + uint32_t sse; + asm volatile( + "vmov.u8 q8, #0 \n" + "vmov.u8 q10, #0 \n" + "vmov.u8 q9, #0 \n" + "vmov.u8 q11, #0 \n" + + "1: \n" + "vld1.8 {q0}, [%0]! \n" + "vld1.8 {q1}, [%1]! \n" + "subs %2, %2, #16 \n" + "vsubl.u8 q2, d0, d2 \n" + "vsubl.u8 q3, d1, d3 \n" + "vmlal.s16 q8, d4, d4 \n" + "vmlal.s16 q9, d6, d6 \n" + "vmlal.s16 q10, d5, d5 \n" + "vmlal.s16 q11, d7, d7 \n" + "bgt 1b \n" + + "vadd.u32 q8, q8, q9 \n" + "vadd.u32 q10, q10, q11 \n" + "vadd.u32 q11, q8, q10 \n" + "vpaddl.u32 q1, q11 \n" + "vadd.u64 d0, d2, d3 \n" + "vmov.32 %3, d0[0] \n" + : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse) + : + : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"); + return sse; +} + +#endif // defined(__ARM_NEON__) && !defined(__aarch64__) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/media/libyuv/libyuv/source/compare_neon64.cc b/media/libyuv/libyuv/source/compare_neon64.cc new file mode 100644 index 0000000000..6e8f672ab7 --- /dev/null +++ b/media/libyuv/libyuv/source/compare_neon64.cc @@ -0,0 +1,90 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "libyuv/basic_types.h" + +#include "libyuv/compare_row.h" +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) + +// 256 bits at a time +// uses short accumulator which restricts count to 131 KB +uint32_t HammingDistance_NEON(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + uint32_t diff; + asm volatile( + "movi v4.8h, #0 \n" + + "1: \n" + "ld1 {v0.16b, v1.16b}, [%0], #32 \n" + "ld1 {v2.16b, v3.16b}, [%1], #32 \n" + "eor v0.16b, v0.16b, v2.16b \n" + "eor v1.16b, v1.16b, v3.16b \n" + "cnt v0.16b, v0.16b \n" + "cnt v1.16b, v1.16b \n" + "subs %w2, %w2, #32 \n" + "add v0.16b, v0.16b, v1.16b \n" + "uadalp v4.8h, v0.16b \n" + "b.gt 1b \n" + + "uaddlv s4, v4.8h \n" + "fmov %w3, s4 \n" + : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(diff) + : + : "cc", "v0", "v1", "v2", "v3", "v4"); + return diff; +} + +uint32_t SumSquareError_NEON(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + uint32_t sse; + asm volatile( + "eor v16.16b, v16.16b, v16.16b \n" + "eor v18.16b, v18.16b, v18.16b \n" + "eor v17.16b, v17.16b, v17.16b \n" + "eor v19.16b, v19.16b, v19.16b \n" + + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" + "ld1 {v1.16b}, [%1], #16 \n" + "subs %w2, %w2, #16 \n" + "usubl v2.8h, v0.8b, v1.8b \n" + "usubl2 v3.8h, v0.16b, v1.16b \n" + "smlal v16.4s, v2.4h, v2.4h \n" + "smlal v17.4s, v3.4h, v3.4h \n" + "smlal2 v18.4s, v2.8h, v2.8h \n" + "smlal2 v19.4s, v3.8h, v3.8h \n" + "b.gt 1b \n" + + "add v16.4s, v16.4s, v17.4s \n" + "add v18.4s, v18.4s, v19.4s \n" + "add v19.4s, v16.4s, v18.4s \n" + "addv s0, v19.4s \n" + "fmov %w3, s0 \n" + : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse) + : + : "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19"); + return sse; +} + +#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/media/libyuv/libyuv/source/compare_win.cc b/media/libyuv/libyuv/source/compare_win.cc new file mode 100644 index 0000000000..d57d3d9d1c --- /dev/null +++ b/media/libyuv/libyuv/source/compare_win.cc @@ -0,0 +1,241 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "libyuv/basic_types.h" + +#include "libyuv/compare_row.h" +#include "libyuv/row.h" + +#if defined(_MSC_VER) +#include <intrin.h> // For __popcnt +#endif + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// This module is for 32 bit Visual C x86 and clangcl +#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) + +uint32_t HammingDistance_SSE42(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + uint32_t diff = 0u; + + int i; + for (i = 0; i < count - 3; i += 4) { + uint32_t x = *((uint32_t*)src_a) ^ *((uint32_t*)src_b); // NOLINT + src_a += 4; + src_b += 4; + diff += __popcnt(x); + } + return diff; +} + +__declspec(naked) uint32_t + SumSquareError_SSE2(const uint8_t* src_a, const uint8_t* src_b, int count) { + __asm { + mov eax, [esp + 4] // src_a + mov edx, [esp + 8] // src_b + mov ecx, [esp + 12] // count + pxor xmm0, xmm0 + pxor xmm5, xmm5 + + wloop: + movdqu xmm1, [eax] + lea eax, [eax + 16] + movdqu xmm2, [edx] + lea edx, [edx + 16] + movdqa xmm3, xmm1 // abs trick + psubusb xmm1, xmm2 + psubusb xmm2, xmm3 + por xmm1, xmm2 + movdqa xmm2, xmm1 + punpcklbw xmm1, xmm5 + punpckhbw xmm2, xmm5 + pmaddwd xmm1, xmm1 + pmaddwd xmm2, xmm2 + paddd xmm0, xmm1 + paddd xmm0, xmm2 + sub ecx, 16 + jg wloop + + pshufd xmm1, xmm0, 0xee + paddd xmm0, xmm1 + pshufd xmm1, xmm0, 0x01 + paddd xmm0, xmm1 + movd eax, xmm0 + ret + } +} + +// Visual C 2012 required for AVX2. +#if _MSC_VER >= 1700 +// C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX. +#pragma warning(disable : 4752) +__declspec(naked) uint32_t + SumSquareError_AVX2(const uint8_t* src_a, const uint8_t* src_b, int count) { + __asm { + mov eax, [esp + 4] // src_a + mov edx, [esp + 8] // src_b + mov ecx, [esp + 12] // count + vpxor ymm0, ymm0, ymm0 // sum + vpxor ymm5, ymm5, ymm5 // constant 0 for unpck + sub edx, eax + + wloop: + vmovdqu ymm1, [eax] + vmovdqu ymm2, [eax + edx] + lea eax, [eax + 32] + vpsubusb ymm3, ymm1, ymm2 // abs difference trick + vpsubusb ymm2, ymm2, ymm1 + vpor ymm1, ymm2, ymm3 + vpunpcklbw ymm2, ymm1, ymm5 // u16. mutates order. + vpunpckhbw ymm1, ymm1, ymm5 + vpmaddwd ymm2, ymm2, ymm2 // square + hadd to u32. + vpmaddwd ymm1, ymm1, ymm1 + vpaddd ymm0, ymm0, ymm1 + vpaddd ymm0, ymm0, ymm2 + sub ecx, 32 + jg wloop + + vpshufd ymm1, ymm0, 0xee // 3, 2 + 1, 0 both lanes. + vpaddd ymm0, ymm0, ymm1 + vpshufd ymm1, ymm0, 0x01 // 1 + 0 both lanes. + vpaddd ymm0, ymm0, ymm1 + vpermq ymm1, ymm0, 0x02 // high + low lane. 
+ vpaddd ymm0, ymm0, ymm1 + vmovd eax, xmm0 + vzeroupper + ret + } +} +#endif // _MSC_VER >= 1700 + +uvec32 kHash16x33 = {0x92d9e201, 0, 0, 0}; // 33 ^ 16 +uvec32 kHashMul0 = { + 0x0c3525e1, // 33 ^ 15 + 0xa3476dc1, // 33 ^ 14 + 0x3b4039a1, // 33 ^ 13 + 0x4f5f0981, // 33 ^ 12 +}; +uvec32 kHashMul1 = { + 0x30f35d61, // 33 ^ 11 + 0x855cb541, // 33 ^ 10 + 0x040a9121, // 33 ^ 9 + 0x747c7101, // 33 ^ 8 +}; +uvec32 kHashMul2 = { + 0xec41d4e1, // 33 ^ 7 + 0x4cfa3cc1, // 33 ^ 6 + 0x025528a1, // 33 ^ 5 + 0x00121881, // 33 ^ 4 +}; +uvec32 kHashMul3 = { + 0x00008c61, // 33 ^ 3 + 0x00000441, // 33 ^ 2 + 0x00000021, // 33 ^ 1 + 0x00000001, // 33 ^ 0 +}; + +__declspec(naked) uint32_t + HashDjb2_SSE41(const uint8_t* src, int count, uint32_t seed) { + __asm { + mov eax, [esp + 4] // src + mov ecx, [esp + 8] // count + movd xmm0, [esp + 12] // seed + + pxor xmm7, xmm7 // constant 0 for unpck + movdqa xmm6, xmmword ptr kHash16x33 + + wloop: + movdqu xmm1, [eax] // src[0-15] + lea eax, [eax + 16] + pmulld xmm0, xmm6 // hash *= 33 ^ 16 + movdqa xmm5, xmmword ptr kHashMul0 + movdqa xmm2, xmm1 + punpcklbw xmm2, xmm7 // src[0-7] + movdqa xmm3, xmm2 + punpcklwd xmm3, xmm7 // src[0-3] + pmulld xmm3, xmm5 + movdqa xmm5, xmmword ptr kHashMul1 + movdqa xmm4, xmm2 + punpckhwd xmm4, xmm7 // src[4-7] + pmulld xmm4, xmm5 + movdqa xmm5, xmmword ptr kHashMul2 + punpckhbw xmm1, xmm7 // src[8-15] + movdqa xmm2, xmm1 + punpcklwd xmm2, xmm7 // src[8-11] + pmulld xmm2, xmm5 + movdqa xmm5, xmmword ptr kHashMul3 + punpckhwd xmm1, xmm7 // src[12-15] + pmulld xmm1, xmm5 + paddd xmm3, xmm4 // add 16 results + paddd xmm1, xmm2 + paddd xmm1, xmm3 + + pshufd xmm2, xmm1, 0x0e // upper 2 dwords + paddd xmm1, xmm2 + pshufd xmm2, xmm1, 0x01 + paddd xmm1, xmm2 + paddd xmm0, xmm1 + sub ecx, 16 + jg wloop + + movd eax, xmm0 // return hash + ret + } +} + +// Visual C 2012 required for AVX2. +#if _MSC_VER >= 1700 +__declspec(naked) uint32_t + HashDjb2_AVX2(const uint8_t* src, int count, uint32_t seed) { + __asm { + mov eax, [esp + 4] // src + mov ecx, [esp + 8] // count + vmovd xmm0, [esp + 12] // seed + + wloop: + vpmovzxbd xmm3, [eax] // src[0-3] + vpmulld xmm0, xmm0, xmmword ptr kHash16x33 // hash *= 33 ^ 16 + vpmovzxbd xmm4, [eax + 4] // src[4-7] + vpmulld xmm3, xmm3, xmmword ptr kHashMul0 + vpmovzxbd xmm2, [eax + 8] // src[8-11] + vpmulld xmm4, xmm4, xmmword ptr kHashMul1 + vpmovzxbd xmm1, [eax + 12] // src[12-15] + vpmulld xmm2, xmm2, xmmword ptr kHashMul2 + lea eax, [eax + 16] + vpmulld xmm1, xmm1, xmmword ptr kHashMul3 + vpaddd xmm3, xmm3, xmm4 // add 16 results + vpaddd xmm1, xmm1, xmm2 + vpaddd xmm1, xmm1, xmm3 + vpshufd xmm2, xmm1, 0x0e // upper 2 dwords + vpaddd xmm1, xmm1,xmm2 + vpshufd xmm2, xmm1, 0x01 + vpaddd xmm1, xmm1, xmm2 + vpaddd xmm0, xmm0, xmm1 + sub ecx, 16 + jg wloop + + vmovd eax, xmm0 // return hash + vzeroupper + ret + } +} +#endif // _MSC_VER >= 1700 + +#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/media/libyuv/libyuv/source/convert.cc b/media/libyuv/libyuv/source/convert.cc new file mode 100644 index 0000000000..375cc732c1 --- /dev/null +++ b/media/libyuv/libyuv/source/convert.cc @@ -0,0 +1,1740 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. 
All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/convert.h" + +#include "libyuv/basic_types.h" +#include "libyuv/cpu_id.h" +#include "libyuv/planar_functions.h" +#include "libyuv/rotate.h" +#include "libyuv/row.h" +#include "libyuv/scale.h" // For ScalePlane() + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#define SUBSAMPLE(v, a, s) (v < 0) ? (-((-v + a) >> s)) : ((v + a) >> s) +static __inline int Abs(int v) { + return v >= 0 ? v : -v; +} + +// Any I4xx To I420 format with mirroring. +static int I4xxToI420(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int src_y_width, + int src_y_height, + int src_uv_width, + int src_uv_height) { + const int dst_y_width = Abs(src_y_width); + const int dst_y_height = Abs(src_y_height); + const int dst_uv_width = SUBSAMPLE(dst_y_width, 1, 1); + const int dst_uv_height = SUBSAMPLE(dst_y_height, 1, 1); + if (src_uv_width == 0 || src_uv_height == 0) { + return -1; + } + if (dst_y) { + ScalePlane(src_y, src_stride_y, src_y_width, src_y_height, dst_y, + dst_stride_y, dst_y_width, dst_y_height, kFilterBilinear); + } + ScalePlane(src_u, src_stride_u, src_uv_width, src_uv_height, dst_u, + dst_stride_u, dst_uv_width, dst_uv_height, kFilterBilinear); + ScalePlane(src_v, src_stride_v, src_uv_width, src_uv_height, dst_v, + dst_stride_v, dst_uv_width, dst_uv_height, kFilterBilinear); + return 0; +} + +// Copy I420 with optional flipping. +// TODO(fbarchard): Use Scale plane which supports mirroring, but ensure +// is does row coalescing. +LIBYUV_API +int I420Copy(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + halfheight = (height + 1) >> 1; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (halfheight - 1) * src_stride_u; + src_v = src_v + (halfheight - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + + if (dst_y) { + CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + } + // Copy UV planes. + CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight); + CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight); + return 0; +} + +// Copy I010 with optional flipping. +LIBYUV_API +int I010Copy(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int width, + int height) { + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + halfheight = (height + 1) >> 1; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (halfheight - 1) * src_stride_u; + src_v = src_v + (halfheight - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + + if (dst_y) { + CopyPlane_16(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + } + // Copy UV planes. + CopyPlane_16(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight); + CopyPlane_16(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight); + return 0; +} + +// Convert 10 bit YUV to 8 bit. +LIBYUV_API +int I010ToI420(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + halfheight = (height + 1) >> 1; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (halfheight - 1) * src_stride_u; + src_v = src_v + (halfheight - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + + // Convert Y plane. + Convert16To8Plane(src_y, src_stride_y, dst_y, dst_stride_y, 16384, width, + height); + // Convert UV planes. + Convert16To8Plane(src_u, src_stride_u, dst_u, dst_stride_u, 16384, halfwidth, + halfheight); + Convert16To8Plane(src_v, src_stride_v, dst_v, dst_stride_v, 16384, halfwidth, + halfheight); + return 0; +} + +// 422 chroma is 1/2 width, 1x height +// 420 chroma is 1/2 width, 1/2 height +LIBYUV_API +int I422ToI420(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + const int src_uv_width = SUBSAMPLE(width, 1, 1); + return I4xxToI420(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u, + dst_v, dst_stride_v, width, height, src_uv_width, height); +} + +// 444 chroma is 1x width, 1x height +// 420 chroma is 1/2 width, 1/2 height +LIBYUV_API +int I444ToI420(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + return I4xxToI420(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u, + dst_v, dst_stride_v, width, height, width, height); +} + +// I400 is greyscale typically used in MJPG +LIBYUV_API +int I400ToI420(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + if (!dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + halfheight = (height + 1) >> 1; + src_y = src_y + (height - 1) * src_stride_y; + src_stride_y = -src_stride_y; + } + if (dst_y) { + CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + } + SetPlane(dst_u, dst_stride_u, halfwidth, halfheight, 128); + SetPlane(dst_v, dst_stride_v, halfwidth, halfheight, 128); + return 0; +} + +static void CopyPlane2(const uint8_t* src, + int src_stride_0, + int src_stride_1, + uint8_t* dst, + int dst_stride, + int width, + int height) { + int y; + void (*CopyRow)(const uint8_t* src, uint8_t* dst, int width) = CopyRow_C; +#if defined(HAS_COPYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2; + } +#endif +#if defined(HAS_COPYROW_AVX) + if (TestCpuFlag(kCpuHasAVX)) { + CopyRow = IS_ALIGNED(width, 64) ? CopyRow_AVX : CopyRow_Any_AVX; + } +#endif +#if defined(HAS_COPYROW_ERMS) + if (TestCpuFlag(kCpuHasERMS)) { + CopyRow = CopyRow_ERMS; + } +#endif +#if defined(HAS_COPYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON; + } +#endif + + // Copy plane + for (y = 0; y < height - 1; y += 2) { + CopyRow(src, dst, width); + CopyRow(src + src_stride_0, dst + dst_stride, width); + src += src_stride_0 + src_stride_1; + dst += dst_stride * 2; + } + if (height & 1) { + CopyRow(src, dst, width); + } +} + +// Support converting from FOURCC_M420 +// Useful for bandwidth constrained transports like USB 1.0 and 2.0 and for +// easy conversion to I420. +// M420 format description: +// M420 is row biplanar 420: 2 rows of Y and 1 row of UV. +// Chroma is half width / half height. (420) +// src_stride_m420 is row planar. Normally this will be the width in pixels. +// The UV plane is half width, but 2 values, so src_stride_m420 applies to +// this as well as the two Y planes. +static int X420ToI420(const uint8_t* src_y, + int src_stride_y0, + int src_stride_y1, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + if (!src_uv || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + halfheight = (height + 1) >> 1; + if (dst_y) { + dst_y = dst_y + (height - 1) * dst_stride_y; + } + dst_u = dst_u + (halfheight - 1) * dst_stride_u; + dst_v = dst_v + (halfheight - 1) * dst_stride_v; + dst_stride_y = -dst_stride_y; + dst_stride_u = -dst_stride_u; + dst_stride_v = -dst_stride_v; + } + // Coalesce rows. + if (src_stride_y0 == width && src_stride_y1 == width && + dst_stride_y == width) { + width *= height; + height = 1; + src_stride_y0 = src_stride_y1 = dst_stride_y = 0; + } + // Coalesce rows. + if (src_stride_uv == halfwidth * 2 && dst_stride_u == halfwidth && + dst_stride_v == halfwidth) { + halfwidth *= halfheight; + halfheight = 1; + src_stride_uv = dst_stride_u = dst_stride_v = 0; + } + + if (dst_y) { + if (src_stride_y0 == src_stride_y1) { + CopyPlane(src_y, src_stride_y0, dst_y, dst_stride_y, width, height); + } else { + CopyPlane2(src_y, src_stride_y0, src_stride_y1, dst_y, dst_stride_y, + width, height); + } + } + + // Split UV plane - NV12 / NV21 + SplitUVPlane(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v, dst_stride_v, + halfwidth, halfheight); + + return 0; +} + +// Convert NV12 to I420. 
+LIBYUV_API +int NV12ToI420(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + return X420ToI420(src_y, src_stride_y, src_stride_y, src_uv, src_stride_uv, + dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v, + dst_stride_v, width, height); +} + +// Convert NV21 to I420. Same as NV12 but u and v pointers swapped. +LIBYUV_API +int NV21ToI420(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_vu, + int src_stride_vu, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + return X420ToI420(src_y, src_stride_y, src_stride_y, src_vu, src_stride_vu, + dst_y, dst_stride_y, dst_v, dst_stride_v, dst_u, + dst_stride_u, width, height); +} + +// Convert M420 to I420. +LIBYUV_API +int M420ToI420(const uint8_t* src_m420, + int src_stride_m420, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + return X420ToI420(src_m420, src_stride_m420, src_stride_m420 * 2, + src_m420 + src_stride_m420 * 2, src_stride_m420 * 3, dst_y, + dst_stride_y, dst_u, dst_stride_u, dst_v, dst_stride_v, + width, height); +} + +// Convert YUY2 to I420. +LIBYUV_API +int YUY2ToI420(const uint8_t* src_yuy2, + int src_stride_yuy2, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + int y; + void (*YUY2ToUVRow)(const uint8_t* src_yuy2, int src_stride_yuy2, + uint8_t* dst_u, uint8_t* dst_v, int width) = + YUY2ToUVRow_C; + void (*YUY2ToYRow)(const uint8_t* src_yuy2, uint8_t* dst_y, int width) = + YUY2ToYRow_C; + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2; + src_stride_yuy2 = -src_stride_yuy2; + } +#if defined(HAS_YUY2TOYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + YUY2ToUVRow = YUY2ToUVRow_Any_SSE2; + YUY2ToYRow = YUY2ToYRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + YUY2ToUVRow = YUY2ToUVRow_SSE2; + YUY2ToYRow = YUY2ToYRow_SSE2; + } + } +#endif +#if defined(HAS_YUY2TOYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + YUY2ToUVRow = YUY2ToUVRow_Any_AVX2; + YUY2ToYRow = YUY2ToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + YUY2ToUVRow = YUY2ToUVRow_AVX2; + YUY2ToYRow = YUY2ToYRow_AVX2; + } + } +#endif +#if defined(HAS_YUY2TOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + YUY2ToYRow = YUY2ToYRow_Any_NEON; + YUY2ToUVRow = YUY2ToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + YUY2ToYRow = YUY2ToYRow_NEON; + YUY2ToUVRow = YUY2ToUVRow_NEON; + } + } +#endif +#if defined(HAS_YUY2TOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + YUY2ToYRow = YUY2ToYRow_Any_MSA; + YUY2ToUVRow = YUY2ToUVRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + YUY2ToYRow = YUY2ToYRow_MSA; + YUY2ToUVRow = YUY2ToUVRow_MSA; + } + } +#endif + + for (y = 0; y < height - 1; y += 2) { + YUY2ToUVRow(src_yuy2, src_stride_yuy2, dst_u, dst_v, width); + YUY2ToYRow(src_yuy2, dst_y, width); + YUY2ToYRow(src_yuy2 + src_stride_yuy2, dst_y + dst_stride_y, width); + src_yuy2 += src_stride_yuy2 * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { + YUY2ToUVRow(src_yuy2, 0, dst_u, dst_v, width); + YUY2ToYRow(src_yuy2, dst_y, width); + } + return 0; +} + +// Convert UYVY to I420. 
+LIBYUV_API +int UYVYToI420(const uint8_t* src_uyvy, + int src_stride_uyvy, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + int y; + void (*UYVYToUVRow)(const uint8_t* src_uyvy, int src_stride_uyvy, + uint8_t* dst_u, uint8_t* dst_v, int width) = + UYVYToUVRow_C; + void (*UYVYToYRow)(const uint8_t* src_uyvy, uint8_t* dst_y, int width) = + UYVYToYRow_C; + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy; + src_stride_uyvy = -src_stride_uyvy; + } +#if defined(HAS_UYVYTOYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + UYVYToUVRow = UYVYToUVRow_Any_SSE2; + UYVYToYRow = UYVYToYRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + UYVYToUVRow = UYVYToUVRow_SSE2; + UYVYToYRow = UYVYToYRow_SSE2; + } + } +#endif +#if defined(HAS_UYVYTOYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + UYVYToUVRow = UYVYToUVRow_Any_AVX2; + UYVYToYRow = UYVYToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + UYVYToUVRow = UYVYToUVRow_AVX2; + UYVYToYRow = UYVYToYRow_AVX2; + } + } +#endif +#if defined(HAS_UYVYTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + UYVYToYRow = UYVYToYRow_Any_NEON; + UYVYToUVRow = UYVYToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + UYVYToYRow = UYVYToYRow_NEON; + UYVYToUVRow = UYVYToUVRow_NEON; + } + } +#endif +#if defined(HAS_UYVYTOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + UYVYToYRow = UYVYToYRow_Any_MSA; + UYVYToUVRow = UYVYToUVRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + UYVYToYRow = UYVYToYRow_MSA; + UYVYToUVRow = UYVYToUVRow_MSA; + } + } +#endif + + for (y = 0; y < height - 1; y += 2) { + UYVYToUVRow(src_uyvy, src_stride_uyvy, dst_u, dst_v, width); + UYVYToYRow(src_uyvy, dst_y, width); + UYVYToYRow(src_uyvy + src_stride_uyvy, dst_y + dst_stride_y, width); + src_uyvy += src_stride_uyvy * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { + UYVYToUVRow(src_uyvy, 0, dst_u, dst_v, width); + UYVYToYRow(src_uyvy, dst_y, width); + } + return 0; +} + +// Convert ARGB to I420. +LIBYUV_API +int ARGBToI420(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + int y; + void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = + ARGBToYRow_C; + if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } +#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_SSSE3; + ARGBToYRow = ARGBToYRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToUVRow = ARGBToUVRow_Any_AVX2; + ARGBToYRow = ARGBToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_AVX2; + ARGBToYRow = ARGBToYRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToYRow = ARGBToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToYRow = ARGBToYRow_NEON; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToUVRow = ARGBToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_NEON; + } + } +#endif +#if defined(HAS_ARGBTOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToYRow = ARGBToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_MSA; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToUVRow = ARGBToUVRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_MSA; + } + } +#endif + + for (y = 0; y < height - 1; y += 2) { + ARGBToUVRow(src_argb, src_stride_argb, dst_u, dst_v, width); + ARGBToYRow(src_argb, dst_y, width); + ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width); + src_argb += src_stride_argb * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { + ARGBToUVRow(src_argb, 0, dst_u, dst_v, width); + ARGBToYRow(src_argb, dst_y, width); + } + return 0; +} + +// Convert BGRA to I420. +LIBYUV_API +int BGRAToI420(const uint8_t* src_bgra, + int src_stride_bgra, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + int y; + void (*BGRAToUVRow)(const uint8_t* src_bgra0, int src_stride_bgra, + uint8_t* dst_u, uint8_t* dst_v, int width) = + BGRAToUVRow_C; + void (*BGRAToYRow)(const uint8_t* src_bgra, uint8_t* dst_y, int width) = + BGRAToYRow_C; + if (!src_bgra || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + src_bgra = src_bgra + (height - 1) * src_stride_bgra; + src_stride_bgra = -src_stride_bgra; + } +#if defined(HAS_BGRATOYROW_SSSE3) && defined(HAS_BGRATOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + BGRAToUVRow = BGRAToUVRow_Any_SSSE3; + BGRAToYRow = BGRAToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + BGRAToUVRow = BGRAToUVRow_SSSE3; + BGRAToYRow = BGRAToYRow_SSSE3; + } + } +#endif +#if defined(HAS_BGRATOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + BGRAToYRow = BGRAToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + BGRAToYRow = BGRAToYRow_NEON; + } + } +#endif +#if defined(HAS_BGRATOUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + BGRAToUVRow = BGRAToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + BGRAToUVRow = BGRAToUVRow_NEON; + } + } +#endif +#if defined(HAS_BGRATOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + BGRAToYRow = BGRAToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + BGRAToYRow = BGRAToYRow_MSA; + } + } +#endif +#if defined(HAS_BGRATOUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + BGRAToUVRow = BGRAToUVRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + BGRAToUVRow = BGRAToUVRow_MSA; + } + } +#endif + + for (y = 0; y < height - 1; y += 2) { + BGRAToUVRow(src_bgra, src_stride_bgra, dst_u, dst_v, width); + BGRAToYRow(src_bgra, dst_y, width); + BGRAToYRow(src_bgra + src_stride_bgra, dst_y + dst_stride_y, width); + src_bgra += src_stride_bgra * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { + BGRAToUVRow(src_bgra, 0, dst_u, dst_v, width); + BGRAToYRow(src_bgra, dst_y, width); + } + return 0; +} + +// Convert ABGR to I420. +LIBYUV_API +int ABGRToI420(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + int y; + void (*ABGRToUVRow)(const uint8_t* src_abgr0, int src_stride_abgr, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ABGRToUVRow_C; + void (*ABGRToYRow)(const uint8_t* src_abgr, uint8_t* dst_y, int width) = + ABGRToYRow_C; + if (!src_abgr || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + src_abgr = src_abgr + (height - 1) * src_stride_abgr; + src_stride_abgr = -src_stride_abgr; + } +#if defined(HAS_ABGRTOYROW_SSSE3) && defined(HAS_ABGRTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ABGRToUVRow = ABGRToUVRow_Any_SSSE3; + ABGRToYRow = ABGRToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ABGRToUVRow = ABGRToUVRow_SSSE3; + ABGRToYRow = ABGRToYRow_SSSE3; + } + } +#endif +#if defined(HAS_ABGRTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ABGRToYRow = ABGRToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ABGRToYRow = ABGRToYRow_NEON; + } + } +#endif +#if defined(HAS_ABGRTOUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ABGRToUVRow = ABGRToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ABGRToUVRow = ABGRToUVRow_NEON; + } + } +#endif +#if defined(HAS_ABGRTOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ABGRToYRow = ABGRToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ABGRToYRow = ABGRToYRow_MSA; + } + } +#endif +#if defined(HAS_ABGRTOUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ABGRToUVRow = ABGRToUVRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ABGRToUVRow = ABGRToUVRow_MSA; + } + } +#endif + + for (y = 0; y < height - 1; y += 2) { + ABGRToUVRow(src_abgr, src_stride_abgr, dst_u, dst_v, width); + ABGRToYRow(src_abgr, dst_y, width); + ABGRToYRow(src_abgr + src_stride_abgr, dst_y + dst_stride_y, width); + src_abgr += src_stride_abgr * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { + ABGRToUVRow(src_abgr, 0, dst_u, dst_v, width); + ABGRToYRow(src_abgr, dst_y, width); + } + return 0; +} + +// Convert RGBA to I420. +LIBYUV_API +int RGBAToI420(const uint8_t* src_rgba, + int src_stride_rgba, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + int y; + void (*RGBAToUVRow)(const uint8_t* src_rgba0, int src_stride_rgba, + uint8_t* dst_u, uint8_t* dst_v, int width) = + RGBAToUVRow_C; + void (*RGBAToYRow)(const uint8_t* src_rgba, uint8_t* dst_y, int width) = + RGBAToYRow_C; + if (!src_rgba || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + src_rgba = src_rgba + (height - 1) * src_stride_rgba; + src_stride_rgba = -src_stride_rgba; + } +#if defined(HAS_RGBATOYROW_SSSE3) && defined(HAS_RGBATOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + RGBAToUVRow = RGBAToUVRow_Any_SSSE3; + RGBAToYRow = RGBAToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + RGBAToUVRow = RGBAToUVRow_SSSE3; + RGBAToYRow = RGBAToYRow_SSSE3; + } + } +#endif +#if defined(HAS_RGBATOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + RGBAToYRow = RGBAToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + RGBAToYRow = RGBAToYRow_NEON; + } + } +#endif +#if defined(HAS_RGBATOUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + RGBAToUVRow = RGBAToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + RGBAToUVRow = RGBAToUVRow_NEON; + } + } +#endif +#if defined(HAS_RGBATOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + RGBAToYRow = RGBAToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + RGBAToYRow = RGBAToYRow_MSA; + } + } +#endif +#if defined(HAS_RGBATOUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + RGBAToUVRow = RGBAToUVRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + RGBAToUVRow = RGBAToUVRow_MSA; + } + } +#endif + + for (y = 0; y < height - 1; y += 2) { + RGBAToUVRow(src_rgba, src_stride_rgba, dst_u, dst_v, width); + RGBAToYRow(src_rgba, dst_y, width); + RGBAToYRow(src_rgba + src_stride_rgba, dst_y + dst_stride_y, width); + src_rgba += src_stride_rgba * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { + RGBAToUVRow(src_rgba, 0, dst_u, dst_v, width); + RGBAToYRow(src_rgba, dst_y, width); + } + return 0; +} + +// Convert RGB24 to I420. +LIBYUV_API +int RGB24ToI420(const uint8_t* src_rgb24, + int src_stride_rgb24, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + int y; +#if (defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA)) + void (*RGB24ToUVRow)(const uint8_t* src_rgb24, int src_stride_rgb24, + uint8_t* dst_u, uint8_t* dst_v, int width) = + RGB24ToUVRow_C; + void (*RGB24ToYRow)(const uint8_t* src_rgb24, uint8_t* dst_y, int width) = + RGB24ToYRow_C; +#else + void (*RGB24ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) = + RGB24ToARGBRow_C; + void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = + ARGBToYRow_C; +#endif + if (!src_rgb24 || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24; + src_stride_rgb24 = -src_stride_rgb24; + } + +// Neon version does direct RGB24 to YUV. +#if defined(HAS_RGB24TOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + RGB24ToUVRow = RGB24ToUVRow_Any_NEON; + RGB24ToYRow = RGB24ToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + RGB24ToYRow = RGB24ToYRow_NEON; + if (IS_ALIGNED(width, 16)) { + RGB24ToUVRow = RGB24ToUVRow_NEON; + } + } + } +#elif defined(HAS_RGB24TOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + RGB24ToUVRow = RGB24ToUVRow_Any_MSA; + RGB24ToYRow = RGB24ToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + RGB24ToYRow = RGB24ToYRow_MSA; + RGB24ToUVRow = RGB24ToUVRow_MSA; + } + } +// Other platforms do intermediate conversion from RGB24 to ARGB. 
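+// (Illustrative note, added commentary: on targets without a fused RGB24 row
+// kernel, each pair of RGB24 rows is first expanded into a temporary ARGB row
+// buffer, allocated further below, and then fed through the regular
+// ARGBToYRow/ARGBToUVRow kernels.)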
+#else +#if defined(HAS_RGB24TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + RGB24ToARGBRow = RGB24ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_SSSE3; + ARGBToYRow = ARGBToYRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToUVRow = ARGBToUVRow_Any_AVX2; + ARGBToYRow = ARGBToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_AVX2; + ARGBToYRow = ARGBToYRow_AVX2; + } + } +#endif +#endif + + { +#if !(defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA)) + // Allocate 2 rows of ARGB. + const int kRowSize = (width * 4 + 31) & ~31; + align_buffer_64(row, kRowSize * 2); +#endif + + for (y = 0; y < height - 1; y += 2) { +#if (defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA)) + RGB24ToUVRow(src_rgb24, src_stride_rgb24, dst_u, dst_v, width); + RGB24ToYRow(src_rgb24, dst_y, width); + RGB24ToYRow(src_rgb24 + src_stride_rgb24, dst_y + dst_stride_y, width); +#else + RGB24ToARGBRow(src_rgb24, row, width); + RGB24ToARGBRow(src_rgb24 + src_stride_rgb24, row + kRowSize, width); + ARGBToUVRow(row, kRowSize, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); + ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width); +#endif + src_rgb24 += src_stride_rgb24 * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { +#if (defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA)) + RGB24ToUVRow(src_rgb24, 0, dst_u, dst_v, width); + RGB24ToYRow(src_rgb24, dst_y, width); +#else + RGB24ToARGBRow(src_rgb24, row, width); + ARGBToUVRow(row, 0, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); +#endif + } +#if !(defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA)) + free_aligned_buffer_64(row); +#endif + } + return 0; +} + +// Convert RAW to I420. +LIBYUV_API +int RAWToI420(const uint8_t* src_raw, + int src_stride_raw, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + int y; +#if (defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA)) + void (*RAWToUVRow)(const uint8_t* src_raw, int src_stride_raw, uint8_t* dst_u, + uint8_t* dst_v, int width) = RAWToUVRow_C; + void (*RAWToYRow)(const uint8_t* src_raw, uint8_t* dst_y, int width) = + RAWToYRow_C; +#else + void (*RAWToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) = + RAWToARGBRow_C; + void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = + ARGBToYRow_C; +#endif + if (!src_raw || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_raw = src_raw + (height - 1) * src_stride_raw; + src_stride_raw = -src_stride_raw; + } + +// Neon version does direct RAW to YUV. 
+#if defined(HAS_RAWTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + RAWToUVRow = RAWToUVRow_Any_NEON; + RAWToYRow = RAWToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + RAWToYRow = RAWToYRow_NEON; + if (IS_ALIGNED(width, 16)) { + RAWToUVRow = RAWToUVRow_NEON; + } + } + } +#elif defined(HAS_RAWTOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + RAWToUVRow = RAWToUVRow_Any_MSA; + RAWToYRow = RAWToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + RAWToYRow = RAWToYRow_MSA; + RAWToUVRow = RAWToUVRow_MSA; + } + } +// Other platforms do intermediate conversion from RAW to ARGB. +#else +#if defined(HAS_RAWTOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + RAWToARGBRow = RAWToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + RAWToARGBRow = RAWToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_SSSE3; + ARGBToYRow = ARGBToYRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToUVRow = ARGBToUVRow_Any_AVX2; + ARGBToYRow = ARGBToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_AVX2; + ARGBToYRow = ARGBToYRow_AVX2; + } + } +#endif +#endif + + { +#if !(defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA)) + // Allocate 2 rows of ARGB. + const int kRowSize = (width * 4 + 31) & ~31; + align_buffer_64(row, kRowSize * 2); +#endif + + for (y = 0; y < height - 1; y += 2) { +#if (defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA)) + RAWToUVRow(src_raw, src_stride_raw, dst_u, dst_v, width); + RAWToYRow(src_raw, dst_y, width); + RAWToYRow(src_raw + src_stride_raw, dst_y + dst_stride_y, width); +#else + RAWToARGBRow(src_raw, row, width); + RAWToARGBRow(src_raw + src_stride_raw, row + kRowSize, width); + ARGBToUVRow(row, kRowSize, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); + ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width); +#endif + src_raw += src_stride_raw * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { +#if (defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA)) + RAWToUVRow(src_raw, 0, dst_u, dst_v, width); + RAWToYRow(src_raw, dst_y, width); +#else + RAWToARGBRow(src_raw, row, width); + ARGBToUVRow(row, 0, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); +#endif + } +#if !(defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA)) + free_aligned_buffer_64(row); +#endif + } + return 0; +} + +// Convert RGB565 to I420. 
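+// (Illustrative usage sketch, added commentary and not part of the upstream
+// change: for a packed 640x480 RGB565 frame with caller-owned planes and
+// packed strides, a call might look like
+//   RGB565ToI420(rgb565, 640 * 2, y, 640, u, 320, v, 320, 640, 480);
+// where the U and V planes are (width / 2) x (height / 2).)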
+LIBYUV_API +int RGB565ToI420(const uint8_t* src_rgb565, + int src_stride_rgb565, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + int y; +#if (defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA)) + void (*RGB565ToUVRow)(const uint8_t* src_rgb565, int src_stride_rgb565, + uint8_t* dst_u, uint8_t* dst_v, int width) = + RGB565ToUVRow_C; + void (*RGB565ToYRow)(const uint8_t* src_rgb565, uint8_t* dst_y, int width) = + RGB565ToYRow_C; +#else + void (*RGB565ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, + int width) = RGB565ToARGBRow_C; + void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = + ARGBToYRow_C; +#endif + if (!src_rgb565 || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_rgb565 = src_rgb565 + (height - 1) * src_stride_rgb565; + src_stride_rgb565 = -src_stride_rgb565; + } + +// Neon version does direct RGB565 to YUV. +#if defined(HAS_RGB565TOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + RGB565ToUVRow = RGB565ToUVRow_Any_NEON; + RGB565ToYRow = RGB565ToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + RGB565ToYRow = RGB565ToYRow_NEON; + if (IS_ALIGNED(width, 16)) { + RGB565ToUVRow = RGB565ToUVRow_NEON; + } + } + } +#elif defined(HAS_RGB565TOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + RGB565ToUVRow = RGB565ToUVRow_Any_MSA; + RGB565ToYRow = RGB565ToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + RGB565ToYRow = RGB565ToYRow_MSA; + RGB565ToUVRow = RGB565ToUVRow_MSA; + } + } +// Other platforms do intermediate conversion from RGB565 to ARGB. +#else +#if defined(HAS_RGB565TOARGBROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + RGB565ToARGBRow = RGB565ToARGBRow_Any_SSE2; + if (IS_ALIGNED(width, 8)) { + RGB565ToARGBRow = RGB565ToARGBRow_SSE2; + } + } +#endif +#if defined(HAS_RGB565TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + RGB565ToARGBRow = RGB565ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + RGB565ToARGBRow = RGB565ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_SSSE3; + ARGBToYRow = ARGBToYRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToUVRow = ARGBToUVRow_Any_AVX2; + ARGBToYRow = ARGBToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_AVX2; + ARGBToYRow = ARGBToYRow_AVX2; + } + } +#endif +#endif + { +#if !(defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA)) + // Allocate 2 rows of ARGB. 
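+    // (Illustrative note, added commentary: kRowSize rounds each staged ARGB
+    // row up to a 32-byte multiple so the second row stays aligned for the
+    // SIMD kernels; two rows are staged because each U and V output row is
+    // averaged from a pair of source rows.)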
+ const int kRowSize = (width * 4 + 31) & ~31; + align_buffer_64(row, kRowSize * 2); +#endif + for (y = 0; y < height - 1; y += 2) { +#if (defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA)) + RGB565ToUVRow(src_rgb565, src_stride_rgb565, dst_u, dst_v, width); + RGB565ToYRow(src_rgb565, dst_y, width); + RGB565ToYRow(src_rgb565 + src_stride_rgb565, dst_y + dst_stride_y, width); +#else + RGB565ToARGBRow(src_rgb565, row, width); + RGB565ToARGBRow(src_rgb565 + src_stride_rgb565, row + kRowSize, width); + ARGBToUVRow(row, kRowSize, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); + ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width); +#endif + src_rgb565 += src_stride_rgb565 * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { +#if (defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA)) + RGB565ToUVRow(src_rgb565, 0, dst_u, dst_v, width); + RGB565ToYRow(src_rgb565, dst_y, width); +#else + RGB565ToARGBRow(src_rgb565, row, width); + ARGBToUVRow(row, 0, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); +#endif + } +#if !(defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA)) + free_aligned_buffer_64(row); +#endif + } + return 0; +} + +// Convert ARGB1555 to I420. +LIBYUV_API +int ARGB1555ToI420(const uint8_t* src_argb1555, + int src_stride_argb1555, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + int y; +#if (defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA)) + void (*ARGB1555ToUVRow)(const uint8_t* src_argb1555, int src_stride_argb1555, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGB1555ToUVRow_C; + void (*ARGB1555ToYRow)(const uint8_t* src_argb1555, uint8_t* dst_y, + int width) = ARGB1555ToYRow_C; +#else + void (*ARGB1555ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, + int width) = ARGB1555ToARGBRow_C; + void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = + ARGBToYRow_C; +#endif + if (!src_argb1555 || !dst_y || !dst_u || !dst_v || width <= 0 || + height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb1555 = src_argb1555 + (height - 1) * src_stride_argb1555; + src_stride_argb1555 = -src_stride_argb1555; + } + +// Neon version does direct ARGB1555 to YUV. +#if defined(HAS_ARGB1555TOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGB1555ToUVRow = ARGB1555ToUVRow_Any_NEON; + ARGB1555ToYRow = ARGB1555ToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGB1555ToYRow = ARGB1555ToYRow_NEON; + if (IS_ALIGNED(width, 16)) { + ARGB1555ToUVRow = ARGB1555ToUVRow_NEON; + } + } + } +#elif defined(HAS_ARGB1555TOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGB1555ToUVRow = ARGB1555ToUVRow_Any_MSA; + ARGB1555ToYRow = ARGB1555ToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGB1555ToYRow = ARGB1555ToYRow_MSA; + ARGB1555ToUVRow = ARGB1555ToUVRow_MSA; + } + } +// Other platforms do intermediate conversion from ARGB1555 to ARGB. 
+#else +#if defined(HAS_ARGB1555TOARGBROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_SSE2; + if (IS_ALIGNED(width, 8)) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_SSE2; + } + } +#endif +#if defined(HAS_ARGB1555TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_SSSE3; + ARGBToYRow = ARGBToYRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToUVRow = ARGBToUVRow_Any_AVX2; + ARGBToYRow = ARGBToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_AVX2; + ARGBToYRow = ARGBToYRow_AVX2; + } + } +#endif +#endif + { +#if !(defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA)) + // Allocate 2 rows of ARGB. + const int kRowSize = (width * 4 + 31) & ~31; + align_buffer_64(row, kRowSize * 2); +#endif + + for (y = 0; y < height - 1; y += 2) { +#if (defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA)) + ARGB1555ToUVRow(src_argb1555, src_stride_argb1555, dst_u, dst_v, width); + ARGB1555ToYRow(src_argb1555, dst_y, width); + ARGB1555ToYRow(src_argb1555 + src_stride_argb1555, dst_y + dst_stride_y, + width); +#else + ARGB1555ToARGBRow(src_argb1555, row, width); + ARGB1555ToARGBRow(src_argb1555 + src_stride_argb1555, row + kRowSize, + width); + ARGBToUVRow(row, kRowSize, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); + ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width); +#endif + src_argb1555 += src_stride_argb1555 * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { +#if (defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA)) + ARGB1555ToUVRow(src_argb1555, 0, dst_u, dst_v, width); + ARGB1555ToYRow(src_argb1555, dst_y, width); +#else + ARGB1555ToARGBRow(src_argb1555, row, width); + ARGBToUVRow(row, 0, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); +#endif + } +#if !(defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA)) + free_aligned_buffer_64(row); +#endif + } + return 0; +} + +// Convert ARGB4444 to I420. +LIBYUV_API +int ARGB4444ToI420(const uint8_t* src_argb4444, + int src_stride_argb4444, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + int y; +#if defined(HAS_ARGB4444TOYROW_NEON) + void (*ARGB4444ToUVRow)(const uint8_t* src_argb4444, int src_stride_argb4444, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGB4444ToUVRow_C; + void (*ARGB4444ToYRow)(const uint8_t* src_argb4444, uint8_t* dst_y, + int width) = ARGB4444ToYRow_C; +#else + void (*ARGB4444ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, + int width) = ARGB4444ToARGBRow_C; + void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = + ARGBToYRow_C; +#endif + if (!src_argb4444 || !dst_y || !dst_u || !dst_v || width <= 0 || + height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + src_argb4444 = src_argb4444 + (height - 1) * src_stride_argb4444; + src_stride_argb4444 = -src_stride_argb4444; + } + +// Neon version does direct ARGB4444 to YUV. +#if defined(HAS_ARGB4444TOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGB4444ToUVRow = ARGB4444ToUVRow_Any_NEON; + ARGB4444ToYRow = ARGB4444ToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGB4444ToYRow = ARGB4444ToYRow_NEON; + if (IS_ALIGNED(width, 16)) { + ARGB4444ToUVRow = ARGB4444ToUVRow_NEON; + } + } + } +// Other platforms do intermediate conversion from ARGB4444 to ARGB. +#else +#if defined(HAS_ARGB4444TOARGBROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_SSE2; + if (IS_ALIGNED(width, 8)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_SSE2; + } + } +#endif +#if defined(HAS_ARGB4444TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_ARGB4444TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_MSA; + } + } +#endif +#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_SSSE3; + ARGBToYRow = ARGBToYRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToUVRow = ARGBToUVRow_Any_AVX2; + ARGBToYRow = ARGBToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_AVX2; + ARGBToYRow = ARGBToYRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToUVRow = ARGBToUVRow_Any_MSA; + ARGBToYRow = ARGBToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_MSA; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_MSA; + } + } + } +#endif +#endif + + { +#if !defined(HAS_ARGB4444TOYROW_NEON) + // Allocate 2 rows of ARGB. 
+ const int kRowSize = (width * 4 + 31) & ~31; + align_buffer_64(row, kRowSize * 2); +#endif + + for (y = 0; y < height - 1; y += 2) { +#if defined(HAS_ARGB4444TOYROW_NEON) + ARGB4444ToUVRow(src_argb4444, src_stride_argb4444, dst_u, dst_v, width); + ARGB4444ToYRow(src_argb4444, dst_y, width); + ARGB4444ToYRow(src_argb4444 + src_stride_argb4444, dst_y + dst_stride_y, + width); +#else + ARGB4444ToARGBRow(src_argb4444, row, width); + ARGB4444ToARGBRow(src_argb4444 + src_stride_argb4444, row + kRowSize, + width); + ARGBToUVRow(row, kRowSize, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); + ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width); +#endif + src_argb4444 += src_stride_argb4444 * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { +#if defined(HAS_ARGB4444TOYROW_NEON) + ARGB4444ToUVRow(src_argb4444, 0, dst_u, dst_v, width); + ARGB4444ToYRow(src_argb4444, dst_y, width); +#else + ARGB4444ToARGBRow(src_argb4444, row, width); + ARGBToUVRow(row, 0, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); +#endif + } +#if !defined(HAS_ARGB4444TOYROW_NEON) + free_aligned_buffer_64(row); +#endif + } + return 0; +} + +static void SplitPixels(const uint8_t* src_u, + int src_pixel_stride_uv, + uint8_t* dst_u, + int width) { + int i; + for (i = 0; i < width; ++i) { + *dst_u = *src_u; + ++dst_u; + src_u += src_pixel_stride_uv; + } +} + +// Convert Android420 to I420. +LIBYUV_API +int Android420ToI420(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + int src_pixel_stride_uv, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + int y; + const ptrdiff_t vu_off = src_v - src_u; + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + halfheight = (height + 1) >> 1; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (halfheight - 1) * src_stride_u; + src_v = src_v + (halfheight - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + + if (dst_y) { + CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + } + + // Copy UV planes as is - I420 + if (src_pixel_stride_uv == 1) { + CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight); + CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight); + return 0; + // Split UV planes - NV21 + } + if (src_pixel_stride_uv == 2 && vu_off == -1 && + src_stride_u == src_stride_v) { + SplitUVPlane(src_v, src_stride_v, dst_v, dst_stride_v, dst_u, dst_stride_u, + halfwidth, halfheight); + return 0; + // Split UV planes - NV12 + } + if (src_pixel_stride_uv == 2 && vu_off == 1 && src_stride_u == src_stride_v) { + SplitUVPlane(src_u, src_stride_u, dst_u, dst_stride_u, dst_v, dst_stride_v, + halfwidth, halfheight); + return 0; + } + + for (y = 0; y < halfheight; ++y) { + SplitPixels(src_u, src_pixel_stride_uv, dst_u, halfwidth); + SplitPixels(src_v, src_pixel_stride_uv, dst_v, halfwidth); + src_u += src_stride_u; + src_v += src_stride_v; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + return 0; +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/media/libyuv/libyuv/source/convert_argb.cc b/media/libyuv/libyuv/source/convert_argb.cc new file mode 100644 index 0000000000..967f3d1cbd --- /dev/null +++ b/media/libyuv/libyuv/source/convert_argb.cc @@ -0,0 +1,2267 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/convert_argb.h" + +#include "libyuv/cpu_id.h" +#ifdef HAVE_JPEG +#include "libyuv/mjpeg_decoder.h" +#endif +#include "libyuv/planar_functions.h" // For CopyPlane and ARGBShuffle. +#include "libyuv/rotate_argb.h" +#include "libyuv/row.h" +#include "libyuv/video_common.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// Copy ARGB with optional flipping +LIBYUV_API +int ARGBCopy(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + if (!src_argb || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + + CopyPlane(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width * 4, + height); + return 0; +} + +// Convert I420 to ARGB with matrix +static int I420ToARGBMatrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*I422ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I422ToARGBRow_C; + if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } +#if defined(HAS_I422TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I422ToARGBRow = I422ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToARGBRow = I422ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_I422TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToARGBRow = I422ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I422ToARGBRow = I422ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_I422TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToARGBRow = I422ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToARGBRow = I422ToARGBRow_NEON; + } + } +#endif +#if defined(HAS_I422TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToARGBRow = I422ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I422ToARGBRow = I422ToARGBRow_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + I422ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + +// Convert I420 to ARGB. +LIBYUV_API +int I420ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return I420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_argb, dst_stride_argb, + &kYuvI601Constants, width, height); +} + +// Convert I420 to ABGR. +LIBYUV_API +int I420ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height) { + return I420ToARGBMatrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_abgr, dst_stride_abgr, + &kYvuI601Constants, // Use Yvu matrix + width, height); +} + +// Convert J420 to ARGB. +LIBYUV_API +int J420ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return I420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_argb, dst_stride_argb, + &kYuvJPEGConstants, width, height); +} + +// Convert J420 to ABGR. 
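+// (Illustrative note, added commentary: the *ToABGR wrappers in this file
+// reuse the ARGB path by swapping the U and V plane arguments and selecting
+// the mirrored kYvu* constant set, so no dedicated ABGR row kernels are
+// needed; e.g. J420ToABGR below is effectively
+//   I420ToARGBMatrix(y, sy, v, sv, u, su, abgr, sabgr,
+//                    &kYvuJPEGConstants, w, h);
+// with shorthand names y/u/v/abgr for the planes and sy/sv/su/sabgr/w/h for
+// strides and dimensions.)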
+LIBYUV_API +int J420ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height) { + return I420ToARGBMatrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_abgr, dst_stride_abgr, + &kYvuJPEGConstants, // Use Yvu matrix + width, height); +} + +// Convert H420 to ARGB. +LIBYUV_API +int H420ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return I420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_argb, dst_stride_argb, + &kYuvH709Constants, width, height); +} + +// Convert H420 to ABGR. +LIBYUV_API +int H420ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height) { + return I420ToARGBMatrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_abgr, dst_stride_abgr, + &kYvuH709Constants, // Use Yvu matrix + width, height); +} + +// Convert U420 to ARGB. +LIBYUV_API +int U420ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return I420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_argb, dst_stride_argb, + &kYuv2020Constants, width, height); +} + +// Convert I422 to ARGB with matrix +static int I422ToARGBMatrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*I422ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I422ToARGBRow_C; + if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } + // Coalesce rows. 
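+  // (Illustrative note, added commentary: when every plane is stored with no
+  // row padding, the whole image can be treated as one long row, so the
+  // per-row loop below runs once over width * height pixels.)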
+ if (src_stride_y == width && src_stride_u * 2 == width && + src_stride_v * 2 == width && dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0; + } +#if defined(HAS_I422TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I422ToARGBRow = I422ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToARGBRow = I422ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_I422TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToARGBRow = I422ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I422ToARGBRow = I422ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_I422TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToARGBRow = I422ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToARGBRow = I422ToARGBRow_NEON; + } + } +#endif +#if defined(HAS_I422TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToARGBRow = I422ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I422ToARGBRow = I422ToARGBRow_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + I422ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + return 0; +} + +// Convert I422 to ARGB. +LIBYUV_API +int I422ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return I422ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_argb, dst_stride_argb, + &kYuvI601Constants, width, height); +} + +// Convert I422 to ABGR. +LIBYUV_API +int I422ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height) { + return I422ToARGBMatrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_abgr, dst_stride_abgr, + &kYvuI601Constants, // Use Yvu matrix + width, height); +} + +// Convert J422 to ARGB. +LIBYUV_API +int J422ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return I422ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_argb, dst_stride_argb, + &kYuvJPEGConstants, width, height); +} + +// Convert J422 to ABGR. +LIBYUV_API +int J422ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height) { + return I422ToARGBMatrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_abgr, dst_stride_abgr, + &kYvuJPEGConstants, // Use Yvu matrix + width, height); +} + +// Convert H422 to ARGB. +LIBYUV_API +int H422ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return I422ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_argb, dst_stride_argb, + &kYuvH709Constants, width, height); +} + +// Convert H422 to ABGR. 
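+// (Illustrative note, added commentary: the single-letter prefixes select the
+// YUV conversion matrix: I, J, H and U map to BT.601, full-range JPEG, BT.709
+// and BT.2020 constants respectively, as seen in the kYuv*Constants arguments
+// of the wrappers above.)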
+LIBYUV_API +int H422ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height) { + return I422ToARGBMatrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_abgr, dst_stride_abgr, + &kYvuH709Constants, // Use Yvu matrix + width, height); +} + +// Convert 10 bit YUV to ARGB with matrix +// TODO(fbarchard): Consider passing scale multiplier to I210ToARGB to +// multiply 10 bit yuv into high bits to allow any number of bits. +static int I010ToAR30Matrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_ar30, + int dst_stride_ar30, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*I210ToAR30Row)(const uint16_t* y_buf, const uint16_t* u_buf, + const uint16_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I210ToAR30Row_C; + if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30; + dst_stride_ar30 = -dst_stride_ar30; + } +#if defined(HAS_I210TOAR30ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I210ToAR30Row = I210ToAR30Row_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I210ToAR30Row = I210ToAR30Row_SSSE3; + } + } +#endif +#if defined(HAS_I210TOAR30ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I210ToAR30Row = I210ToAR30Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I210ToAR30Row = I210ToAR30Row_AVX2; + } + } +#endif + for (y = 0; y < height; ++y) { + I210ToAR30Row(src_y, src_u, src_v, dst_ar30, yuvconstants, width); + dst_ar30 += dst_stride_ar30; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + +// Convert I010 to AR30. +LIBYUV_API +int I010ToAR30(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_ar30, + int dst_stride_ar30, + int width, + int height) { + return I010ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_ar30, dst_stride_ar30, + &kYuvI601Constants, width, height); +} + +// Convert H010 to AR30. +LIBYUV_API +int H010ToAR30(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_ar30, + int dst_stride_ar30, + int width, + int height) { + return I010ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_ar30, dst_stride_ar30, + &kYuvH709Constants, width, height); +} + +// Convert I010 to AB30. +LIBYUV_API +int I010ToAB30(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_ab30, + int dst_stride_ab30, + int width, + int height) { + return I010ToAR30Matrix(src_y, src_stride_y, src_v, src_stride_v, src_u, + src_stride_u, dst_ab30, dst_stride_ab30, + &kYvuI601Constants, width, height); +} + +// Convert H010 to AB30. 
+LIBYUV_API +int H010ToAB30(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_ab30, + int dst_stride_ab30, + int width, + int height) { + return I010ToAR30Matrix(src_y, src_stride_y, src_v, src_stride_v, src_u, + src_stride_u, dst_ab30, dst_stride_ab30, + &kYvuH709Constants, width, height); +} + +// Convert 10 bit YUV to ARGB with matrix +static int I010ToARGBMatrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*I210ToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf, + const uint16_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I210ToARGBRow_C; + if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } +#if defined(HAS_I210TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I210ToARGBRow = I210ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I210ToARGBRow = I210ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_I210TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I210ToARGBRow = I210ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I210ToARGBRow = I210ToARGBRow_AVX2; + } + } +#endif + for (y = 0; y < height; ++y) { + I210ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + +// Convert I010 to ARGB. +LIBYUV_API +int I010ToARGB(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return I010ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_argb, dst_stride_argb, + &kYuvI601Constants, width, height); +} + +// Convert I010 to ABGR. +LIBYUV_API +int I010ToABGR(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height) { + return I010ToARGBMatrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_abgr, dst_stride_abgr, + &kYvuI601Constants, // Use Yvu matrix + width, height); +} + +// Convert H010 to ARGB. +LIBYUV_API +int H010ToARGB(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return I010ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_argb, dst_stride_argb, + &kYuvH709Constants, width, height); +} + +// Convert H010 to ABGR. 
+LIBYUV_API
+int H010ToABGR(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height) {
+  return I010ToARGBMatrix(src_y, src_stride_y, src_v,
+                          src_stride_v,  // Swap U and V
+                          src_u, src_stride_u, dst_abgr, dst_stride_abgr,
+                          &kYvuH709Constants,  // Use Yvu matrix
+                          width, height);
+}
+
+// Convert U422 to ARGB.
+LIBYUV_API
+int U422ToARGB(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
+  return I422ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                          src_stride_v, dst_argb, dst_stride_argb,
+                          &kYuv2020Constants, width, height);
+}
+
+// Convert I444 to ARGB with matrix
+static int I444ToARGBMatrix(const uint8_t* src_y,
+                            int src_stride_y,
+                            const uint8_t* src_u,
+                            int src_stride_u,
+                            const uint8_t* src_v,
+                            int src_stride_v,
+                            uint8_t* dst_argb,
+                            int dst_stride_argb,
+                            const struct YuvConstants* yuvconstants,
+                            int width,
+                            int height) {
+  int y;
+  void (*I444ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf,
+                        const uint8_t* v_buf, uint8_t* rgb_buf,
+                        const struct YuvConstants* yuvconstants, int width) =
+      I444ToARGBRow_C;
+  if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+    dst_stride_argb = -dst_stride_argb;
+  }
+  // Coalesce rows.
+  if (src_stride_y == width && src_stride_u == width && src_stride_v == width &&
+      dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0;
+  }
+#if defined(HAS_I444TOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    I444ToARGBRow = I444ToARGBRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      I444ToARGBRow = I444ToARGBRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_I444TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    I444ToARGBRow = I444ToARGBRow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      I444ToARGBRow = I444ToARGBRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_I444TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    I444ToARGBRow = I444ToARGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      I444ToARGBRow = I444ToARGBRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_I444TOARGBROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    I444ToARGBRow = I444ToARGBRow_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      I444ToARGBRow = I444ToARGBRow_MSA;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    I444ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
+    dst_argb += dst_stride_argb;
+    src_y += src_stride_y;
+    src_u += src_stride_u;
+    src_v += src_stride_v;
+  }
+  return 0;
+}
+
+// Convert I444 to ARGB.
+LIBYUV_API
+int I444ToARGB(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
+  return I444ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                          src_stride_v, dst_argb, dst_stride_argb,
+                          &kYuvI601Constants, width, height);
+}
+
+// Convert H444 to ARGB.
+LIBYUV_API
+int H444ToARGB(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
+  return I444ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                          src_stride_v, dst_argb, dst_stride_argb,
+                          &kYuvH709Constants, width, height);
+}
+
+// Convert U444 to ARGB.
+LIBYUV_API
+int U444ToARGB(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
+  return I444ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                          src_stride_v, dst_argb, dst_stride_argb,
+                          &kYuv2020Constants, width, height);
+}
+
+// Convert I444 to ABGR.
+LIBYUV_API
+int I444ToABGR(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height) {
+  return I444ToARGBMatrix(src_y, src_stride_y, src_v,
+                          src_stride_v,  // Swap U and V
+                          src_u, src_stride_u, dst_abgr, dst_stride_abgr,
+                          &kYvuI601Constants,  // Use Yvu matrix
+                          width, height);
+}
+
+// Convert J444 to ARGB.
+LIBYUV_API
+int J444ToARGB(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
+  return I444ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                          src_stride_v, dst_argb, dst_stride_argb,
+                          &kYuvJPEGConstants, width, height);
+}
+
+// Convert I420 with Alpha to preattenuated ARGB.
+static int I420AlphaToARGBMatrix(const uint8_t* src_y,
+                                 int src_stride_y,
+                                 const uint8_t* src_u,
+                                 int src_stride_u,
+                                 const uint8_t* src_v,
+                                 int src_stride_v,
+                                 const uint8_t* src_a,
+                                 int src_stride_a,
+                                 uint8_t* dst_argb,
+                                 int dst_stride_argb,
+                                 const struct YuvConstants* yuvconstants,
+                                 int width,
+                                 int height,
+                                 int attenuate) {
+  int y;
+  void (*I422AlphaToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf,
+                             const uint8_t* v_buf, const uint8_t* a_buf,
+                             uint8_t* dst_argb,
+                             const struct YuvConstants* yuvconstants,
+                             int width) = I422AlphaToARGBRow_C;
+  void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb,
+                           int width) = ARGBAttenuateRow_C;
+  if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+ if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } +#if defined(HAS_I422ALPHATOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I422AlphaToARGBRow = I422AlphaToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422AlphaToARGBRow = I422AlphaToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_I422ALPHATOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422AlphaToARGBRow = I422AlphaToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I422AlphaToARGBRow = I422AlphaToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_I422ALPHATOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422AlphaToARGBRow = I422AlphaToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422AlphaToARGBRow = I422AlphaToARGBRow_NEON; + } + } +#endif +#if defined(HAS_I422ALPHATOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422AlphaToARGBRow = I422AlphaToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I422AlphaToARGBRow = I422AlphaToARGBRow_MSA; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3; + if (IS_ALIGNED(width, 4)) { + ARGBAttenuateRow = ARGBAttenuateRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_NEON; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + I422AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants, + width); + if (attenuate) { + ARGBAttenuateRow(dst_argb, dst_argb, width); + } + dst_argb += dst_stride_argb; + src_a += src_stride_a; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + +// Convert I420 with Alpha to ARGB. +LIBYUV_API +int I420AlphaToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + const uint8_t* src_a, + int src_stride_a, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height, + int attenuate) { + return I420AlphaToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, src_a, src_stride_a, dst_argb, + dst_stride_argb, &kYuvI601Constants, width, + height, attenuate); +} + +// Convert I420 with Alpha to ABGR. +LIBYUV_API +int I420AlphaToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + const uint8_t* src_a, + int src_stride_a, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height, + int attenuate) { + return I420AlphaToARGBMatrix( + src_y, src_stride_y, src_v, src_stride_v, // Swap U and V + src_u, src_stride_u, src_a, src_stride_a, dst_abgr, dst_stride_abgr, + &kYvuI601Constants, // Use Yvu matrix + width, height, attenuate); +} + +// Convert I400 to ARGB. 
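+// (Illustrative note, added commentary: I400 carries only a luma plane; the
+// conversion below expands each Y sample to a gray ARGB pixel with opaque
+// alpha, while J400 further below does the same but treats the luma as
+// already full range, which is why the two select different row kernels.)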
+LIBYUV_API +int I400ToARGB(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + int y; + void (*I400ToARGBRow)(const uint8_t* y_buf, uint8_t* rgb_buf, int width) = + I400ToARGBRow_C; + if (!src_y || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } + // Coalesce rows. + if (src_stride_y == width && dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_y = dst_stride_argb = 0; + } +#if defined(HAS_I400TOARGBROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + I400ToARGBRow = I400ToARGBRow_Any_SSE2; + if (IS_ALIGNED(width, 8)) { + I400ToARGBRow = I400ToARGBRow_SSE2; + } + } +#endif +#if defined(HAS_I400TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I400ToARGBRow = I400ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I400ToARGBRow = I400ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_I400TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I400ToARGBRow = I400ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I400ToARGBRow = I400ToARGBRow_NEON; + } + } +#endif +#if defined(HAS_I400TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I400ToARGBRow = I400ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + I400ToARGBRow = I400ToARGBRow_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + I400ToARGBRow(src_y, dst_argb, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + } + return 0; +} + +// Convert J400 to ARGB. +LIBYUV_API +int J400ToARGB(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + int y; + void (*J400ToARGBRow)(const uint8_t* src_y, uint8_t* dst_argb, int width) = + J400ToARGBRow_C; + if (!src_y || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_stride_y = -src_stride_y; + } + // Coalesce rows. + if (src_stride_y == width && dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_y = dst_stride_argb = 0; + } +#if defined(HAS_J400TOARGBROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + J400ToARGBRow = J400ToARGBRow_Any_SSE2; + if (IS_ALIGNED(width, 8)) { + J400ToARGBRow = J400ToARGBRow_SSE2; + } + } +#endif +#if defined(HAS_J400TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + J400ToARGBRow = J400ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + J400ToARGBRow = J400ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_J400TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + J400ToARGBRow = J400ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + J400ToARGBRow = J400ToARGBRow_NEON; + } + } +#endif +#if defined(HAS_J400TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + J400ToARGBRow = J400ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + J400ToARGBRow = J400ToARGBRow_MSA; + } + } +#endif + for (y = 0; y < height; ++y) { + J400ToARGBRow(src_y, dst_argb, width); + src_y += src_stride_y; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Shuffle table for converting BGRA to ARGB. +static const uvec8 kShuffleMaskBGRAToARGB = { + 3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u}; + +// Shuffle table for converting ABGR to ARGB. 
+static const uvec8 kShuffleMaskABGRToARGB = { + 2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u}; + +// Shuffle table for converting RGBA to ARGB. +static const uvec8 kShuffleMaskRGBAToARGB = { + 1u, 2u, 3u, 0u, 5u, 6u, 7u, 4u, 9u, 10u, 11u, 8u, 13u, 14u, 15u, 12u}; + +// Convert BGRA to ARGB. +LIBYUV_API +int BGRAToARGB(const uint8_t* src_bgra, + int src_stride_bgra, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return ARGBShuffle(src_bgra, src_stride_bgra, dst_argb, dst_stride_argb, + (const uint8_t*)(&kShuffleMaskBGRAToARGB), width, height); +} + +// Convert ARGB to BGRA (same as BGRAToARGB). +LIBYUV_API +int ARGBToBGRA(const uint8_t* src_bgra, + int src_stride_bgra, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return ARGBShuffle(src_bgra, src_stride_bgra, dst_argb, dst_stride_argb, + (const uint8_t*)(&kShuffleMaskBGRAToARGB), width, height); +} + +// Convert ABGR to ARGB. +LIBYUV_API +int ABGRToARGB(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return ARGBShuffle(src_abgr, src_stride_abgr, dst_argb, dst_stride_argb, + (const uint8_t*)(&kShuffleMaskABGRToARGB), width, height); +} + +// Convert ARGB to ABGR to (same as ABGRToARGB). +LIBYUV_API +int ARGBToABGR(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return ARGBShuffle(src_abgr, src_stride_abgr, dst_argb, dst_stride_argb, + (const uint8_t*)(&kShuffleMaskABGRToARGB), width, height); +} + +// Convert RGBA to ARGB. +LIBYUV_API +int RGBAToARGB(const uint8_t* src_rgba, + int src_stride_rgba, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return ARGBShuffle(src_rgba, src_stride_rgba, dst_argb, dst_stride_argb, + (const uint8_t*)(&kShuffleMaskRGBAToARGB), width, height); +} + +// Convert RGB24 to ARGB. +LIBYUV_API +int RGB24ToARGB(const uint8_t* src_rgb24, + int src_stride_rgb24, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + int y; + void (*RGB24ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) = + RGB24ToARGBRow_C; + if (!src_rgb24 || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24; + src_stride_rgb24 = -src_stride_rgb24; + } + // Coalesce rows. + if (src_stride_rgb24 == width * 3 && dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_rgb24 = dst_stride_argb = 0; + } +#if defined(HAS_RGB24TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + RGB24ToARGBRow = RGB24ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_RGB24TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + RGB24ToARGBRow = RGB24ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + RGB24ToARGBRow = RGB24ToARGBRow_NEON; + } + } +#endif +#if defined(HAS_RGB24TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + RGB24ToARGBRow = RGB24ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + RGB24ToARGBRow = RGB24ToARGBRow_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + RGB24ToARGBRow(src_rgb24, dst_argb, width); + src_rgb24 += src_stride_rgb24; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Convert RAW to ARGB. 
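+//
+// Illustrative usage sketch (assumed buffer names, tightly packed strides):
+// RAW is a 3-bytes-per-pixel format (byte order reversed relative to RGB24),
+// so the source stride is width * 3.
+//
+//   RAWToARGB(raw_frame, width * 3, argb_frame, width * 4, width, height);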
+LIBYUV_API +int RAWToARGB(const uint8_t* src_raw, + int src_stride_raw, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + int y; + void (*RAWToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) = + RAWToARGBRow_C; + if (!src_raw || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_raw = src_raw + (height - 1) * src_stride_raw; + src_stride_raw = -src_stride_raw; + } + // Coalesce rows. + if (src_stride_raw == width * 3 && dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_raw = dst_stride_argb = 0; + } +#if defined(HAS_RAWTOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + RAWToARGBRow = RAWToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + RAWToARGBRow = RAWToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_RAWTOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + RAWToARGBRow = RAWToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + RAWToARGBRow = RAWToARGBRow_NEON; + } + } +#endif +#if defined(HAS_RAWTOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + RAWToARGBRow = RAWToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + RAWToARGBRow = RAWToARGBRow_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + RAWToARGBRow(src_raw, dst_argb, width); + src_raw += src_stride_raw; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Convert RGB565 to ARGB. +LIBYUV_API +int RGB565ToARGB(const uint8_t* src_rgb565, + int src_stride_rgb565, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + int y; + void (*RGB565ToARGBRow)(const uint8_t* src_rgb565, uint8_t* dst_argb, + int width) = RGB565ToARGBRow_C; + if (!src_rgb565 || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_rgb565 = src_rgb565 + (height - 1) * src_stride_rgb565; + src_stride_rgb565 = -src_stride_rgb565; + } + // Coalesce rows. + if (src_stride_rgb565 == width * 2 && dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_rgb565 = dst_stride_argb = 0; + } +#if defined(HAS_RGB565TOARGBROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + RGB565ToARGBRow = RGB565ToARGBRow_Any_SSE2; + if (IS_ALIGNED(width, 8)) { + RGB565ToARGBRow = RGB565ToARGBRow_SSE2; + } + } +#endif +#if defined(HAS_RGB565TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + RGB565ToARGBRow = RGB565ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + RGB565ToARGBRow = RGB565ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_RGB565TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + RGB565ToARGBRow = RGB565ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + RGB565ToARGBRow = RGB565ToARGBRow_NEON; + } + } +#endif +#if defined(HAS_RGB565TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + RGB565ToARGBRow = RGB565ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + RGB565ToARGBRow = RGB565ToARGBRow_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + RGB565ToARGBRow(src_rgb565, dst_argb, width); + src_rgb565 += src_stride_rgb565; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Convert ARGB1555 to ARGB. 
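+//
+// Illustrative usage sketch (assumed buffer names, tightly packed strides):
+// ARGB1555 packs each pixel into 2 bytes, so the source stride is width * 2.
+//
+//   ARGB1555ToARGB(argb1555_frame, width * 2, argb_frame, width * 4,
+//                  width, height);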
+LIBYUV_API +int ARGB1555ToARGB(const uint8_t* src_argb1555, + int src_stride_argb1555, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + int y; + void (*ARGB1555ToARGBRow)(const uint8_t* src_argb1555, uint8_t* dst_argb, + int width) = ARGB1555ToARGBRow_C; + if (!src_argb1555 || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb1555 = src_argb1555 + (height - 1) * src_stride_argb1555; + src_stride_argb1555 = -src_stride_argb1555; + } + // Coalesce rows. + if (src_stride_argb1555 == width * 2 && dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_argb1555 = dst_stride_argb = 0; + } +#if defined(HAS_ARGB1555TOARGBROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_SSE2; + if (IS_ALIGNED(width, 8)) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_SSE2; + } + } +#endif +#if defined(HAS_ARGB1555TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_ARGB1555TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_NEON; + } + } +#endif +#if defined(HAS_ARGB1555TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGB1555ToARGBRow(src_argb1555, dst_argb, width); + src_argb1555 += src_stride_argb1555; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Convert ARGB4444 to ARGB. +LIBYUV_API +int ARGB4444ToARGB(const uint8_t* src_argb4444, + int src_stride_argb4444, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + int y; + void (*ARGB4444ToARGBRow)(const uint8_t* src_argb4444, uint8_t* dst_argb, + int width) = ARGB4444ToARGBRow_C; + if (!src_argb4444 || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb4444 = src_argb4444 + (height - 1) * src_stride_argb4444; + src_stride_argb4444 = -src_stride_argb4444; + } + // Coalesce rows. 
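+  // When the strides are exactly the packed row widths, the rows are
+  // contiguous in memory, so the whole image can be processed as one long
+  // row; the strides are then zeroed because the row pointers never need to
+  // advance.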
+ if (src_stride_argb4444 == width * 2 && dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_argb4444 = dst_stride_argb = 0; + } +#if defined(HAS_ARGB4444TOARGBROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_SSE2; + if (IS_ALIGNED(width, 8)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_SSE2; + } + } +#endif +#if defined(HAS_ARGB4444TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_ARGB4444TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_NEON; + } + } +#endif +#if defined(HAS_ARGB4444TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGB4444ToARGBRow(src_argb4444, dst_argb, width); + src_argb4444 += src_stride_argb4444; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Convert AR30 to ARGB. +LIBYUV_API +int AR30ToARGB(const uint8_t* src_ar30, + int src_stride_ar30, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + int y; + if (!src_ar30 || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_ar30 = src_ar30 + (height - 1) * src_stride_ar30; + src_stride_ar30 = -src_stride_ar30; + } + // Coalesce rows. + if (src_stride_ar30 == width * 4 && dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_ar30 = dst_stride_argb = 0; + } + for (y = 0; y < height; ++y) { + AR30ToARGBRow_C(src_ar30, dst_argb, width); + src_ar30 += src_stride_ar30; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Convert AR30 to ABGR. +LIBYUV_API +int AR30ToABGR(const uint8_t* src_ar30, + int src_stride_ar30, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height) { + int y; + if (!src_ar30 || !dst_abgr || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_ar30 = src_ar30 + (height - 1) * src_stride_ar30; + src_stride_ar30 = -src_stride_ar30; + } + // Coalesce rows. + if (src_stride_ar30 == width * 4 && dst_stride_abgr == width * 4) { + width *= height; + height = 1; + src_stride_ar30 = dst_stride_abgr = 0; + } + for (y = 0; y < height; ++y) { + AR30ToABGRRow_C(src_ar30, dst_abgr, width); + src_ar30 += src_stride_ar30; + dst_abgr += dst_stride_abgr; + } + return 0; +} + +// Convert AR30 to AB30. +LIBYUV_API +int AR30ToAB30(const uint8_t* src_ar30, + int src_stride_ar30, + uint8_t* dst_ab30, + int dst_stride_ab30, + int width, + int height) { + int y; + if (!src_ar30 || !dst_ab30 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_ar30 = src_ar30 + (height - 1) * src_stride_ar30; + src_stride_ar30 = -src_stride_ar30; + } + // Coalesce rows. 
+ if (src_stride_ar30 == width * 4 && dst_stride_ab30 == width * 4) { + width *= height; + height = 1; + src_stride_ar30 = dst_stride_ab30 = 0; + } + for (y = 0; y < height; ++y) { + AR30ToAB30Row_C(src_ar30, dst_ab30, width); + src_ar30 += src_stride_ar30; + dst_ab30 += dst_stride_ab30; + } + return 0; +} + +// Convert NV12 to ARGB with matrix +static int NV12ToARGBMatrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*NV12ToARGBRow)( + const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = NV12ToARGBRow_C; + if (!src_y || !src_uv || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } +#if defined(HAS_NV12TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + NV12ToARGBRow = NV12ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + NV12ToARGBRow = NV12ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_NV12TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + NV12ToARGBRow = NV12ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + NV12ToARGBRow = NV12ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_NV12TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + NV12ToARGBRow = NV12ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + NV12ToARGBRow = NV12ToARGBRow_NEON; + } + } +#endif +#if defined(HAS_NV12TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + NV12ToARGBRow = NV12ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + NV12ToARGBRow = NV12ToARGBRow_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + NV12ToARGBRow(src_y, src_uv, dst_argb, yuvconstants, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + if (y & 1) { + src_uv += src_stride_uv; + } + } + return 0; +} + +// Convert NV21 to ARGB with matrix +static int NV21ToARGBMatrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_vu, + int src_stride_vu, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*NV21ToARGBRow)( + const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = NV21ToARGBRow_C; + if (!src_y || !src_vu || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
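+  // A negative height is the caller's request for a vertically flipped
+  // output: point the destination at its last row and negate the stride so
+  // rows are written bottom-up.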
+  if (height < 0) {
+    height = -height;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+    dst_stride_argb = -dst_stride_argb;
+  }
+#if defined(HAS_NV21TOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    NV21ToARGBRow = NV21ToARGBRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      NV21ToARGBRow = NV21ToARGBRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_NV21TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    NV21ToARGBRow = NV21ToARGBRow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      NV21ToARGBRow = NV21ToARGBRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_NV21TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    NV21ToARGBRow = NV21ToARGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      NV21ToARGBRow = NV21ToARGBRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_NV21TOARGBROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    NV21ToARGBRow = NV21ToARGBRow_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      NV21ToARGBRow = NV21ToARGBRow_MSA;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    NV21ToARGBRow(src_y, src_vu, dst_argb, yuvconstants, width);
+    dst_argb += dst_stride_argb;
+    src_y += src_stride_y;
+    if (y & 1) {
+      src_vu += src_stride_vu;
+    }
+  }
+  return 0;
+}
+
+// Convert NV12 to ARGB.
+LIBYUV_API
+int NV12ToARGB(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_uv,
+               int src_stride_uv,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
+  return NV12ToARGBMatrix(src_y, src_stride_y, src_uv, src_stride_uv, dst_argb,
+                          dst_stride_argb, &kYuvI601Constants, width, height);
+}
+
+// Convert NV21 to ARGB.
+LIBYUV_API
+int NV21ToARGB(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_vu,
+               int src_stride_vu,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
+  return NV21ToARGBMatrix(src_y, src_stride_y, src_vu, src_stride_vu, dst_argb,
+                          dst_stride_argb, &kYuvI601Constants, width, height);
+}
+
+// Convert NV12 to ABGR.
+// To output ABGR instead of ARGB, swap the UV and use a mirrored yuv matrix.
+// To swap the UV use NV12 instead of NV21.
+LIBYUV_API
+int NV12ToABGR(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_uv,
+               int src_stride_uv,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height) {
+  return NV21ToARGBMatrix(src_y, src_stride_y, src_uv, src_stride_uv, dst_abgr,
+                          dst_stride_abgr, &kYvuI601Constants, width, height);
+}
+
+// Convert NV21 to ABGR.
+LIBYUV_API
+int NV21ToABGR(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_vu,
+               int src_stride_vu,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height) {
+  return NV12ToARGBMatrix(src_y, src_stride_y, src_vu, src_stride_vu, dst_abgr,
+                          dst_stride_abgr, &kYvuI601Constants, width, height);
+}
+
+// TODO(fbarchard): Consider SSSE3 2 step conversion.
+// Convert NV12 to RGB24 with matrix
+static int NV12ToRGB24Matrix(const uint8_t* src_y,
+                             int src_stride_y,
+                             const uint8_t* src_uv,
+                             int src_stride_uv,
+                             uint8_t* dst_rgb24,
+                             int dst_stride_rgb24,
+                             const struct YuvConstants* yuvconstants,
+                             int width,
+                             int height) {
+  int y;
+  void (*NV12ToRGB24Row)(
+      const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf,
+      const struct YuvConstants* yuvconstants, int width) = NV12ToRGB24Row_C;
+  if (!src_y || !src_uv || !dst_rgb24 || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24;
+    dst_stride_rgb24 = -dst_stride_rgb24;
+  }
+#if defined(HAS_NV12TORGB24ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    NV12ToRGB24Row = NV12ToRGB24Row_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      NV12ToRGB24Row = NV12ToRGB24Row_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    NV12ToRGB24Row(src_y, src_uv, dst_rgb24, yuvconstants, width);
+    dst_rgb24 += dst_stride_rgb24;
+    src_y += src_stride_y;
+    if (y & 1) {
+      src_uv += src_stride_uv;
+    }
+  }
+  return 0;
+}
+
+// Convert NV21 to RGB24 with matrix
+static int NV21ToRGB24Matrix(const uint8_t* src_y,
+                             int src_stride_y,
+                             const uint8_t* src_vu,
+                             int src_stride_vu,
+                             uint8_t* dst_rgb24,
+                             int dst_stride_rgb24,
+                             const struct YuvConstants* yuvconstants,
+                             int width,
+                             int height) {
+  int y;
+  void (*NV21ToRGB24Row)(
+      const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf,
+      const struct YuvConstants* yuvconstants, int width) = NV21ToRGB24Row_C;
+  if (!src_y || !src_vu || !dst_rgb24 || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24;
+    dst_stride_rgb24 = -dst_stride_rgb24;
+  }
+#if defined(HAS_NV21TORGB24ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    NV21ToRGB24Row = NV21ToRGB24Row_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      NV21ToRGB24Row = NV21ToRGB24Row_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    NV21ToRGB24Row(src_y, src_vu, dst_rgb24, yuvconstants, width);
+    dst_rgb24 += dst_stride_rgb24;
+    src_y += src_stride_y;
+    if (y & 1) {
+      src_vu += src_stride_vu;
+    }
+  }
+  return 0;
+}
+
+// TODO(fbarchard): NV12ToRAW can be implemented by mirrored matrix.
+// Convert NV12 to RGB24.
+LIBYUV_API
+int NV12ToRGB24(const uint8_t* src_y,
+                int src_stride_y,
+                const uint8_t* src_uv,
+                int src_stride_uv,
+                uint8_t* dst_rgb24,
+                int dst_stride_rgb24,
+                int width,
+                int height) {
+  return NV12ToRGB24Matrix(src_y, src_stride_y, src_uv, src_stride_uv,
+                           dst_rgb24, dst_stride_rgb24, &kYuvI601Constants,
+                           width, height);
+}
+
+// Convert NV21 to RGB24.
+LIBYUV_API
+int NV21ToRGB24(const uint8_t* src_y,
+                int src_stride_y,
+                const uint8_t* src_vu,
+                int src_stride_vu,
+                uint8_t* dst_rgb24,
+                int dst_stride_rgb24,
+                int width,
+                int height) {
+  return NV21ToRGB24Matrix(src_y, src_stride_y, src_vu, src_stride_vu,
+                           dst_rgb24, dst_stride_rgb24, &kYuvI601Constants,
+                           width, height);
+}
+
+// Convert M420 to ARGB.
+LIBYUV_API
+int M420ToARGB(const uint8_t* src_m420,
+               int src_stride_m420,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
+  int y;
+  void (*NV12ToARGBRow)(
+      const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf,
+      const struct YuvConstants* yuvconstants, int width) = NV12ToARGBRow_C;
+  if (!src_m420 || !dst_argb || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+ if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } +#if defined(HAS_NV12TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + NV12ToARGBRow = NV12ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + NV12ToARGBRow = NV12ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_NV12TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + NV12ToARGBRow = NV12ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + NV12ToARGBRow = NV12ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_NV12TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + NV12ToARGBRow = NV12ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + NV12ToARGBRow = NV12ToARGBRow_NEON; + } + } +#endif +#if defined(HAS_NV12TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + NV12ToARGBRow = NV12ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + NV12ToARGBRow = NV12ToARGBRow_MSA; + } + } +#endif + + for (y = 0; y < height - 1; y += 2) { + NV12ToARGBRow(src_m420, src_m420 + src_stride_m420 * 2, dst_argb, + &kYuvI601Constants, width); + NV12ToARGBRow(src_m420 + src_stride_m420, src_m420 + src_stride_m420 * 2, + dst_argb + dst_stride_argb, &kYuvI601Constants, width); + dst_argb += dst_stride_argb * 2; + src_m420 += src_stride_m420 * 3; + } + if (height & 1) { + NV12ToARGBRow(src_m420, src_m420 + src_stride_m420 * 2, dst_argb, + &kYuvI601Constants, width); + } + return 0; +} + +// Convert YUY2 to ARGB. +LIBYUV_API +int YUY2ToARGB(const uint8_t* src_yuy2, + int src_stride_yuy2, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + int y; + void (*YUY2ToARGBRow)(const uint8_t* src_yuy2, uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, int width) = + YUY2ToARGBRow_C; + if (!src_yuy2 || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2; + src_stride_yuy2 = -src_stride_yuy2; + } + // Coalesce rows. + if (src_stride_yuy2 == width * 2 && dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_yuy2 = dst_stride_argb = 0; + } +#if defined(HAS_YUY2TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + YUY2ToARGBRow = YUY2ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + YUY2ToARGBRow = YUY2ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_YUY2TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + YUY2ToARGBRow = YUY2ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + YUY2ToARGBRow = YUY2ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_YUY2TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + YUY2ToARGBRow = YUY2ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + YUY2ToARGBRow = YUY2ToARGBRow_NEON; + } + } +#endif +#if defined(HAS_YUY2TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + YUY2ToARGBRow = YUY2ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + YUY2ToARGBRow = YUY2ToARGBRow_MSA; + } + } +#endif + for (y = 0; y < height; ++y) { + YUY2ToARGBRow(src_yuy2, dst_argb, &kYuvI601Constants, width); + src_yuy2 += src_stride_yuy2; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Convert UYVY to ARGB. 
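+//
+// Illustrative usage sketch (assumed buffer names, tightly packed strides):
+// UYVY is packed 4:2:2 at 2 bytes per pixel, so the source stride is
+// width * 2.
+//
+//   UYVYToARGB(uyvy_frame, width * 2, argb_frame, width * 4, width, height);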
+LIBYUV_API +int UYVYToARGB(const uint8_t* src_uyvy, + int src_stride_uyvy, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + int y; + void (*UYVYToARGBRow)(const uint8_t* src_uyvy, uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, int width) = + UYVYToARGBRow_C; + if (!src_uyvy || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy; + src_stride_uyvy = -src_stride_uyvy; + } + // Coalesce rows. + if (src_stride_uyvy == width * 2 && dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_uyvy = dst_stride_argb = 0; + } +#if defined(HAS_UYVYTOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + UYVYToARGBRow = UYVYToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + UYVYToARGBRow = UYVYToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_UYVYTOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + UYVYToARGBRow = UYVYToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + UYVYToARGBRow = UYVYToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_UYVYTOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + UYVYToARGBRow = UYVYToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + UYVYToARGBRow = UYVYToARGBRow_NEON; + } + } +#endif +#if defined(HAS_UYVYTOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + UYVYToARGBRow = UYVYToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + UYVYToARGBRow = UYVYToARGBRow_MSA; + } + } +#endif + for (y = 0; y < height; ++y) { + UYVYToARGBRow(src_uyvy, dst_argb, &kYuvI601Constants, width); + src_uyvy += src_stride_uyvy; + dst_argb += dst_stride_argb; + } + return 0; +} +static void WeavePixels(const uint8_t* src_u, + const uint8_t* src_v, + int src_pixel_stride_uv, + uint8_t* dst_uv, + int width) { + int i; + for (i = 0; i < width; ++i) { + dst_uv[0] = *src_u; + dst_uv[1] = *src_v; + dst_uv += 2; + src_u += src_pixel_stride_uv; + src_v += src_pixel_stride_uv; + } +} + +// Convert Android420 to ARGB. +LIBYUV_API +int Android420ToARGBMatrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + int src_pixel_stride_uv, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + uint8_t* dst_uv; + const ptrdiff_t vu_off = src_v - src_u; + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + halfheight = (height + 1) >> 1; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } + + // I420 + if (src_pixel_stride_uv == 1) { + return I420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_argb, dst_stride_argb, + yuvconstants, width, height); + // NV21 + } + if (src_pixel_stride_uv == 2 && vu_off == -1 && + src_stride_u == src_stride_v) { + return NV21ToARGBMatrix(src_y, src_stride_y, src_v, src_stride_v, dst_argb, + dst_stride_argb, yuvconstants, width, height); + // NV12 + } + if (src_pixel_stride_uv == 2 && vu_off == 1 && src_stride_u == src_stride_v) { + return NV12ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, dst_argb, + dst_stride_argb, yuvconstants, width, height); + } + + // General case fallback creates NV12 + align_buffer_64(plane_uv, halfwidth * 2 * halfheight); + dst_uv = plane_uv; + for (y = 0; y < halfheight; ++y) { + WeavePixels(src_u, src_v, src_pixel_stride_uv, dst_uv, halfwidth); + src_u += src_stride_u; + src_v += src_stride_v; + dst_uv += halfwidth * 2; + } + NV12ToARGBMatrix(src_y, src_stride_y, plane_uv, halfwidth * 2, dst_argb, + dst_stride_argb, yuvconstants, width, height); + free_aligned_buffer_64(plane_uv); + return 0; +} + +// Convert Android420 to ARGB. +LIBYUV_API +int Android420ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + int src_pixel_stride_uv, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return Android420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, src_pixel_stride_uv, dst_argb, + dst_stride_argb, &kYuvI601Constants, width, + height); +} + +// Convert Android420 to ABGR. +LIBYUV_API +int Android420ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + int src_pixel_stride_uv, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height) { + return Android420ToARGBMatrix(src_y, src_stride_y, src_v, src_stride_v, src_u, + src_stride_u, src_pixel_stride_uv, dst_abgr, + dst_stride_abgr, &kYvuI601Constants, width, + height); +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/media/libyuv/libyuv/source/convert_from.cc b/media/libyuv/libyuv/source/convert_from.cc new file mode 100644 index 0000000000..b5587ced62 --- /dev/null +++ b/media/libyuv/libyuv/source/convert_from.cc @@ -0,0 +1,1429 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/convert_from.h" + +#include "libyuv/basic_types.h" +#include "libyuv/convert.h" // For I420Copy +#include "libyuv/cpu_id.h" +#include "libyuv/planar_functions.h" +#include "libyuv/rotate.h" +#include "libyuv/row.h" +#include "libyuv/scale.h" // For ScalePlane() +#include "libyuv/video_common.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#define SUBSAMPLE(v, a, s) (v < 0) ? (-((-v + a) >> s)) : ((v + a) >> s) +static __inline int Abs(int v) { + return v >= 0 ? 
v : -v; +} + +// I420 To any I4xx YUV format with mirroring. +static int I420ToI4xx(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int src_y_width, + int src_y_height, + int dst_uv_width, + int dst_uv_height) { + const int dst_y_width = Abs(src_y_width); + const int dst_y_height = Abs(src_y_height); + const int src_uv_width = SUBSAMPLE(src_y_width, 1, 1); + const int src_uv_height = SUBSAMPLE(src_y_height, 1, 1); + if (src_y_width == 0 || src_y_height == 0 || dst_uv_width <= 0 || + dst_uv_height <= 0) { + return -1; + } + if (dst_y) { + ScalePlane(src_y, src_stride_y, src_y_width, src_y_height, dst_y, + dst_stride_y, dst_y_width, dst_y_height, kFilterBilinear); + } + ScalePlane(src_u, src_stride_u, src_uv_width, src_uv_height, dst_u, + dst_stride_u, dst_uv_width, dst_uv_height, kFilterBilinear); + ScalePlane(src_v, src_stride_v, src_uv_width, src_uv_height, dst_v, + dst_stride_v, dst_uv_width, dst_uv_height, kFilterBilinear); + return 0; +} + +// Convert 8 bit YUV to 10 bit. +LIBYUV_API +int I420ToI010(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int width, + int height) { + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + halfheight = (height + 1) >> 1; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (halfheight - 1) * src_stride_u; + src_v = src_v + (halfheight - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + + // Convert Y plane. + Convert8To16Plane(src_y, src_stride_y, dst_y, dst_stride_y, 1024, width, + height); + // Convert UV planes. 
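+  // In 4:2:0 the chroma planes are half width and half height, so only
+  // halfwidth x halfheight samples are widened per plane.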
+ Convert8To16Plane(src_u, src_stride_u, dst_u, dst_stride_u, 1024, halfwidth, + halfheight); + Convert8To16Plane(src_v, src_stride_v, dst_v, dst_stride_v, 1024, halfwidth, + halfheight); + return 0; +} + +// 420 chroma is 1/2 width, 1/2 height +// 422 chroma is 1/2 width, 1x height +LIBYUV_API +int I420ToI422(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + const int dst_uv_width = (Abs(width) + 1) >> 1; + const int dst_uv_height = Abs(height); + return I420ToI4xx(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u, + dst_v, dst_stride_v, width, height, dst_uv_width, + dst_uv_height); +} + +// 420 chroma is 1/2 width, 1/2 height +// 444 chroma is 1x width, 1x height +LIBYUV_API +int I420ToI444(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + const int dst_uv_width = Abs(width); + const int dst_uv_height = Abs(height); + return I420ToI4xx(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u, + dst_v, dst_stride_v, width, height, dst_uv_width, + dst_uv_height); +} + +// Copy to I400. Source can be I420,422,444,400,NV12,NV21 +LIBYUV_API +int I400Copy(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_y, + int dst_stride_y, + int width, + int height) { + if (!src_y || !dst_y || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_stride_y = -src_stride_y; + } + CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + return 0; +} + +LIBYUV_API +int I422ToYUY2(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_yuy2, + int dst_stride_yuy2, + int width, + int height) { + int y; + void (*I422ToYUY2Row)(const uint8_t* src_y, const uint8_t* src_u, + const uint8_t* src_v, uint8_t* dst_yuy2, int width) = + I422ToYUY2Row_C; + if (!src_y || !src_u || !src_v || !dst_yuy2 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_yuy2 = dst_yuy2 + (height - 1) * dst_stride_yuy2; + dst_stride_yuy2 = -dst_stride_yuy2; + } + // Coalesce rows. 
+ if (src_stride_y == width && src_stride_u * 2 == width && + src_stride_v * 2 == width && dst_stride_yuy2 == width * 2) { + width *= height; + height = 1; + src_stride_y = src_stride_u = src_stride_v = dst_stride_yuy2 = 0; + } +#if defined(HAS_I422TOYUY2ROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + I422ToYUY2Row = I422ToYUY2Row_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + I422ToYUY2Row = I422ToYUY2Row_SSE2; + } + } +#endif +#if defined(HAS_I422TOYUY2ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToYUY2Row = I422ToYUY2Row_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + I422ToYUY2Row = I422ToYUY2Row_AVX2; + } + } +#endif +#if defined(HAS_I422TOYUY2ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToYUY2Row = I422ToYUY2Row_Any_NEON; + if (IS_ALIGNED(width, 16)) { + I422ToYUY2Row = I422ToYUY2Row_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + I422ToYUY2Row(src_y, src_u, src_v, dst_yuy2, width); + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + dst_yuy2 += dst_stride_yuy2; + } + return 0; +} + +LIBYUV_API +int I420ToYUY2(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_yuy2, + int dst_stride_yuy2, + int width, + int height) { + int y; + void (*I422ToYUY2Row)(const uint8_t* src_y, const uint8_t* src_u, + const uint8_t* src_v, uint8_t* dst_yuy2, int width) = + I422ToYUY2Row_C; + if (!src_y || !src_u || !src_v || !dst_yuy2 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_yuy2 = dst_yuy2 + (height - 1) * dst_stride_yuy2; + dst_stride_yuy2 = -dst_stride_yuy2; + } +#if defined(HAS_I422TOYUY2ROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + I422ToYUY2Row = I422ToYUY2Row_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + I422ToYUY2Row = I422ToYUY2Row_SSE2; + } + } +#endif +#if defined(HAS_I422TOYUY2ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToYUY2Row = I422ToYUY2Row_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + I422ToYUY2Row = I422ToYUY2Row_AVX2; + } + } +#endif +#if defined(HAS_I422TOYUY2ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToYUY2Row = I422ToYUY2Row_Any_NEON; + if (IS_ALIGNED(width, 16)) { + I422ToYUY2Row = I422ToYUY2Row_NEON; + } + } +#endif +#if defined(HAS_I422TOYUY2ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToYUY2Row = I422ToYUY2Row_Any_MSA; + if (IS_ALIGNED(width, 32)) { + I422ToYUY2Row = I422ToYUY2Row_MSA; + } + } +#endif + + for (y = 0; y < height - 1; y += 2) { + I422ToYUY2Row(src_y, src_u, src_v, dst_yuy2, width); + I422ToYUY2Row(src_y + src_stride_y, src_u, src_v, + dst_yuy2 + dst_stride_yuy2, width); + src_y += src_stride_y * 2; + src_u += src_stride_u; + src_v += src_stride_v; + dst_yuy2 += dst_stride_yuy2 * 2; + } + if (height & 1) { + I422ToYUY2Row(src_y, src_u, src_v, dst_yuy2, width); + } + return 0; +} + +LIBYUV_API +int I422ToUYVY(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_uyvy, + int dst_stride_uyvy, + int width, + int height) { + int y; + void (*I422ToUYVYRow)(const uint8_t* src_y, const uint8_t* src_u, + const uint8_t* src_v, uint8_t* dst_uyvy, int width) = + I422ToUYVYRow_C; + if (!src_y || !src_u || !src_v || !dst_uyvy || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + dst_uyvy = dst_uyvy + (height - 1) * dst_stride_uyvy; + dst_stride_uyvy = -dst_stride_uyvy; + } + // Coalesce rows. + if (src_stride_y == width && src_stride_u * 2 == width && + src_stride_v * 2 == width && dst_stride_uyvy == width * 2) { + width *= height; + height = 1; + src_stride_y = src_stride_u = src_stride_v = dst_stride_uyvy = 0; + } +#if defined(HAS_I422TOUYVYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + I422ToUYVYRow = I422ToUYVYRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + I422ToUYVYRow = I422ToUYVYRow_SSE2; + } + } +#endif +#if defined(HAS_I422TOUYVYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToUYVYRow = I422ToUYVYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + I422ToUYVYRow = I422ToUYVYRow_AVX2; + } + } +#endif +#if defined(HAS_I422TOUYVYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToUYVYRow = I422ToUYVYRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + I422ToUYVYRow = I422ToUYVYRow_NEON; + } + } +#endif +#if defined(HAS_I422TOUYVYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToUYVYRow = I422ToUYVYRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + I422ToUYVYRow = I422ToUYVYRow_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width); + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + dst_uyvy += dst_stride_uyvy; + } + return 0; +} + +LIBYUV_API +int I420ToUYVY(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_uyvy, + int dst_stride_uyvy, + int width, + int height) { + int y; + void (*I422ToUYVYRow)(const uint8_t* src_y, const uint8_t* src_u, + const uint8_t* src_v, uint8_t* dst_uyvy, int width) = + I422ToUYVYRow_C; + if (!src_y || !src_u || !src_v || !dst_uyvy || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_uyvy = dst_uyvy + (height - 1) * dst_stride_uyvy; + dst_stride_uyvy = -dst_stride_uyvy; + } +#if defined(HAS_I422TOUYVYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + I422ToUYVYRow = I422ToUYVYRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + I422ToUYVYRow = I422ToUYVYRow_SSE2; + } + } +#endif +#if defined(HAS_I422TOUYVYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToUYVYRow = I422ToUYVYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + I422ToUYVYRow = I422ToUYVYRow_AVX2; + } + } +#endif +#if defined(HAS_I422TOUYVYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToUYVYRow = I422ToUYVYRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + I422ToUYVYRow = I422ToUYVYRow_NEON; + } + } +#endif +#if defined(HAS_I422TOUYVYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToUYVYRow = I422ToUYVYRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + I422ToUYVYRow = I422ToUYVYRow_MSA; + } + } +#endif + + for (y = 0; y < height - 1; y += 2) { + I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width); + I422ToUYVYRow(src_y + src_stride_y, src_u, src_v, + dst_uyvy + dst_stride_uyvy, width); + src_y += src_stride_y * 2; + src_u += src_stride_u; + src_v += src_stride_v; + dst_uyvy += dst_stride_uyvy * 2; + } + if (height & 1) { + I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width); + } + return 0; +} + +// TODO(fbarchard): test negative height for invert. 
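+//
+// Illustrative usage sketch (assumed buffer names, tightly packed strides):
+// the Y plane is copied unchanged and the half-resolution U and V planes are
+// interleaved into a single UV plane whose packed stride is 2 * halfwidth.
+//
+//   int halfwidth = (width + 1) / 2;
+//   I420ToNV12(src_y, width, src_u, halfwidth, src_v, halfwidth,
+//              dst_y, width, dst_uv, halfwidth * 2, width, height);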
+LIBYUV_API +int I420ToNV12(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_uv, + int dst_stride_uv, + int width, + int height) { + if (!src_y || !src_u || !src_v || !dst_y || !dst_uv || width <= 0 || + height == 0) { + return -1; + } + int halfwidth = (width + 1) / 2; + int halfheight = height > 0 ? (height + 1) / 2 : (height - 1) / 2; + if (dst_y) { + CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + } + MergeUVPlane(src_u, src_stride_u, src_v, src_stride_v, dst_uv, dst_stride_uv, + halfwidth, halfheight); + return 0; +} + +LIBYUV_API +int I420ToNV21(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_vu, + int dst_stride_vu, + int width, + int height) { + return I420ToNV12(src_y, src_stride_y, src_v, src_stride_v, src_u, + src_stride_u, dst_y, dst_stride_y, dst_vu, dst_stride_vu, + width, height); +} + +// Convert I422 to RGBA with matrix +static int I420ToRGBAMatrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgba, + int dst_stride_rgba, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*I422ToRGBARow)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I422ToRGBARow_C; + if (!src_y || !src_u || !src_v || !dst_rgba || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_rgba = dst_rgba + (height - 1) * dst_stride_rgba; + dst_stride_rgba = -dst_stride_rgba; + } +#if defined(HAS_I422TORGBAROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I422ToRGBARow = I422ToRGBARow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToRGBARow = I422ToRGBARow_SSSE3; + } + } +#endif +#if defined(HAS_I422TORGBAROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToRGBARow = I422ToRGBARow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I422ToRGBARow = I422ToRGBARow_AVX2; + } + } +#endif +#if defined(HAS_I422TORGBAROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToRGBARow = I422ToRGBARow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToRGBARow = I422ToRGBARow_NEON; + } + } +#endif +#if defined(HAS_I422TORGBAROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToRGBARow = I422ToRGBARow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I422ToRGBARow = I422ToRGBARow_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + I422ToRGBARow(src_y, src_u, src_v, dst_rgba, yuvconstants, width); + dst_rgba += dst_stride_rgba; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + +// Convert I420 to RGBA. +LIBYUV_API +int I420ToRGBA(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgba, + int dst_stride_rgba, + int width, + int height) { + return I420ToRGBAMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_rgba, dst_stride_rgba, + &kYuvI601Constants, width, height); +} + +// Convert I420 to BGRA. 
+LIBYUV_API +int I420ToBGRA(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_bgra, + int dst_stride_bgra, + int width, + int height) { + return I420ToRGBAMatrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_bgra, dst_stride_bgra, + &kYvuI601Constants, // Use Yvu matrix + width, height); +} + +// Convert I420 to RGB24 with matrix +static int I420ToRGB24Matrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*I422ToRGB24Row)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I422ToRGB24Row_C; + if (!src_y || !src_u || !src_v || !dst_rgb24 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24; + dst_stride_rgb24 = -dst_stride_rgb24; + } +#if defined(HAS_I422TORGB24ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I422ToRGB24Row = I422ToRGB24Row_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToRGB24Row = I422ToRGB24Row_SSSE3; + } + } +#endif +#if defined(HAS_I422TORGB24ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToRGB24Row = I422ToRGB24Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I422ToRGB24Row = I422ToRGB24Row_AVX2; + } + } +#endif +#if defined(HAS_I422TORGB24ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToRGB24Row = I422ToRGB24Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToRGB24Row = I422ToRGB24Row_NEON; + } + } +#endif +#if defined(HAS_I422TORGB24ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToRGB24Row = I422ToRGB24Row_Any_MSA; + if (IS_ALIGNED(width, 16)) { + I422ToRGB24Row = I422ToRGB24Row_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + I422ToRGB24Row(src_y, src_u, src_v, dst_rgb24, yuvconstants, width); + dst_rgb24 += dst_stride_rgb24; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + +// Convert I420 to RGB24. +LIBYUV_API +int I420ToRGB24(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + int width, + int height) { + return I420ToRGB24Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_rgb24, dst_stride_rgb24, + &kYuvI601Constants, width, height); +} + +// Convert I420 to RAW. +LIBYUV_API +int I420ToRAW(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_raw, + int dst_stride_raw, + int width, + int height) { + return I420ToRGB24Matrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_raw, dst_stride_raw, + &kYvuI601Constants, // Use Yvu matrix + width, height); +} + +// Convert H420 to RGB24. 
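+//
+// Illustrative usage sketch (assumed buffer names, tightly packed strides):
+// H420 has the same memory layout as I420 but is interpreted with the BT.709
+// (H709) matrix, which is all this wrapper changes.
+//
+//   H420ToRGB24(src_y, width, src_u, (width + 1) / 2, src_v, (width + 1) / 2,
+//               rgb24_frame, width * 3, width, height);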
+LIBYUV_API +int H420ToRGB24(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + int width, + int height) { + return I420ToRGB24Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_rgb24, dst_stride_rgb24, + &kYuvH709Constants, width, height); +} + +// Convert H420 to RAW. +LIBYUV_API +int H420ToRAW(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_raw, + int dst_stride_raw, + int width, + int height) { + return I420ToRGB24Matrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_raw, dst_stride_raw, + &kYvuH709Constants, // Use Yvu matrix + width, height); +} + +// Convert I420 to ARGB1555. +LIBYUV_API +int I420ToARGB1555(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb1555, + int dst_stride_argb1555, + int width, + int height) { + int y; + void (*I422ToARGB1555Row)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) = I422ToARGB1555Row_C; + if (!src_y || !src_u || !src_v || !dst_argb1555 || width <= 0 || + height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb1555 = dst_argb1555 + (height - 1) * dst_stride_argb1555; + dst_stride_argb1555 = -dst_stride_argb1555; + } +#if defined(HAS_I422TOARGB1555ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I422ToARGB1555Row = I422ToARGB1555Row_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToARGB1555Row = I422ToARGB1555Row_SSSE3; + } + } +#endif +#if defined(HAS_I422TOARGB1555ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToARGB1555Row = I422ToARGB1555Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I422ToARGB1555Row = I422ToARGB1555Row_AVX2; + } + } +#endif +#if defined(HAS_I422TOARGB1555ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToARGB1555Row = I422ToARGB1555Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToARGB1555Row = I422ToARGB1555Row_NEON; + } + } +#endif +#if defined(HAS_I422TOARGB1555ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToARGB1555Row = I422ToARGB1555Row_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I422ToARGB1555Row = I422ToARGB1555Row_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + I422ToARGB1555Row(src_y, src_u, src_v, dst_argb1555, &kYuvI601Constants, + width); + dst_argb1555 += dst_stride_argb1555; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + +// Convert I420 to ARGB4444. +LIBYUV_API +int I420ToARGB4444(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb4444, + int dst_stride_argb4444, + int width, + int height) { + int y; + void (*I422ToARGB4444Row)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) = I422ToARGB4444Row_C; + if (!src_y || !src_u || !src_v || !dst_argb4444 || width <= 0 || + height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + dst_argb4444 = dst_argb4444 + (height - 1) * dst_stride_argb4444; + dst_stride_argb4444 = -dst_stride_argb4444; + } +#if defined(HAS_I422TOARGB4444ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I422ToARGB4444Row = I422ToARGB4444Row_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToARGB4444Row = I422ToARGB4444Row_SSSE3; + } + } +#endif +#if defined(HAS_I422TOARGB4444ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToARGB4444Row = I422ToARGB4444Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I422ToARGB4444Row = I422ToARGB4444Row_AVX2; + } + } +#endif +#if defined(HAS_I422TOARGB4444ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToARGB4444Row = I422ToARGB4444Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToARGB4444Row = I422ToARGB4444Row_NEON; + } + } +#endif +#if defined(HAS_I422TOARGB4444ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToARGB4444Row = I422ToARGB4444Row_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I422ToARGB4444Row = I422ToARGB4444Row_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + I422ToARGB4444Row(src_y, src_u, src_v, dst_argb4444, &kYuvI601Constants, + width); + dst_argb4444 += dst_stride_argb4444; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + +// Convert I420 to RGB565. +LIBYUV_API +int I420ToRGB565(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb565, + int dst_stride_rgb565, + int width, + int height) { + int y; + void (*I422ToRGB565Row)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I422ToRGB565Row_C; + if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565; + dst_stride_rgb565 = -dst_stride_rgb565; + } +#if defined(HAS_I422TORGB565ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I422ToRGB565Row = I422ToRGB565Row_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToRGB565Row = I422ToRGB565Row_SSSE3; + } + } +#endif +#if defined(HAS_I422TORGB565ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToRGB565Row = I422ToRGB565Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I422ToRGB565Row = I422ToRGB565Row_AVX2; + } + } +#endif +#if defined(HAS_I422TORGB565ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToRGB565Row = I422ToRGB565Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToRGB565Row = I422ToRGB565Row_NEON; + } + } +#endif +#if defined(HAS_I422TORGB565ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToRGB565Row = I422ToRGB565Row_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I422ToRGB565Row = I422ToRGB565Row_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + I422ToRGB565Row(src_y, src_u, src_v, dst_rgb565, &kYuvI601Constants, width); + dst_rgb565 += dst_stride_rgb565; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + +// Convert I422 to RGB565. 
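+//
+// Illustrative usage sketch (assumed buffer names, tightly packed strides):
+// I422 chroma is half width but full height, so U and V advance every row.
+//
+//   I422ToRGB565(src_y, width, src_u, (width + 1) / 2, src_v, (width + 1) / 2,
+//                rgb565_frame, width * 2, width, height);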
+LIBYUV_API +int I422ToRGB565(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb565, + int dst_stride_rgb565, + int width, + int height) { + int y; + void (*I422ToRGB565Row)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I422ToRGB565Row_C; + if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565; + dst_stride_rgb565 = -dst_stride_rgb565; + } +#if defined(HAS_I422TORGB565ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I422ToRGB565Row = I422ToRGB565Row_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToRGB565Row = I422ToRGB565Row_SSSE3; + } + } +#endif +#if defined(HAS_I422TORGB565ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToRGB565Row = I422ToRGB565Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I422ToRGB565Row = I422ToRGB565Row_AVX2; + } + } +#endif +#if defined(HAS_I422TORGB565ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToRGB565Row = I422ToRGB565Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToRGB565Row = I422ToRGB565Row_NEON; + } + } +#endif +#if defined(HAS_I422TORGB565ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToRGB565Row = I422ToRGB565Row_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I422ToRGB565Row = I422ToRGB565Row_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + I422ToRGB565Row(src_y, src_u, src_v, dst_rgb565, &kYuvI601Constants, width); + dst_rgb565 += dst_stride_rgb565; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + return 0; +} + +// Ordered 8x8 dither for 888 to 565. Values from 0 to 7. +static const uint8_t kDither565_4x4[16] = { + 0, 4, 1, 5, 6, 2, 7, 3, 1, 5, 0, 4, 7, 3, 6, 2, +}; + +// Convert I420 to RGB565 with dithering. +LIBYUV_API +int I420ToRGB565Dither(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb565, + int dst_stride_rgb565, + const uint8_t* dither4x4, + int width, + int height) { + int y; + void (*I422ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I422ToARGBRow_C; + void (*ARGBToRGB565DitherRow)(const uint8_t* src_argb, uint8_t* dst_rgb, + const uint32_t dither4, int width) = + ARGBToRGB565DitherRow_C; + if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565; + dst_stride_rgb565 = -dst_stride_rgb565; + } + if (!dither4x4) { + dither4x4 = kDither565_4x4; + } +#if defined(HAS_I422TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I422ToARGBRow = I422ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToARGBRow = I422ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_I422TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToARGBRow = I422ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I422ToARGBRow = I422ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_I422TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToARGBRow = I422ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToARGBRow = I422ToARGBRow_NEON; + } + } +#endif +#if defined(HAS_I422TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToARGBRow = I422ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I422ToARGBRow = I422ToARGBRow_MSA; + } + } +#endif +#if defined(HAS_ARGBTORGB565DITHERROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_SSE2; + if (IS_ALIGNED(width, 4)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_SSE2; + } + } +#endif +#if defined(HAS_ARGBTORGB565DITHERROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTORGB565DITHERROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_NEON; + } + } +#endif +#if defined(HAS_ARGBTORGB565DITHERROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_MSA; + } + } +#endif + { + // Allocate a row of argb. + align_buffer_64(row_argb, width * 4); + for (y = 0; y < height; ++y) { + I422ToARGBRow(src_y, src_u, src_v, row_argb, &kYuvI601Constants, width); + ARGBToRGB565DitherRow(row_argb, dst_rgb565, + *(uint32_t*)(dither4x4 + ((y & 3) << 2)), // NOLINT + width); // NOLINT + dst_rgb565 += dst_stride_rgb565; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + free_aligned_buffer_64(row_argb); + } + return 0; +} + +// Convert I420 to AR30 with matrix +static int I420ToAR30Matrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_ar30, + int dst_stride_ar30, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*I422ToAR30Row)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I422ToAR30Row_C; + + if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30; + dst_stride_ar30 = -dst_stride_ar30; + } + +#if defined(HAS_I422TOAR30ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I422ToAR30Row = I422ToAR30Row_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToAR30Row = I422ToAR30Row_SSSE3; + } + } +#endif +#if defined(HAS_I422TOAR30ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToAR30Row = I422ToAR30Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I422ToAR30Row = I422ToAR30Row_AVX2; + } + } +#endif + + for (y = 0; y < height; ++y) { + I422ToAR30Row(src_y, src_u, src_v, dst_ar30, yuvconstants, width); + dst_ar30 += dst_stride_ar30; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + +// Convert I420 to AR30. +LIBYUV_API +int I420ToAR30(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_ar30, + int dst_stride_ar30, + int width, + int height) { + return I420ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_ar30, dst_stride_ar30, + &kYuvI601Constants, width, height); +} + +// Convert H420 to AR30. +LIBYUV_API +int H420ToAR30(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_ar30, + int dst_stride_ar30, + int width, + int height) { + return I420ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_ar30, dst_stride_ar30, + &kYvuH709Constants, width, height); +} + +// Convert I420 to specified format +LIBYUV_API +int ConvertFromI420(const uint8_t* y, + int y_stride, + const uint8_t* u, + int u_stride, + const uint8_t* v, + int v_stride, + uint8_t* dst_sample, + int dst_sample_stride, + int width, + int height, + uint32_t fourcc) { + uint32_t format = CanonicalFourCC(fourcc); + int r = 0; + if (!y || !u || !v || !dst_sample || width <= 0 || height == 0) { + return -1; + } + switch (format) { + // Single plane formats + case FOURCC_YUY2: + r = I420ToYUY2(y, y_stride, u, u_stride, v, v_stride, dst_sample, + dst_sample_stride ? dst_sample_stride : width * 2, width, + height); + break; + case FOURCC_UYVY: + r = I420ToUYVY(y, y_stride, u, u_stride, v, v_stride, dst_sample, + dst_sample_stride ? dst_sample_stride : width * 2, width, + height); + break; + case FOURCC_RGBP: + r = I420ToRGB565(y, y_stride, u, u_stride, v, v_stride, dst_sample, + dst_sample_stride ? dst_sample_stride : width * 2, width, + height); + break; + case FOURCC_RGBO: + r = I420ToARGB1555(y, y_stride, u, u_stride, v, v_stride, dst_sample, + dst_sample_stride ? dst_sample_stride : width * 2, + width, height); + break; + case FOURCC_R444: + r = I420ToARGB4444(y, y_stride, u, u_stride, v, v_stride, dst_sample, + dst_sample_stride ? dst_sample_stride : width * 2, + width, height); + break; + case FOURCC_24BG: + r = I420ToRGB24(y, y_stride, u, u_stride, v, v_stride, dst_sample, + dst_sample_stride ? dst_sample_stride : width * 3, width, + height); + break; + case FOURCC_RAW: + r = I420ToRAW(y, y_stride, u, u_stride, v, v_stride, dst_sample, + dst_sample_stride ? dst_sample_stride : width * 3, width, + height); + break; + case FOURCC_ARGB: + r = I420ToARGB(y, y_stride, u, u_stride, v, v_stride, dst_sample, + dst_sample_stride ? dst_sample_stride : width * 4, width, + height); + break; + case FOURCC_BGRA: + r = I420ToBGRA(y, y_stride, u, u_stride, v, v_stride, dst_sample, + dst_sample_stride ? 
dst_sample_stride : width * 4, width, + height); + break; + case FOURCC_ABGR: + r = I420ToABGR(y, y_stride, u, u_stride, v, v_stride, dst_sample, + dst_sample_stride ? dst_sample_stride : width * 4, width, + height); + break; + case FOURCC_RGBA: + r = I420ToRGBA(y, y_stride, u, u_stride, v, v_stride, dst_sample, + dst_sample_stride ? dst_sample_stride : width * 4, width, + height); + break; + case FOURCC_AR30: + r = I420ToAR30(y, y_stride, u, u_stride, v, v_stride, dst_sample, + dst_sample_stride ? dst_sample_stride : width * 4, width, + height); + break; + case FOURCC_I400: + r = I400Copy(y, y_stride, dst_sample, + dst_sample_stride ? dst_sample_stride : width, width, + height); + break; + case FOURCC_NV12: { + uint8_t* dst_uv = dst_sample + width * height; + r = I420ToNV12(y, y_stride, u, u_stride, v, v_stride, dst_sample, + dst_sample_stride ? dst_sample_stride : width, dst_uv, + dst_sample_stride ? dst_sample_stride : width, width, + height); + break; + } + case FOURCC_NV21: { + uint8_t* dst_vu = dst_sample + width * height; + r = I420ToNV21(y, y_stride, u, u_stride, v, v_stride, dst_sample, + dst_sample_stride ? dst_sample_stride : width, dst_vu, + dst_sample_stride ? dst_sample_stride : width, width, + height); + break; + } + // TODO(fbarchard): Add M420. + // Triplanar formats + case FOURCC_I420: + case FOURCC_YV12: { + dst_sample_stride = dst_sample_stride ? dst_sample_stride : width; + int halfstride = (dst_sample_stride + 1) / 2; + int halfheight = (height + 1) / 2; + uint8_t* dst_u; + uint8_t* dst_v; + if (format == FOURCC_YV12) { + dst_v = dst_sample + dst_sample_stride * height; + dst_u = dst_v + halfstride * halfheight; + } else { + dst_u = dst_sample + dst_sample_stride * height; + dst_v = dst_u + halfstride * halfheight; + } + r = I420Copy(y, y_stride, u, u_stride, v, v_stride, dst_sample, + dst_sample_stride, dst_u, halfstride, dst_v, halfstride, + width, height); + break; + } + case FOURCC_I422: + case FOURCC_YV16: { + dst_sample_stride = dst_sample_stride ? dst_sample_stride : width; + int halfstride = (dst_sample_stride + 1) / 2; + uint8_t* dst_u; + uint8_t* dst_v; + if (format == FOURCC_YV16) { + dst_v = dst_sample + dst_sample_stride * height; + dst_u = dst_v + halfstride * height; + } else { + dst_u = dst_sample + dst_sample_stride * height; + dst_v = dst_u + halfstride * height; + } + r = I420ToI422(y, y_stride, u, u_stride, v, v_stride, dst_sample, + dst_sample_stride, dst_u, halfstride, dst_v, halfstride, + width, height); + break; + } + case FOURCC_I444: + case FOURCC_YV24: { + dst_sample_stride = dst_sample_stride ? dst_sample_stride : width; + uint8_t* dst_u; + uint8_t* dst_v; + if (format == FOURCC_YV24) { + dst_v = dst_sample + dst_sample_stride * height; + dst_u = dst_v + dst_sample_stride * height; + } else { + dst_u = dst_sample + dst_sample_stride * height; + dst_v = dst_u + dst_sample_stride * height; + } + r = I420ToI444(y, y_stride, u, u_stride, v, v_stride, dst_sample, + dst_sample_stride, dst_u, dst_sample_stride, dst_v, + dst_sample_stride, width, height); + break; + } + // Formats not supported - MJPG, biplanar, some rgb formats. + default: + return -1; // unknown fourcc - return failure code. 
+ } + return r; +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/media/libyuv/libyuv/source/convert_from_argb.cc b/media/libyuv/libyuv/source/convert_from_argb.cc new file mode 100644 index 0000000000..16b838458f --- /dev/null +++ b/media/libyuv/libyuv/source/convert_from_argb.cc @@ -0,0 +1,1593 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/convert_from_argb.h" + +#include "libyuv/basic_types.h" +#include "libyuv/cpu_id.h" +#include "libyuv/planar_functions.h" +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// ARGB little endian (bgra in memory) to I444 +LIBYUV_API +int ARGBToI444(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + int y; + void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = + ARGBToYRow_C; + void (*ARGBToUV444Row)(const uint8_t* src_argb, uint8_t* dst_u, + uint8_t* dst_v, int width) = ARGBToUV444Row_C; + if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && dst_stride_y == width && + dst_stride_u == width && dst_stride_v == width) { + width *= height; + height = 1; + src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0; + } +#if defined(HAS_ARGBTOUV444ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUV444Row = ARGBToUV444Row_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUV444Row = ARGBToUV444Row_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOUV444ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToUV444Row = ARGBToUV444Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToUV444Row = ARGBToUV444Row_NEON; + } + } +#endif +#if defined(HAS_ARGBTOUV444ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToUV444Row = ARGBToUV444Row_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToUV444Row = ARGBToUV444Row_MSA; + } + } +#endif +#if defined(HAS_ARGBTOYROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToYRow = ARGBToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToYRow = ARGBToYRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToYRow = ARGBToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToYRow = ARGBToYRow_NEON; + } + } +#endif +#if defined(HAS_ARGBTOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToYRow = ARGBToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGBToUV444Row(src_argb, dst_u, dst_v, width); + ARGBToYRow(src_argb, dst_y, width); + src_argb += src_stride_argb; + dst_y += dst_stride_y; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + return 0; +} + +// ARGB little endian (bgra 
in memory) to I422 +LIBYUV_API +int ARGBToI422(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + int y; + void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = + ARGBToYRow_C; + if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && dst_stride_y == width && + dst_stride_u * 2 == width && dst_stride_v * 2 == width) { + width *= height; + height = 1; + src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0; + } +#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_SSSE3; + ARGBToYRow = ARGBToYRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToUVRow = ARGBToUVRow_Any_AVX2; + ARGBToYRow = ARGBToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_AVX2; + ARGBToYRow = ARGBToYRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToYRow = ARGBToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToYRow = ARGBToYRow_NEON; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToUVRow = ARGBToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_NEON; + } + } +#endif + +#if defined(HAS_ARGBTOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToYRow = ARGBToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_MSA; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToUVRow = ARGBToUVRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGBToUVRow(src_argb, 0, dst_u, dst_v, width); + ARGBToYRow(src_argb, dst_y, width); + src_argb += src_stride_argb; + dst_y += dst_stride_y; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + return 0; +} + +LIBYUV_API +int ARGBToNV12(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_uv, + int dst_stride_uv, + int width, + int height) { + int y; + int halfwidth = (width + 1) >> 1; + void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = + ARGBToYRow_C; + void (*MergeUVRow_)(const uint8_t* src_u, const uint8_t* src_v, + uint8_t* dst_uv, int width) = MergeUVRow_C; + if (!src_argb || !dst_y || !dst_uv || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
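+  // A negative height requests a vertically flipped result: the block below
+  // points src_argb at its last row and negates the stride so the source is
+  // read bottom-up while the destination is written top-down.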
+ if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } +#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_SSSE3; + ARGBToYRow = ARGBToYRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToUVRow = ARGBToUVRow_Any_AVX2; + ARGBToYRow = ARGBToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_AVX2; + ARGBToYRow = ARGBToYRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToYRow = ARGBToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToYRow = ARGBToYRow_NEON; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToUVRow = ARGBToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_NEON; + } + } +#endif +#if defined(HAS_ARGBTOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToYRow = ARGBToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_MSA; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToUVRow = ARGBToUVRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_MSA; + } + } +#endif +#if defined(HAS_MERGEUVROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + MergeUVRow_ = MergeUVRow_Any_SSE2; + if (IS_ALIGNED(halfwidth, 16)) { + MergeUVRow_ = MergeUVRow_SSE2; + } + } +#endif +#if defined(HAS_MERGEUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + MergeUVRow_ = MergeUVRow_Any_AVX2; + if (IS_ALIGNED(halfwidth, 32)) { + MergeUVRow_ = MergeUVRow_AVX2; + } + } +#endif +#if defined(HAS_MERGEUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + MergeUVRow_ = MergeUVRow_Any_NEON; + if (IS_ALIGNED(halfwidth, 16)) { + MergeUVRow_ = MergeUVRow_NEON; + } + } +#endif +#if defined(HAS_MERGEUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + MergeUVRow_ = MergeUVRow_Any_MSA; + if (IS_ALIGNED(halfwidth, 16)) { + MergeUVRow_ = MergeUVRow_MSA; + } + } +#endif + { + // Allocate a rows of uv. + align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2); + uint8_t* row_v = row_u + ((halfwidth + 31) & ~31); + + for (y = 0; y < height - 1; y += 2) { + ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width); + MergeUVRow_(row_u, row_v, dst_uv, halfwidth); + ARGBToYRow(src_argb, dst_y, width); + ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width); + src_argb += src_stride_argb * 2; + dst_y += dst_stride_y * 2; + dst_uv += dst_stride_uv; + } + if (height & 1) { + ARGBToUVRow(src_argb, 0, row_u, row_v, width); + MergeUVRow_(row_u, row_v, dst_uv, halfwidth); + ARGBToYRow(src_argb, dst_y, width); + } + free_aligned_buffer_64(row_u); + } + return 0; +} + +// Same as NV12 but U and V swapped. 
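+// The implementation below mirrors ARGBToNV12; the only difference is that
+// MergeUVRow_ receives the V row first, so the interleaved chroma plane is
+// written as VU pairs. Sizing sketch (illustrative, assuming packed planes):
+// dst_y needs width x height bytes, and dst_vu needs ((width + 1) / 2) * 2
+// bytes per row for (height + 1) / 2 rows.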
+LIBYUV_API +int ARGBToNV21(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_vu, + int dst_stride_vu, + int width, + int height) { + int y; + int halfwidth = (width + 1) >> 1; + void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = + ARGBToYRow_C; + void (*MergeUVRow_)(const uint8_t* src_u, const uint8_t* src_v, + uint8_t* dst_vu, int width) = MergeUVRow_C; + if (!src_argb || !dst_y || !dst_vu || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } +#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_SSSE3; + ARGBToYRow = ARGBToYRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToUVRow = ARGBToUVRow_Any_AVX2; + ARGBToYRow = ARGBToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_AVX2; + ARGBToYRow = ARGBToYRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToYRow = ARGBToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToYRow = ARGBToYRow_NEON; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToUVRow = ARGBToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_NEON; + } + } +#endif +#if defined(HAS_ARGBTOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToYRow = ARGBToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_MSA; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToUVRow = ARGBToUVRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_MSA; + } + } +#endif +#if defined(HAS_MERGEUVROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + MergeUVRow_ = MergeUVRow_Any_SSE2; + if (IS_ALIGNED(halfwidth, 16)) { + MergeUVRow_ = MergeUVRow_SSE2; + } + } +#endif +#if defined(HAS_MERGEUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + MergeUVRow_ = MergeUVRow_Any_AVX2; + if (IS_ALIGNED(halfwidth, 32)) { + MergeUVRow_ = MergeUVRow_AVX2; + } + } +#endif +#if defined(HAS_MERGEUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + MergeUVRow_ = MergeUVRow_Any_NEON; + if (IS_ALIGNED(halfwidth, 16)) { + MergeUVRow_ = MergeUVRow_NEON; + } + } +#endif +#if defined(HAS_MERGEUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + MergeUVRow_ = MergeUVRow_Any_MSA; + if (IS_ALIGNED(halfwidth, 16)) { + MergeUVRow_ = MergeUVRow_MSA; + } + } +#endif + { + // Allocate a rows of uv. 
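+    // One allocation holds two half-width rows; each is rounded up to a
+    // multiple of 32 bytes, presumably so the SIMD UV/merge row variants can
+    // write whole blocks without spilling into the neighbouring row.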
+ align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2); + uint8_t* row_v = row_u + ((halfwidth + 31) & ~31); + + for (y = 0; y < height - 1; y += 2) { + ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width); + MergeUVRow_(row_v, row_u, dst_vu, halfwidth); + ARGBToYRow(src_argb, dst_y, width); + ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width); + src_argb += src_stride_argb * 2; + dst_y += dst_stride_y * 2; + dst_vu += dst_stride_vu; + } + if (height & 1) { + ARGBToUVRow(src_argb, 0, row_u, row_v, width); + MergeUVRow_(row_v, row_u, dst_vu, halfwidth); + ARGBToYRow(src_argb, dst_y, width); + } + free_aligned_buffer_64(row_u); + } + return 0; +} + +// Convert ARGB to YUY2. +LIBYUV_API +int ARGBToYUY2(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_yuy2, + int dst_stride_yuy2, + int width, + int height) { + int y; + void (*ARGBToUVRow)(const uint8_t* src_argb, int src_stride_argb, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = + ARGBToYRow_C; + void (*I422ToYUY2Row)(const uint8_t* src_y, const uint8_t* src_u, + const uint8_t* src_v, uint8_t* dst_yuy2, int width) = + I422ToYUY2Row_C; + + if (!src_argb || !dst_yuy2 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_yuy2 = dst_yuy2 + (height - 1) * dst_stride_yuy2; + dst_stride_yuy2 = -dst_stride_yuy2; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && dst_stride_yuy2 == width * 2) { + width *= height; + height = 1; + src_stride_argb = dst_stride_yuy2 = 0; + } +#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_SSSE3; + ARGBToYRow = ARGBToYRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToUVRow = ARGBToUVRow_Any_AVX2; + ARGBToYRow = ARGBToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_AVX2; + ARGBToYRow = ARGBToYRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToYRow = ARGBToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToYRow = ARGBToYRow_NEON; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToUVRow = ARGBToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_NEON; + } + } +#endif +#if defined(HAS_ARGBTOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToYRow = ARGBToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_MSA; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToUVRow = ARGBToUVRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_MSA; + } + } +#endif +#if defined(HAS_I422TOYUY2ROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + I422ToYUY2Row = I422ToYUY2Row_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + I422ToYUY2Row = I422ToYUY2Row_SSE2; + } + } +#endif +#if defined(HAS_I422TOYUY2ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToYUY2Row = I422ToYUY2Row_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + I422ToYUY2Row = I422ToYUY2Row_AVX2; + } + } +#endif +#if defined(HAS_I422TOYUY2ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToYUY2Row = I422ToYUY2Row_Any_NEON; + if (IS_ALIGNED(width, 16)) { + I422ToYUY2Row = I422ToYUY2Row_NEON; 
+ } + } +#endif +#if defined(HAS_I422TOYUY2ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToYUY2Row = I422ToYUY2Row_Any_MSA; + if (IS_ALIGNED(width, 32)) { + I422ToYUY2Row = I422ToYUY2Row_MSA; + } + } +#endif + + { + // Allocate a rows of yuv. + align_buffer_64(row_y, ((width + 63) & ~63) * 2); + uint8_t* row_u = row_y + ((width + 63) & ~63); + uint8_t* row_v = row_u + ((width + 63) & ~63) / 2; + + for (y = 0; y < height; ++y) { + ARGBToUVRow(src_argb, 0, row_u, row_v, width); + ARGBToYRow(src_argb, row_y, width); + I422ToYUY2Row(row_y, row_u, row_v, dst_yuy2, width); + src_argb += src_stride_argb; + dst_yuy2 += dst_stride_yuy2; + } + + free_aligned_buffer_64(row_y); + } + return 0; +} + +// Convert ARGB to UYVY. +LIBYUV_API +int ARGBToUYVY(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_uyvy, + int dst_stride_uyvy, + int width, + int height) { + int y; + void (*ARGBToUVRow)(const uint8_t* src_argb, int src_stride_argb, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = + ARGBToYRow_C; + void (*I422ToUYVYRow)(const uint8_t* src_y, const uint8_t* src_u, + const uint8_t* src_v, uint8_t* dst_uyvy, int width) = + I422ToUYVYRow_C; + + if (!src_argb || !dst_uyvy || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_uyvy = dst_uyvy + (height - 1) * dst_stride_uyvy; + dst_stride_uyvy = -dst_stride_uyvy; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && dst_stride_uyvy == width * 2) { + width *= height; + height = 1; + src_stride_argb = dst_stride_uyvy = 0; + } +#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_SSSE3; + ARGBToYRow = ARGBToYRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToUVRow = ARGBToUVRow_Any_AVX2; + ARGBToYRow = ARGBToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_AVX2; + ARGBToYRow = ARGBToYRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToYRow = ARGBToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToYRow = ARGBToYRow_NEON; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToUVRow = ARGBToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_NEON; + } + } +#endif +#if defined(HAS_ARGBTOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToYRow = ARGBToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_MSA; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToUVRow = ARGBToUVRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_MSA; + } + } +#endif +#if defined(HAS_I422TOUYVYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + I422ToUYVYRow = I422ToUYVYRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + I422ToUYVYRow = I422ToUYVYRow_SSE2; + } + } +#endif +#if defined(HAS_I422TOUYVYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToUYVYRow = I422ToUYVYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + I422ToUYVYRow = I422ToUYVYRow_AVX2; + } + } +#endif +#if defined(HAS_I422TOUYVYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToUYVYRow = I422ToUYVYRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + I422ToUYVYRow = 
I422ToUYVYRow_NEON; + } + } +#endif +#if defined(HAS_I422TOUYVYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToUYVYRow = I422ToUYVYRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + I422ToUYVYRow = I422ToUYVYRow_MSA; + } + } +#endif + + { + // Allocate a rows of yuv. + align_buffer_64(row_y, ((width + 63) & ~63) * 2); + uint8_t* row_u = row_y + ((width + 63) & ~63); + uint8_t* row_v = row_u + ((width + 63) & ~63) / 2; + + for (y = 0; y < height; ++y) { + ARGBToUVRow(src_argb, 0, row_u, row_v, width); + ARGBToYRow(src_argb, row_y, width); + I422ToUYVYRow(row_y, row_u, row_v, dst_uyvy, width); + src_argb += src_stride_argb; + dst_uyvy += dst_stride_uyvy; + } + + free_aligned_buffer_64(row_y); + } + return 0; +} + +// Convert ARGB to I400. +LIBYUV_API +int ARGBToI400(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_y, + int dst_stride_y, + int width, + int height) { + int y; + void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = + ARGBToYRow_C; + if (!src_argb || !dst_y || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && dst_stride_y == width) { + width *= height; + height = 1; + src_stride_argb = dst_stride_y = 0; + } +#if defined(HAS_ARGBTOYROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToYRow = ARGBToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToYRow = ARGBToYRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToYRow = ARGBToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToYRow = ARGBToYRow_NEON; + } + } +#endif +#if defined(HAS_ARGBTOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToYRow = ARGBToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGBToYRow(src_argb, dst_y, width); + src_argb += src_stride_argb; + dst_y += dst_stride_y; + } + return 0; +} + +// Shuffle table for converting ARGB to RGBA. +static const uvec8 kShuffleMaskARGBToRGBA = { + 3u, 0u, 1u, 2u, 7u, 4u, 5u, 6u, 11u, 8u, 9u, 10u, 15u, 12u, 13u, 14u}; + +// Convert ARGB to RGBA. +LIBYUV_API +int ARGBToRGBA(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_rgba, + int dst_stride_rgba, + int width, + int height) { + return ARGBShuffle(src_argb, src_stride_argb, dst_rgba, dst_stride_rgba, + (const uint8_t*)(&kShuffleMaskARGBToRGBA), width, height); +} + +// Convert ARGB To RGB24. +LIBYUV_API +int ARGBToRGB24(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + int width, + int height) { + int y; + void (*ARGBToRGB24Row)(const uint8_t* src_argb, uint8_t* dst_rgb, int width) = + ARGBToRGB24Row_C; + if (!src_argb || !dst_rgb24 || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. 
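+  // When both buffers are contiguous (stride equals the exact bytes per
+  // row), the frame can be treated as a single row of width * height pixels,
+  // so the row function below runs once over the whole image.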
+ if (src_stride_argb == width * 4 && dst_stride_rgb24 == width * 3) { + width *= height; + height = 1; + src_stride_argb = dst_stride_rgb24 = 0; + } +#if defined(HAS_ARGBTORGB24ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToRGB24Row = ARGBToRGB24Row_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToRGB24Row = ARGBToRGB24Row_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTORGB24ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToRGB24Row = ARGBToRGB24Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToRGB24Row = ARGBToRGB24Row_NEON; + } + } +#endif +#if defined(HAS_ARGBTORGB24ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToRGB24Row = ARGBToRGB24Row_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToRGB24Row = ARGBToRGB24Row_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGBToRGB24Row(src_argb, dst_rgb24, width); + src_argb += src_stride_argb; + dst_rgb24 += dst_stride_rgb24; + } + return 0; +} + +// Convert ARGB To RAW. +LIBYUV_API +int ARGBToRAW(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_raw, + int dst_stride_raw, + int width, + int height) { + int y; + void (*ARGBToRAWRow)(const uint8_t* src_argb, uint8_t* dst_rgb, int width) = + ARGBToRAWRow_C; + if (!src_argb || !dst_raw || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && dst_stride_raw == width * 3) { + width *= height; + height = 1; + src_stride_argb = dst_stride_raw = 0; + } +#if defined(HAS_ARGBTORAWROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToRAWRow = ARGBToRAWRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToRAWRow = ARGBToRAWRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTORAWROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToRAWRow = ARGBToRAWRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToRAWRow = ARGBToRAWRow_NEON; + } + } +#endif +#if defined(HAS_ARGBTORAWROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToRAWRow = ARGBToRAWRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToRAWRow = ARGBToRAWRow_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGBToRAWRow(src_argb, dst_raw, width); + src_argb += src_stride_argb; + dst_raw += dst_stride_raw; + } + return 0; +} + +// Ordered 8x8 dither for 888 to 565. Values from 0 to 7. +static const uint8_t kDither565_4x4[16] = { + 0, 4, 1, 5, 6, 2, 7, 3, 1, 5, 0, 4, 7, 3, 6, 2, +}; + +// Convert ARGB To RGB565 with 4x4 dither matrix (16 bytes). 
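+// Each output row uses one 4-byte row of the dither table, selected by
+// (y & 3) and passed to the row function as a uint32_t; passing NULL for
+// dither4x4 falls back to the built-in kDither565_4x4 table. Illustrative
+// call with assumed, tightly packed buffers:
+//   ARGBToRGB565Dither(argb, w * 4, rgb565, w * 2, NULL, w, h);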
+LIBYUV_API +int ARGBToRGB565Dither(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_rgb565, + int dst_stride_rgb565, + const uint8_t* dither4x4, + int width, + int height) { + int y; + void (*ARGBToRGB565DitherRow)(const uint8_t* src_argb, uint8_t* dst_rgb, + const uint32_t dither4, int width) = + ARGBToRGB565DitherRow_C; + if (!src_argb || !dst_rgb565 || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + if (!dither4x4) { + dither4x4 = kDither565_4x4; + } +#if defined(HAS_ARGBTORGB565DITHERROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_SSE2; + if (IS_ALIGNED(width, 4)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_SSE2; + } + } +#endif +#if defined(HAS_ARGBTORGB565DITHERROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTORGB565DITHERROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_NEON; + } + } +#endif +#if defined(HAS_ARGBTORGB565DITHERROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGBToRGB565DitherRow(src_argb, dst_rgb565, + *(uint32_t*)(dither4x4 + ((y & 3) << 2)), // NOLINT + width); /* NOLINT */ + src_argb += src_stride_argb; + dst_rgb565 += dst_stride_rgb565; + } + return 0; +} + +// Convert ARGB To RGB565. +// TODO(fbarchard): Consider using dither function low level with zeros. +LIBYUV_API +int ARGBToRGB565(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_rgb565, + int dst_stride_rgb565, + int width, + int height) { + int y; + void (*ARGBToRGB565Row)(const uint8_t* src_argb, uint8_t* dst_rgb, + int width) = ARGBToRGB565Row_C; + if (!src_argb || !dst_rgb565 || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. 
+ if (src_stride_argb == width * 4 && dst_stride_rgb565 == width * 2) { + width *= height; + height = 1; + src_stride_argb = dst_stride_rgb565 = 0; + } +#if defined(HAS_ARGBTORGB565ROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ARGBToRGB565Row = ARGBToRGB565Row_Any_SSE2; + if (IS_ALIGNED(width, 4)) { + ARGBToRGB565Row = ARGBToRGB565Row_SSE2; + } + } +#endif +#if defined(HAS_ARGBTORGB565ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToRGB565Row = ARGBToRGB565Row_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBToRGB565Row = ARGBToRGB565Row_AVX2; + } + } +#endif +#if defined(HAS_ARGBTORGB565ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToRGB565Row = ARGBToRGB565Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToRGB565Row = ARGBToRGB565Row_NEON; + } + } +#endif +#if defined(HAS_ARGBTORGB565ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToRGB565Row = ARGBToRGB565Row_Any_MSA; + if (IS_ALIGNED(width, 8)) { + ARGBToRGB565Row = ARGBToRGB565Row_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGBToRGB565Row(src_argb, dst_rgb565, width); + src_argb += src_stride_argb; + dst_rgb565 += dst_stride_rgb565; + } + return 0; +} + +// Convert ARGB To ARGB1555. +LIBYUV_API +int ARGBToARGB1555(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb1555, + int dst_stride_argb1555, + int width, + int height) { + int y; + void (*ARGBToARGB1555Row)(const uint8_t* src_argb, uint8_t* dst_rgb, + int width) = ARGBToARGB1555Row_C; + if (!src_argb || !dst_argb1555 || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && dst_stride_argb1555 == width * 2) { + width *= height; + height = 1; + src_stride_argb = dst_stride_argb1555 = 0; + } +#if defined(HAS_ARGBTOARGB1555ROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ARGBToARGB1555Row = ARGBToARGB1555Row_Any_SSE2; + if (IS_ALIGNED(width, 4)) { + ARGBToARGB1555Row = ARGBToARGB1555Row_SSE2; + } + } +#endif +#if defined(HAS_ARGBTOARGB1555ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToARGB1555Row = ARGBToARGB1555Row_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBToARGB1555Row = ARGBToARGB1555Row_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOARGB1555ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToARGB1555Row = ARGBToARGB1555Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToARGB1555Row = ARGBToARGB1555Row_NEON; + } + } +#endif +#if defined(HAS_ARGBTOARGB1555ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToARGB1555Row = ARGBToARGB1555Row_Any_MSA; + if (IS_ALIGNED(width, 8)) { + ARGBToARGB1555Row = ARGBToARGB1555Row_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGBToARGB1555Row(src_argb, dst_argb1555, width); + src_argb += src_stride_argb; + dst_argb1555 += dst_stride_argb1555; + } + return 0; +} + +// Convert ARGB To ARGB4444. +LIBYUV_API +int ARGBToARGB4444(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb4444, + int dst_stride_argb4444, + int width, + int height) { + int y; + void (*ARGBToARGB4444Row)(const uint8_t* src_argb, uint8_t* dst_rgb, + int width) = ARGBToARGB4444Row_C; + if (!src_argb || !dst_argb4444 || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. 
+ if (src_stride_argb == width * 4 && dst_stride_argb4444 == width * 2) { + width *= height; + height = 1; + src_stride_argb = dst_stride_argb4444 = 0; + } +#if defined(HAS_ARGBTOARGB4444ROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ARGBToARGB4444Row = ARGBToARGB4444Row_Any_SSE2; + if (IS_ALIGNED(width, 4)) { + ARGBToARGB4444Row = ARGBToARGB4444Row_SSE2; + } + } +#endif +#if defined(HAS_ARGBTOARGB4444ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToARGB4444Row = ARGBToARGB4444Row_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBToARGB4444Row = ARGBToARGB4444Row_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOARGB4444ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToARGB4444Row = ARGBToARGB4444Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToARGB4444Row = ARGBToARGB4444Row_NEON; + } + } +#endif +#if defined(HAS_ARGBTOARGB4444ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToARGB4444Row = ARGBToARGB4444Row_Any_MSA; + if (IS_ALIGNED(width, 8)) { + ARGBToARGB4444Row = ARGBToARGB4444Row_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGBToARGB4444Row(src_argb, dst_argb4444, width); + src_argb += src_stride_argb; + dst_argb4444 += dst_stride_argb4444; + } + return 0; +} + +// Convert ABGR To AR30. +LIBYUV_API +int ABGRToAR30(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_ar30, + int dst_stride_ar30, + int width, + int height) { + int y; + void (*ABGRToAR30Row)(const uint8_t* src_abgr, uint8_t* dst_rgb, int width) = + ABGRToAR30Row_C; + if (!src_abgr || !dst_ar30 || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_abgr = src_abgr + (height - 1) * src_stride_abgr; + src_stride_abgr = -src_stride_abgr; + } + // Coalesce rows. + if (src_stride_abgr == width * 4 && dst_stride_ar30 == width * 4) { + width *= height; + height = 1; + src_stride_abgr = dst_stride_ar30 = 0; + } +#if defined(HAS_ABGRTOAR30ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ABGRToAR30Row = ABGRToAR30Row_Any_SSSE3; + if (IS_ALIGNED(width, 4)) { + ABGRToAR30Row = ABGRToAR30Row_SSSE3; + } + } +#endif +#if defined(HAS_ABGRTOAR30ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ABGRToAR30Row = ABGRToAR30Row_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ABGRToAR30Row = ABGRToAR30Row_AVX2; + } + } +#endif + for (y = 0; y < height; ++y) { + ABGRToAR30Row(src_abgr, dst_ar30, width); + src_abgr += src_stride_abgr; + dst_ar30 += dst_stride_ar30; + } + return 0; +} + +// Convert ARGB To AR30. +LIBYUV_API +int ARGBToAR30(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_ar30, + int dst_stride_ar30, + int width, + int height) { + int y; + void (*ARGBToAR30Row)(const uint8_t* src_argb, uint8_t* dst_rgb, int width) = + ARGBToAR30Row_C; + if (!src_argb || !dst_ar30 || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. 
+ if (src_stride_argb == width * 4 && dst_stride_ar30 == width * 4) { + width *= height; + height = 1; + src_stride_argb = dst_stride_ar30 = 0; + } +#if defined(HAS_ARGBTOAR30ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToAR30Row = ARGBToAR30Row_Any_SSSE3; + if (IS_ALIGNED(width, 4)) { + ARGBToAR30Row = ARGBToAR30Row_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOAR30ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToAR30Row = ARGBToAR30Row_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBToAR30Row = ARGBToAR30Row_AVX2; + } + } +#endif + for (y = 0; y < height; ++y) { + ARGBToAR30Row(src_argb, dst_ar30, width); + src_argb += src_stride_argb; + dst_ar30 += dst_stride_ar30; + } + return 0; +} + +// Convert ARGB to J420. (JPeg full range I420). +LIBYUV_API +int ARGBToJ420(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_yj, + int dst_stride_yj, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + int y; + void (*ARGBToUVJRow)(const uint8_t* src_argb0, int src_stride_argb, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVJRow_C; + void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_yj, int width) = + ARGBToYJRow_C; + if (!src_argb || !dst_yj || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } +#if defined(HAS_ARGBTOYJROW_SSSE3) && defined(HAS_ARGBTOUVJROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3; + ARGBToYJRow = ARGBToYJRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVJRow = ARGBToUVJRow_SSSE3; + ARGBToYJRow = ARGBToYJRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYJROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToYJRow = ARGBToYJRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToYJRow = ARGBToYJRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOYJROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToYJRow = ARGBToYJRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToYJRow = ARGBToYJRow_NEON; + } + } +#endif +#if defined(HAS_ARGBTOUVJROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToUVJRow = ARGBToUVJRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToUVJRow = ARGBToUVJRow_NEON; + } + } +#endif +#if defined(HAS_ARGBTOYJROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToYJRow = ARGBToYJRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToYJRow = ARGBToYJRow_MSA; + } + } +#endif +#if defined(HAS_ARGBTOUVJROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToUVJRow = ARGBToUVJRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + ARGBToUVJRow = ARGBToUVJRow_MSA; + } + } +#endif + + for (y = 0; y < height - 1; y += 2) { + ARGBToUVJRow(src_argb, src_stride_argb, dst_u, dst_v, width); + ARGBToYJRow(src_argb, dst_yj, width); + ARGBToYJRow(src_argb + src_stride_argb, dst_yj + dst_stride_yj, width); + src_argb += src_stride_argb * 2; + dst_yj += dst_stride_yj * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { + ARGBToUVJRow(src_argb, 0, dst_u, dst_v, width); + ARGBToYJRow(src_argb, dst_yj, width); + } + return 0; +} + +// Convert ARGB to J422. (JPeg full range I422). 
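+// The J-variants produce full-range (JPEG) BT.601 output via ARGBToYJRow /
+// ARGBToUVJRow, so Y spans 0-255 rather than 16-235. Illustrative call with
+// assumed, tightly packed buffers:
+//   ARGBToJ422(argb, w * 4, yj, w, u, (w + 1) / 2, v, (w + 1) / 2, w, h);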
+LIBYUV_API +int ARGBToJ422(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_yj, + int dst_stride_yj, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + int y; + void (*ARGBToUVJRow)(const uint8_t* src_argb0, int src_stride_argb, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVJRow_C; + void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_yj, int width) = + ARGBToYJRow_C; + if (!src_argb || !dst_yj || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && dst_stride_yj == width && + dst_stride_u * 2 == width && dst_stride_v * 2 == width) { + width *= height; + height = 1; + src_stride_argb = dst_stride_yj = dst_stride_u = dst_stride_v = 0; + } +#if defined(HAS_ARGBTOYJROW_SSSE3) && defined(HAS_ARGBTOUVJROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3; + ARGBToYJRow = ARGBToYJRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVJRow = ARGBToUVJRow_SSSE3; + ARGBToYJRow = ARGBToYJRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYJROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToYJRow = ARGBToYJRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToYJRow = ARGBToYJRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOYJROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToYJRow = ARGBToYJRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToYJRow = ARGBToYJRow_NEON; + } + } +#endif +#if defined(HAS_ARGBTOUVJROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToUVJRow = ARGBToUVJRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToUVJRow = ARGBToUVJRow_NEON; + } + } +#endif +#if defined(HAS_ARGBTOYJROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToYJRow = ARGBToYJRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToYJRow = ARGBToYJRow_MSA; + } + } +#endif +#if defined(HAS_ARGBTOUVJROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToUVJRow = ARGBToUVJRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + ARGBToUVJRow = ARGBToUVJRow_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGBToUVJRow(src_argb, 0, dst_u, dst_v, width); + ARGBToYJRow(src_argb, dst_yj, width); + src_argb += src_stride_argb; + dst_yj += dst_stride_yj; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + return 0; +} + +// Convert ARGB to J400. +LIBYUV_API +int ARGBToJ400(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_yj, + int dst_stride_yj, + int width, + int height) { + int y; + void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_yj, int width) = + ARGBToYJRow_C; + if (!src_argb || !dst_yj || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. 
+ if (src_stride_argb == width * 4 && dst_stride_yj == width) { + width *= height; + height = 1; + src_stride_argb = dst_stride_yj = 0; + } +#if defined(HAS_ARGBTOYJROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToYJRow = ARGBToYJRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToYJRow = ARGBToYJRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYJROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToYJRow = ARGBToYJRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToYJRow = ARGBToYJRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOYJROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToYJRow = ARGBToYJRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToYJRow = ARGBToYJRow_NEON; + } + } +#endif +#if defined(HAS_ARGBTOYJROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToYJRow = ARGBToYJRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToYJRow = ARGBToYJRow_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGBToYJRow(src_argb, dst_yj, width); + src_argb += src_stride_argb; + dst_yj += dst_stride_yj; + } + return 0; +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/media/libyuv/libyuv/source/convert_jpeg.cc b/media/libyuv/libyuv/source/convert_jpeg.cc new file mode 100644 index 0000000000..ae3cc18cd2 --- /dev/null +++ b/media/libyuv/libyuv/source/convert_jpeg.cc @@ -0,0 +1,332 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/convert.h" +#include "libyuv/convert_argb.h" + +#ifdef HAVE_JPEG +#include "libyuv/mjpeg_decoder.h" +#endif + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#ifdef HAVE_JPEG +struct I420Buffers { + uint8_t* y; + int y_stride; + uint8_t* u; + int u_stride; + uint8_t* v; + int v_stride; + int w; + int h; +}; + +static void JpegCopyI420(void* opaque, + const uint8_t* const* data, + const int* strides, + int rows) { + I420Buffers* dest = (I420Buffers*)(opaque); + I420Copy(data[0], strides[0], data[1], strides[1], data[2], strides[2], + dest->y, dest->y_stride, dest->u, dest->u_stride, dest->v, + dest->v_stride, dest->w, rows); + dest->y += rows * dest->y_stride; + dest->u += ((rows + 1) >> 1) * dest->u_stride; + dest->v += ((rows + 1) >> 1) * dest->v_stride; + dest->h -= rows; +} + +static void JpegI422ToI420(void* opaque, + const uint8_t* const* data, + const int* strides, + int rows) { + I420Buffers* dest = (I420Buffers*)(opaque); + I422ToI420(data[0], strides[0], data[1], strides[1], data[2], strides[2], + dest->y, dest->y_stride, dest->u, dest->u_stride, dest->v, + dest->v_stride, dest->w, rows); + dest->y += rows * dest->y_stride; + dest->u += ((rows + 1) >> 1) * dest->u_stride; + dest->v += ((rows + 1) >> 1) * dest->v_stride; + dest->h -= rows; +} + +static void JpegI444ToI420(void* opaque, + const uint8_t* const* data, + const int* strides, + int rows) { + I420Buffers* dest = (I420Buffers*)(opaque); + I444ToI420(data[0], strides[0], data[1], strides[1], data[2], strides[2], + dest->y, dest->y_stride, dest->u, dest->u_stride, dest->v, + dest->v_stride, dest->w, rows); + dest->y += rows * dest->y_stride; + dest->u += ((rows + 1) >> 1) * dest->u_stride; + dest->v += ((rows + 1) >> 1) * dest->v_stride; + dest->h -= rows; +} + 
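+// Each Jpeg*ToI420 callback above converts `rows` freshly decoded scanlines
+// into the destination and then advances the plane pointers; the chroma
+// pointers advance by (rows + 1) / 2 rows because the I420 output is 2x2
+// subsampled. JpegI400ToI420 below handles grayscale input by delegating to
+// I400ToI420, which fills the chroma planes with a neutral value.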
+static void JpegI400ToI420(void* opaque, + const uint8_t* const* data, + const int* strides, + int rows) { + I420Buffers* dest = (I420Buffers*)(opaque); + I400ToI420(data[0], strides[0], dest->y, dest->y_stride, dest->u, + dest->u_stride, dest->v, dest->v_stride, dest->w, rows); + dest->y += rows * dest->y_stride; + dest->u += ((rows + 1) >> 1) * dest->u_stride; + dest->v += ((rows + 1) >> 1) * dest->v_stride; + dest->h -= rows; +} + +// Query size of MJPG in pixels. +LIBYUV_API +int MJPGSize(const uint8_t* sample, + size_t sample_size, + int* width, + int* height) { + MJpegDecoder mjpeg_decoder; + LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size); + if (ret) { + *width = mjpeg_decoder.GetWidth(); + *height = mjpeg_decoder.GetHeight(); + } + mjpeg_decoder.UnloadFrame(); + return ret ? 0 : -1; // -1 for runtime failure. +} + +// MJPG (Motion JPeg) to I420 +// TODO(fbarchard): review src_width and src_height requirement. dst_width and +// dst_height may be enough. +LIBYUV_API +int MJPGToI420(const uint8_t* sample, + size_t sample_size, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int src_width, + int src_height, + int dst_width, + int dst_height) { + if (sample_size == kUnknownDataSize) { + // ERROR: MJPEG frame size unknown + return -1; + } + + // TODO(fbarchard): Port MJpeg to C. + MJpegDecoder mjpeg_decoder; + LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size); + if (ret && (mjpeg_decoder.GetWidth() != src_width || + mjpeg_decoder.GetHeight() != src_height)) { + // ERROR: MJPEG frame has unexpected dimensions + mjpeg_decoder.UnloadFrame(); + return 1; // runtime failure + } + if (ret) { + I420Buffers bufs = {dst_y, dst_stride_y, dst_u, dst_stride_u, + dst_v, dst_stride_v, dst_width, dst_height}; + // YUV420 + if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr && + mjpeg_decoder.GetNumComponents() == 3 && + mjpeg_decoder.GetVertSampFactor(0) == 2 && + mjpeg_decoder.GetHorizSampFactor(0) == 2 && + mjpeg_decoder.GetVertSampFactor(1) == 1 && + mjpeg_decoder.GetHorizSampFactor(1) == 1 && + mjpeg_decoder.GetVertSampFactor(2) == 1 && + mjpeg_decoder.GetHorizSampFactor(2) == 1) { + ret = mjpeg_decoder.DecodeToCallback(&JpegCopyI420, &bufs, dst_width, + dst_height); + // YUV422 + } else if (mjpeg_decoder.GetColorSpace() == + MJpegDecoder::kColorSpaceYCbCr && + mjpeg_decoder.GetNumComponents() == 3 && + mjpeg_decoder.GetVertSampFactor(0) == 1 && + mjpeg_decoder.GetHorizSampFactor(0) == 2 && + mjpeg_decoder.GetVertSampFactor(1) == 1 && + mjpeg_decoder.GetHorizSampFactor(1) == 1 && + mjpeg_decoder.GetVertSampFactor(2) == 1 && + mjpeg_decoder.GetHorizSampFactor(2) == 1) { + ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToI420, &bufs, dst_width, + dst_height); + // YUV444 + } else if (mjpeg_decoder.GetColorSpace() == + MJpegDecoder::kColorSpaceYCbCr && + mjpeg_decoder.GetNumComponents() == 3 && + mjpeg_decoder.GetVertSampFactor(0) == 1 && + mjpeg_decoder.GetHorizSampFactor(0) == 1 && + mjpeg_decoder.GetVertSampFactor(1) == 1 && + mjpeg_decoder.GetHorizSampFactor(1) == 1 && + mjpeg_decoder.GetVertSampFactor(2) == 1 && + mjpeg_decoder.GetHorizSampFactor(2) == 1) { + ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToI420, &bufs, dst_width, + dst_height); + // YUV400 + } else if (mjpeg_decoder.GetColorSpace() == + MJpegDecoder::kColorSpaceGrayscale && + mjpeg_decoder.GetNumComponents() == 1 && + mjpeg_decoder.GetVertSampFactor(0) == 1 && + mjpeg_decoder.GetHorizSampFactor(0) == 1) { + ret = 
mjpeg_decoder.DecodeToCallback(&JpegI400ToI420, &bufs, dst_width, + dst_height); + } else { + // TODO(fbarchard): Implement conversion for any other colorspace/sample + // factors that occur in practice. + // ERROR: Unable to convert MJPEG frame because format is not supported + mjpeg_decoder.UnloadFrame(); + return 1; + } + } + return ret ? 0 : 1; +} + +#ifdef HAVE_JPEG +struct ARGBBuffers { + uint8_t* argb; + int argb_stride; + int w; + int h; +}; + +static void JpegI420ToARGB(void* opaque, + const uint8_t* const* data, + const int* strides, + int rows) { + ARGBBuffers* dest = (ARGBBuffers*)(opaque); + I420ToARGB(data[0], strides[0], data[1], strides[1], data[2], strides[2], + dest->argb, dest->argb_stride, dest->w, rows); + dest->argb += rows * dest->argb_stride; + dest->h -= rows; +} + +static void JpegI422ToARGB(void* opaque, + const uint8_t* const* data, + const int* strides, + int rows) { + ARGBBuffers* dest = (ARGBBuffers*)(opaque); + I422ToARGB(data[0], strides[0], data[1], strides[1], data[2], strides[2], + dest->argb, dest->argb_stride, dest->w, rows); + dest->argb += rows * dest->argb_stride; + dest->h -= rows; +} + +static void JpegI444ToARGB(void* opaque, + const uint8_t* const* data, + const int* strides, + int rows) { + ARGBBuffers* dest = (ARGBBuffers*)(opaque); + I444ToARGB(data[0], strides[0], data[1], strides[1], data[2], strides[2], + dest->argb, dest->argb_stride, dest->w, rows); + dest->argb += rows * dest->argb_stride; + dest->h -= rows; +} + +static void JpegI400ToARGB(void* opaque, + const uint8_t* const* data, + const int* strides, + int rows) { + ARGBBuffers* dest = (ARGBBuffers*)(opaque); + I400ToARGB(data[0], strides[0], dest->argb, dest->argb_stride, dest->w, rows); + dest->argb += rows * dest->argb_stride; + dest->h -= rows; +} + +// MJPG (Motion JPeg) to ARGB +// TODO(fbarchard): review src_width and src_height requirement. dst_width and +// dst_height may be enough. +LIBYUV_API +int MJPGToARGB(const uint8_t* sample, + size_t sample_size, + uint8_t* dst_argb, + int dst_stride_argb, + int src_width, + int src_height, + int dst_width, + int dst_height) { + if (sample_size == kUnknownDataSize) { + // ERROR: MJPEG frame size unknown + return -1; + } + + // TODO(fbarchard): Port MJpeg to C. 
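  // Return-value convention shared by the MJPG entry points in this file:
  // 0 on success, -1 for invalid parameters (e.g. unknown sample size), and
  // 1 for runtime failures such as unexpected dimensions or an unsupported
  // colorspace/sampling layout (mirroring MJPGToI420 above).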
+ MJpegDecoder mjpeg_decoder; + LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size); + if (ret && (mjpeg_decoder.GetWidth() != src_width || + mjpeg_decoder.GetHeight() != src_height)) { + // ERROR: MJPEG frame has unexpected dimensions + mjpeg_decoder.UnloadFrame(); + return 1; // runtime failure + } + if (ret) { + ARGBBuffers bufs = {dst_argb, dst_stride_argb, dst_width, dst_height}; + // YUV420 + if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr && + mjpeg_decoder.GetNumComponents() == 3 && + mjpeg_decoder.GetVertSampFactor(0) == 2 && + mjpeg_decoder.GetHorizSampFactor(0) == 2 && + mjpeg_decoder.GetVertSampFactor(1) == 1 && + mjpeg_decoder.GetHorizSampFactor(1) == 1 && + mjpeg_decoder.GetVertSampFactor(2) == 1 && + mjpeg_decoder.GetHorizSampFactor(2) == 1) { + ret = mjpeg_decoder.DecodeToCallback(&JpegI420ToARGB, &bufs, dst_width, + dst_height); + // YUV422 + } else if (mjpeg_decoder.GetColorSpace() == + MJpegDecoder::kColorSpaceYCbCr && + mjpeg_decoder.GetNumComponents() == 3 && + mjpeg_decoder.GetVertSampFactor(0) == 1 && + mjpeg_decoder.GetHorizSampFactor(0) == 2 && + mjpeg_decoder.GetVertSampFactor(1) == 1 && + mjpeg_decoder.GetHorizSampFactor(1) == 1 && + mjpeg_decoder.GetVertSampFactor(2) == 1 && + mjpeg_decoder.GetHorizSampFactor(2) == 1) { + ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToARGB, &bufs, dst_width, + dst_height); + // YUV444 + } else if (mjpeg_decoder.GetColorSpace() == + MJpegDecoder::kColorSpaceYCbCr && + mjpeg_decoder.GetNumComponents() == 3 && + mjpeg_decoder.GetVertSampFactor(0) == 1 && + mjpeg_decoder.GetHorizSampFactor(0) == 1 && + mjpeg_decoder.GetVertSampFactor(1) == 1 && + mjpeg_decoder.GetHorizSampFactor(1) == 1 && + mjpeg_decoder.GetVertSampFactor(2) == 1 && + mjpeg_decoder.GetHorizSampFactor(2) == 1) { + ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToARGB, &bufs, dst_width, + dst_height); + // YUV400 + } else if (mjpeg_decoder.GetColorSpace() == + MJpegDecoder::kColorSpaceGrayscale && + mjpeg_decoder.GetNumComponents() == 1 && + mjpeg_decoder.GetVertSampFactor(0) == 1 && + mjpeg_decoder.GetHorizSampFactor(0) == 1) { + ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToARGB, &bufs, dst_width, + dst_height); + } else { + // TODO(fbarchard): Implement conversion for any other colorspace/sample + // factors that occur in practice. + // ERROR: Unable to convert MJPEG frame because format is not supported + mjpeg_decoder.UnloadFrame(); + return 1; + } + } + return ret ? 0 : 1; +} +#endif + +#endif + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/media/libyuv/libyuv/source/convert_to_argb.cc b/media/libyuv/libyuv/source/convert_to_argb.cc new file mode 100644 index 0000000000..677e5d56fc --- /dev/null +++ b/media/libyuv/libyuv/source/convert_to_argb.cc @@ -0,0 +1,270 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "libyuv/convert_argb.h" + +#include "libyuv/cpu_id.h" +#ifdef HAVE_JPEG +#include "libyuv/mjpeg_decoder.h" +#endif +#include "libyuv/rotate_argb.h" +#include "libyuv/row.h" +#include "libyuv/video_common.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// Convert camera sample to ARGB with cropping, rotation and vertical flip. +// src_width is used for source stride computation +// src_height is used to compute location of planes, and indicate inversion +// sample_size is measured in bytes and is the size of the frame. +// With MJPEG it is the compressed size of the frame. +LIBYUV_API +int ConvertToARGB(const uint8_t* sample, + size_t sample_size, + uint8_t* dst_argb, + int dst_stride_argb, + int crop_x, + int crop_y, + int src_width, + int src_height, + int crop_width, + int crop_height, + enum RotationMode rotation, + uint32_t fourcc) { + uint32_t format = CanonicalFourCC(fourcc); + int aligned_src_width = (src_width + 1) & ~1; + const uint8_t* src; + const uint8_t* src_uv; + int abs_src_height = (src_height < 0) ? -src_height : src_height; + int inv_crop_height = (crop_height < 0) ? -crop_height : crop_height; + int r = 0; + + // One pass rotation is available for some formats. For the rest, convert + // to ARGB (with optional vertical flipping) into a temporary ARGB buffer, + // and then rotate the ARGB to the final destination buffer. + // For in-place conversion, if destination dst_argb is same as source sample, + // also enable temporary buffer. + LIBYUV_BOOL need_buf = + (rotation && format != FOURCC_ARGB) || dst_argb == sample; + uint8_t* dest_argb = dst_argb; + int dest_dst_stride_argb = dst_stride_argb; + uint8_t* rotate_buffer = NULL; + int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height; + + if (dst_argb == NULL || sample == NULL || src_width <= 0 || crop_width <= 0 || + src_height == 0 || crop_height == 0) { + return -1; + } + if (src_height < 0) { + inv_crop_height = -inv_crop_height; + } + + if (need_buf) { + int argb_size = crop_width * 4 * abs_crop_height; + rotate_buffer = (uint8_t*)malloc(argb_size); /* NOLINT */ + if (!rotate_buffer) { + return 1; // Out of memory runtime error. 
+ } + dst_argb = rotate_buffer; + dst_stride_argb = crop_width * 4; + } + + switch (format) { + // Single plane formats + case FOURCC_YUY2: + src = sample + (aligned_src_width * crop_y + crop_x) * 2; + r = YUY2ToARGB(src, aligned_src_width * 2, dst_argb, dst_stride_argb, + crop_width, inv_crop_height); + break; + case FOURCC_UYVY: + src = sample + (aligned_src_width * crop_y + crop_x) * 2; + r = UYVYToARGB(src, aligned_src_width * 2, dst_argb, dst_stride_argb, + crop_width, inv_crop_height); + break; + case FOURCC_24BG: + src = sample + (src_width * crop_y + crop_x) * 3; + r = RGB24ToARGB(src, src_width * 3, dst_argb, dst_stride_argb, crop_width, + inv_crop_height); + break; + case FOURCC_RAW: + src = sample + (src_width * crop_y + crop_x) * 3; + r = RAWToARGB(src, src_width * 3, dst_argb, dst_stride_argb, crop_width, + inv_crop_height); + break; + case FOURCC_ARGB: + if (!need_buf && !rotation) { + src = sample + (src_width * crop_y + crop_x) * 4; + r = ARGBToARGB(src, src_width * 4, dst_argb, dst_stride_argb, + crop_width, inv_crop_height); + } + break; + case FOURCC_BGRA: + src = sample + (src_width * crop_y + crop_x) * 4; + r = BGRAToARGB(src, src_width * 4, dst_argb, dst_stride_argb, crop_width, + inv_crop_height); + break; + case FOURCC_ABGR: + src = sample + (src_width * crop_y + crop_x) * 4; + r = ABGRToARGB(src, src_width * 4, dst_argb, dst_stride_argb, crop_width, + inv_crop_height); + break; + case FOURCC_RGBA: + src = sample + (src_width * crop_y + crop_x) * 4; + r = RGBAToARGB(src, src_width * 4, dst_argb, dst_stride_argb, crop_width, + inv_crop_height); + break; + case FOURCC_RGBP: + src = sample + (src_width * crop_y + crop_x) * 2; + r = RGB565ToARGB(src, src_width * 2, dst_argb, dst_stride_argb, + crop_width, inv_crop_height); + break; + case FOURCC_RGBO: + src = sample + (src_width * crop_y + crop_x) * 2; + r = ARGB1555ToARGB(src, src_width * 2, dst_argb, dst_stride_argb, + crop_width, inv_crop_height); + break; + case FOURCC_R444: + src = sample + (src_width * crop_y + crop_x) * 2; + r = ARGB4444ToARGB(src, src_width * 2, dst_argb, dst_stride_argb, + crop_width, inv_crop_height); + break; + case FOURCC_I400: + src = sample + src_width * crop_y + crop_x; + r = I400ToARGB(src, src_width, dst_argb, dst_stride_argb, crop_width, + inv_crop_height); + break; + + // Biplanar formats + case FOURCC_NV12: + src = sample + (src_width * crop_y + crop_x); + src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x; + r = NV12ToARGB(src, src_width, src_uv, aligned_src_width, dst_argb, + dst_stride_argb, crop_width, inv_crop_height); + break; + case FOURCC_NV21: + src = sample + (src_width * crop_y + crop_x); + src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x; + // Call NV12 but with u and v parameters swapped. 
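      // NV21 differs from NV12 only in the byte order of the interleaved
      // chroma plane (VU rather than UV), so the offset arithmetic above is
      // identical to the NV12 case; NV21ToARGB performs the swap internally.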
+ r = NV21ToARGB(src, src_width, src_uv, aligned_src_width, dst_argb, + dst_stride_argb, crop_width, inv_crop_height); + break; + case FOURCC_M420: + src = sample + (src_width * crop_y) * 12 / 8 + crop_x; + r = M420ToARGB(src, src_width, dst_argb, dst_stride_argb, crop_width, + inv_crop_height); + break; + // Triplanar formats + case FOURCC_I420: + case FOURCC_YV12: { + const uint8_t* src_y = sample + (src_width * crop_y + crop_x); + const uint8_t* src_u; + const uint8_t* src_v; + int halfwidth = (src_width + 1) / 2; + int halfheight = (abs_src_height + 1) / 2; + if (format == FOURCC_YV12) { + src_v = sample + src_width * abs_src_height + + (halfwidth * crop_y + crop_x) / 2; + src_u = sample + src_width * abs_src_height + + halfwidth * (halfheight + crop_y / 2) + crop_x / 2; + } else { + src_u = sample + src_width * abs_src_height + + (halfwidth * crop_y + crop_x) / 2; + src_v = sample + src_width * abs_src_height + + halfwidth * (halfheight + crop_y / 2) + crop_x / 2; + } + r = I420ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth, + dst_argb, dst_stride_argb, crop_width, inv_crop_height); + break; + } + + case FOURCC_J420: { + const uint8_t* src_y = sample + (src_width * crop_y + crop_x); + const uint8_t* src_u; + const uint8_t* src_v; + int halfwidth = (src_width + 1) / 2; + int halfheight = (abs_src_height + 1) / 2; + src_u = sample + src_width * abs_src_height + + (halfwidth * crop_y + crop_x) / 2; + src_v = sample + src_width * abs_src_height + + halfwidth * (halfheight + crop_y / 2) + crop_x / 2; + r = J420ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth, + dst_argb, dst_stride_argb, crop_width, inv_crop_height); + break; + } + + case FOURCC_I422: + case FOURCC_YV16: { + const uint8_t* src_y = sample + src_width * crop_y + crop_x; + const uint8_t* src_u; + const uint8_t* src_v; + int halfwidth = (src_width + 1) / 2; + if (format == FOURCC_YV16) { + src_v = sample + src_width * abs_src_height + halfwidth * crop_y + + crop_x / 2; + src_u = sample + src_width * abs_src_height + + halfwidth * (abs_src_height + crop_y) + crop_x / 2; + } else { + src_u = sample + src_width * abs_src_height + halfwidth * crop_y + + crop_x / 2; + src_v = sample + src_width * abs_src_height + + halfwidth * (abs_src_height + crop_y) + crop_x / 2; + } + r = I422ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth, + dst_argb, dst_stride_argb, crop_width, inv_crop_height); + break; + } + case FOURCC_I444: + case FOURCC_YV24: { + const uint8_t* src_y = sample + src_width * crop_y + crop_x; + const uint8_t* src_u; + const uint8_t* src_v; + if (format == FOURCC_YV24) { + src_v = sample + src_width * (abs_src_height + crop_y) + crop_x; + src_u = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x; + } else { + src_u = sample + src_width * (abs_src_height + crop_y) + crop_x; + src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x; + } + r = I444ToARGB(src_y, src_width, src_u, src_width, src_v, src_width, + dst_argb, dst_stride_argb, crop_width, inv_crop_height); + break; + } +#ifdef HAVE_JPEG + case FOURCC_MJPG: + r = MJPGToARGB(sample, sample_size, dst_argb, dst_stride_argb, src_width, + abs_src_height, crop_width, inv_crop_height); + break; +#endif + default: + r = -1; // unknown fourcc - return failure code. 
+ } + + if (need_buf) { + if (!r) { + r = ARGBRotate(dst_argb, dst_stride_argb, dest_argb, dest_dst_stride_argb, + crop_width, abs_crop_height, rotation); + } + free(rotate_buffer); + } else if (rotation) { + src = sample + (src_width * crop_y + crop_x) * 4; + r = ARGBRotate(src, src_width * 4, dst_argb, dst_stride_argb, crop_width, + inv_crop_height, rotation); + } + + return r; +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/media/libyuv/libyuv/source/convert_to_i420.cc b/media/libyuv/libyuv/source/convert_to_i420.cc new file mode 100644 index 0000000000..1bed9d6440 --- /dev/null +++ b/media/libyuv/libyuv/source/convert_to_i420.cc @@ -0,0 +1,276 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <stdlib.h> + +#include "libyuv/convert.h" + +#include "libyuv/video_common.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// Convert camera sample to I420 with cropping, rotation and vertical flip. +// src_width is used for source stride computation +// src_height is used to compute location of planes, and indicate inversion +// sample_size is measured in bytes and is the size of the frame. +// With MJPEG it is the compressed size of the frame. +LIBYUV_API +int ConvertToI420(const uint8_t* sample, + size_t sample_size, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int crop_x, + int crop_y, + int src_width, + int src_height, + int crop_width, + int crop_height, + enum RotationMode rotation, + uint32_t fourcc) { + uint32_t format = CanonicalFourCC(fourcc); + int aligned_src_width = (src_width + 1) & ~1; + const uint8_t* src; + const uint8_t* src_uv; + const int abs_src_height = (src_height < 0) ? -src_height : src_height; + // TODO(nisse): Why allow crop_height < 0? + const int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height; + int r = 0; + LIBYUV_BOOL need_buf = + (rotation && format != FOURCC_I420 && format != FOURCC_NV12 && + format != FOURCC_NV21 && format != FOURCC_YV12) || + dst_y == sample; + uint8_t* tmp_y = dst_y; + uint8_t* tmp_u = dst_u; + uint8_t* tmp_v = dst_v; + int tmp_y_stride = dst_stride_y; + int tmp_u_stride = dst_stride_u; + int tmp_v_stride = dst_stride_v; + uint8_t* rotate_buffer = NULL; + const int inv_crop_height = + (src_height < 0) ? -abs_crop_height : abs_crop_height; + + if (!dst_y || !dst_u || !dst_v || !sample || src_width <= 0 || + crop_width <= 0 || src_height == 0 || crop_height == 0) { + return -1; + } + + // One pass rotation is available for some formats. For the rest, convert + // to I420 (with optional vertical flipping) into a temporary I420 buffer, + // and then rotate the I420 to the final destination buffer. + // For in-place conversion, if destination dst_y is same as source sample, + // also enable temporary buffer. + if (need_buf) { + int y_size = crop_width * abs_crop_height; + int uv_size = ((crop_width + 1) / 2) * ((abs_crop_height + 1) / 2); + rotate_buffer = (uint8_t*)malloc(y_size + uv_size * 2); /* NOLINT */ + if (!rotate_buffer) { + return 1; // Out of memory runtime error. 
+ } + dst_y = rotate_buffer; + dst_u = dst_y + y_size; + dst_v = dst_u + uv_size; + dst_stride_y = crop_width; + dst_stride_u = dst_stride_v = ((crop_width + 1) / 2); + } + + switch (format) { + // Single plane formats + case FOURCC_YUY2: + src = sample + (aligned_src_width * crop_y + crop_x) * 2; + r = YUY2ToI420(src, aligned_src_width * 2, dst_y, dst_stride_y, dst_u, + dst_stride_u, dst_v, dst_stride_v, crop_width, + inv_crop_height); + break; + case FOURCC_UYVY: + src = sample + (aligned_src_width * crop_y + crop_x) * 2; + r = UYVYToI420(src, aligned_src_width * 2, dst_y, dst_stride_y, dst_u, + dst_stride_u, dst_v, dst_stride_v, crop_width, + inv_crop_height); + break; + case FOURCC_RGBP: + src = sample + (src_width * crop_y + crop_x) * 2; + r = RGB565ToI420(src, src_width * 2, dst_y, dst_stride_y, dst_u, + dst_stride_u, dst_v, dst_stride_v, crop_width, + inv_crop_height); + break; + case FOURCC_RGBO: + src = sample + (src_width * crop_y + crop_x) * 2; + r = ARGB1555ToI420(src, src_width * 2, dst_y, dst_stride_y, dst_u, + dst_stride_u, dst_v, dst_stride_v, crop_width, + inv_crop_height); + break; + case FOURCC_R444: + src = sample + (src_width * crop_y + crop_x) * 2; + r = ARGB4444ToI420(src, src_width * 2, dst_y, dst_stride_y, dst_u, + dst_stride_u, dst_v, dst_stride_v, crop_width, + inv_crop_height); + break; + case FOURCC_24BG: + src = sample + (src_width * crop_y + crop_x) * 3; + r = RGB24ToI420(src, src_width * 3, dst_y, dst_stride_y, dst_u, + dst_stride_u, dst_v, dst_stride_v, crop_width, + inv_crop_height); + break; + case FOURCC_RAW: + src = sample + (src_width * crop_y + crop_x) * 3; + r = RAWToI420(src, src_width * 3, dst_y, dst_stride_y, dst_u, + dst_stride_u, dst_v, dst_stride_v, crop_width, + inv_crop_height); + break; + case FOURCC_ARGB: + src = sample + (src_width * crop_y + crop_x) * 4; + r = ARGBToI420(src, src_width * 4, dst_y, dst_stride_y, dst_u, + dst_stride_u, dst_v, dst_stride_v, crop_width, + inv_crop_height); + break; + case FOURCC_BGRA: + src = sample + (src_width * crop_y + crop_x) * 4; + r = BGRAToI420(src, src_width * 4, dst_y, dst_stride_y, dst_u, + dst_stride_u, dst_v, dst_stride_v, crop_width, + inv_crop_height); + break; + case FOURCC_ABGR: + src = sample + (src_width * crop_y + crop_x) * 4; + r = ABGRToI420(src, src_width * 4, dst_y, dst_stride_y, dst_u, + dst_stride_u, dst_v, dst_stride_v, crop_width, + inv_crop_height); + break; + case FOURCC_RGBA: + src = sample + (src_width * crop_y + crop_x) * 4; + r = RGBAToI420(src, src_width * 4, dst_y, dst_stride_y, dst_u, + dst_stride_u, dst_v, dst_stride_v, crop_width, + inv_crop_height); + break; + case FOURCC_I400: + src = sample + src_width * crop_y + crop_x; + r = I400ToI420(src, src_width, dst_y, dst_stride_y, dst_u, dst_stride_u, + dst_v, dst_stride_v, crop_width, inv_crop_height); + break; + // Biplanar formats + case FOURCC_NV12: + src = sample + (src_width * crop_y + crop_x); + src_uv = sample + (src_width * src_height) + + ((crop_y / 2) * aligned_src_width) + ((crop_x / 2) * 2); + r = NV12ToI420Rotate(src, src_width, src_uv, aligned_src_width, dst_y, + dst_stride_y, dst_u, dst_stride_u, dst_v, + dst_stride_v, crop_width, inv_crop_height, rotation); + break; + case FOURCC_NV21: + src = sample + (src_width * crop_y + crop_x); + src_uv = sample + (src_width * src_height) + + ((crop_y / 2) * aligned_src_width) + ((crop_x / 2) * 2); + // Call NV12 but with dst_u and dst_v parameters swapped. 
+ r = NV12ToI420Rotate(src, src_width, src_uv, aligned_src_width, dst_y, + dst_stride_y, dst_v, dst_stride_v, dst_u, + dst_stride_u, crop_width, inv_crop_height, rotation); + break; + case FOURCC_M420: + src = sample + (src_width * crop_y) * 12 / 8 + crop_x; + r = M420ToI420(src, src_width, dst_y, dst_stride_y, dst_u, dst_stride_u, + dst_v, dst_stride_v, crop_width, inv_crop_height); + break; + // Triplanar formats + case FOURCC_I420: + case FOURCC_YV12: { + const uint8_t* src_y = sample + (src_width * crop_y + crop_x); + const uint8_t* src_u; + const uint8_t* src_v; + int halfwidth = (src_width + 1) / 2; + int halfheight = (abs_src_height + 1) / 2; + if (format == FOURCC_YV12) { + src_v = sample + src_width * abs_src_height + + (halfwidth * crop_y + crop_x) / 2; + src_u = sample + src_width * abs_src_height + + halfwidth * (halfheight + crop_y / 2) + crop_x / 2; + } else { + src_u = sample + src_width * abs_src_height + + (halfwidth * crop_y + crop_x) / 2; + src_v = sample + src_width * abs_src_height + + halfwidth * (halfheight + crop_y / 2) + crop_x / 2; + } + r = I420Rotate(src_y, src_width, src_u, halfwidth, src_v, halfwidth, + dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v, + dst_stride_v, crop_width, inv_crop_height, rotation); + break; + } + case FOURCC_I422: + case FOURCC_YV16: { + const uint8_t* src_y = sample + src_width * crop_y + crop_x; + const uint8_t* src_u; + const uint8_t* src_v; + int halfwidth = (src_width + 1) / 2; + if (format == FOURCC_YV16) { + src_v = sample + src_width * abs_src_height + halfwidth * crop_y + + crop_x / 2; + src_u = sample + src_width * abs_src_height + + halfwidth * (abs_src_height + crop_y) + crop_x / 2; + } else { + src_u = sample + src_width * abs_src_height + halfwidth * crop_y + + crop_x / 2; + src_v = sample + src_width * abs_src_height + + halfwidth * (abs_src_height + crop_y) + crop_x / 2; + } + r = I422ToI420(src_y, src_width, src_u, halfwidth, src_v, halfwidth, + dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v, + dst_stride_v, crop_width, inv_crop_height); + break; + } + case FOURCC_I444: + case FOURCC_YV24: { + const uint8_t* src_y = sample + src_width * crop_y + crop_x; + const uint8_t* src_u; + const uint8_t* src_v; + if (format == FOURCC_YV24) { + src_v = sample + src_width * (abs_src_height + crop_y) + crop_x; + src_u = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x; + } else { + src_u = sample + src_width * (abs_src_height + crop_y) + crop_x; + src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x; + } + r = I444ToI420(src_y, src_width, src_u, src_width, src_v, src_width, + dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v, + dst_stride_v, crop_width, inv_crop_height); + break; + } +#ifdef HAVE_JPEG + case FOURCC_MJPG: + r = MJPGToI420(sample, sample_size, dst_y, dst_stride_y, dst_u, + dst_stride_u, dst_v, dst_stride_v, src_width, + abs_src_height, crop_width, inv_crop_height); + break; +#endif + default: + r = -1; // unknown fourcc - return failure code. 
+ } + + if (need_buf) { + if (!r) { + r = I420Rotate(dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v, + dst_stride_v, tmp_y, tmp_y_stride, tmp_u, tmp_u_stride, + tmp_v, tmp_v_stride, crop_width, abs_crop_height, + rotation); + } + free(rotate_buffer); + } + + return r; +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/media/libyuv/libyuv/source/cpu_id.cc b/media/libyuv/libyuv/source/cpu_id.cc new file mode 100644 index 0000000000..446aad1207 --- /dev/null +++ b/media/libyuv/libyuv/source/cpu_id.cc @@ -0,0 +1,349 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/cpu_id.h" + +#if defined(_MSC_VER) +#include <intrin.h> // For __cpuidex() +#endif +#if !defined(__pnacl__) && !defined(__CLR_VER) && \ + !defined(__native_client__) && (defined(_M_IX86) || defined(_M_X64)) && \ + defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219) +#include <immintrin.h> // For _xgetbv() +#endif + +#if !defined(__native_client__) +#include <stdlib.h> // For getenv() +#endif + +// For ArmCpuCaps() but unittested on all platforms +#include <stdio.h> +#include <string.h> + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// For functions that use the stack and have runtime checks for overflow, +// use SAFEBUFFERS to avoid additional check. +#if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219) && \ + !defined(__clang__) +#define SAFEBUFFERS __declspec(safebuffers) +#else +#define SAFEBUFFERS +#endif + +// cpu_info_ variable for SIMD instruction sets detected. +LIBYUV_API int cpu_info_ = 0; + +// TODO(fbarchard): Consider using int for cpuid so casting is not needed. +// Low level cpuid for X86. +#if (defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || \ + defined(__x86_64__)) && \ + !defined(__pnacl__) && !defined(__CLR_VER) +LIBYUV_API +void CpuId(int info_eax, int info_ecx, int* cpu_info) { +#if defined(_MSC_VER) +// Visual C version uses intrinsic or inline x86 assembly. +#if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219) + __cpuidex(cpu_info, info_eax, info_ecx); +#elif defined(_M_IX86) + __asm { + mov eax, info_eax + mov ecx, info_ecx + mov edi, cpu_info + cpuid + mov [edi], eax + mov [edi + 4], ebx + mov [edi + 8], ecx + mov [edi + 12], edx + } +#else // Visual C but not x86 + if (info_ecx == 0) { + __cpuid(cpu_info, info_eax); + } else { + cpu_info[3] = cpu_info[2] = cpu_info[1] = cpu_info[0] = 0u; + } +#endif +// GCC version uses inline x86 assembly. +#else // defined(_MSC_VER) + int info_ebx, info_edx; + asm volatile( +#if defined(__i386__) && defined(__PIC__) + // Preserve ebx for fpic 32 bit. + "mov %%ebx, %%edi \n" + "cpuid \n" + "xchg %%edi, %%ebx \n" + : "=D"(info_ebx), +#else + "cpuid \n" + : "=b"(info_ebx), +#endif // defined( __i386__) && defined(__PIC__) + "+a"(info_eax), "+c"(info_ecx), "=d"(info_edx)); + cpu_info[0] = info_eax; + cpu_info[1] = info_ebx; + cpu_info[2] = info_ecx; + cpu_info[3] = info_edx; +#endif // defined(_MSC_VER) +} +#else // (defined(_M_IX86) || defined(_M_X64) ... 
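// For reference, GetCpuFlags() below consumes CpuId() output with the
// register order eax, ebx, ecx, edx, e.g.:
//
//   int regs[4];
//   CpuId(1, 0, regs);                            // leaf 1
//   int has_sse2 = (regs[3] & 0x04000000) != 0;   // EDX bit 26
//   CpuId(7, 0, regs);                            // leaf 7
//   int avx2_bit = (regs[1] & 0x00000020) != 0;   // EBX bit 5 (AVX2 hint;
//                                                 // OS XSAVE support is
//                                                 // checked separately)
//
// On non-x86 (and pnacl / CLR) builds the stub below simply zeroes the
// output; those architectures are detected via ArmCpuCaps() / MipsCpuCaps()
// further down.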
+LIBYUV_API +void CpuId(int eax, int ecx, int* cpu_info) { + (void)eax; + (void)ecx; + cpu_info[0] = cpu_info[1] = cpu_info[2] = cpu_info[3] = 0; +} +#endif + +// For VS2010 and earlier emit can be used: +// _asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0 // For VS2010 and earlier. +// __asm { +// xor ecx, ecx // xcr 0 +// xgetbv +// mov xcr0, eax +// } +// For VS2013 and earlier 32 bit, the _xgetbv(0) optimizer produces bad code. +// https://code.google.com/p/libyuv/issues/detail?id=529 +#if defined(_M_IX86) && (_MSC_VER < 1900) +#pragma optimize("g", off) +#endif +#if (defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || \ + defined(__x86_64__)) && \ + !defined(__pnacl__) && !defined(__CLR_VER) && !defined(__native_client__) +// X86 CPUs have xgetbv to detect OS saves high parts of ymm registers. +int GetXCR0() { + int xcr0 = 0; +#if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219) + xcr0 = (int)_xgetbv(0); // VS2010 SP1 required. NOLINT +#elif defined(__i386__) || defined(__x86_64__) + asm(".byte 0x0f, 0x01, 0xd0" : "=a"(xcr0) : "c"(0) : "%edx"); +#endif // defined(__i386__) || defined(__x86_64__) + return xcr0; +} +#else +// xgetbv unavailable to query for OSSave support. Return 0. +#define GetXCR0() 0 +#endif // defined(_M_IX86) || defined(_M_X64) .. +// Return optimization to previous setting. +#if defined(_M_IX86) && (_MSC_VER < 1900) +#pragma optimize("g", on) +#endif + +// based on libvpx arm_cpudetect.c +// For Arm, but public to allow testing on any CPU +LIBYUV_API SAFEBUFFERS int ArmCpuCaps(const char* cpuinfo_name) { + char cpuinfo_line[512]; + FILE* f = fopen(cpuinfo_name, "r"); + if (!f) { + // Assume Neon if /proc/cpuinfo is unavailable. + // This will occur for Chrome sandbox for Pepper or Render process. + return kCpuHasNEON; + } + while (fgets(cpuinfo_line, sizeof(cpuinfo_line) - 1, f)) { + if (memcmp(cpuinfo_line, "Features", 8) == 0) { + char* p = strstr(cpuinfo_line, " neon"); + if (p && (p[5] == ' ' || p[5] == '\n')) { + fclose(f); + return kCpuHasNEON; + } + // aarch64 uses asimd for Neon. + p = strstr(cpuinfo_line, " asimd"); + if (p) { + fclose(f); + return kCpuHasNEON; + } + } + } + fclose(f); + return 0; +} + +// TODO(fbarchard): Consider read_msa_ir(). +// TODO(fbarchard): Add unittest. +LIBYUV_API SAFEBUFFERS int MipsCpuCaps(const char* cpuinfo_name, + const char ase[]) { + char cpuinfo_line[512]; + FILE* f = fopen(cpuinfo_name, "r"); + if (!f) { + // ase enabled if /proc/cpuinfo is unavailable. + if (strcmp(ase, " msa") == 0) { + return kCpuHasMSA; + } + return 0; + } + while (fgets(cpuinfo_line, sizeof(cpuinfo_line) - 1, f)) { + if (memcmp(cpuinfo_line, "ASEs implemented", 16) == 0) { + char* p = strstr(cpuinfo_line, ase); + if (p) { + fclose(f); + if (strcmp(ase, " msa") == 0) { + return kCpuHasMSA; + } + return 0; + } + } + } + fclose(f); + return 0; +} + +// Test environment variable for disabling CPU features. Any non-zero value +// to disable. Zero ignored to make it easy to set the variable on/off. +#if !defined(__native_client__) && !defined(_M_ARM) + +static LIBYUV_BOOL TestEnv(const char* name) { + const char* var = getenv(name); + if (var) { + if (var[0] != '0') { + return LIBYUV_TRUE; + } + } + return LIBYUV_FALSE; +} +#else // nacl does not support getenv(). 
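// Example of the override mechanism (non-NaCl builds): running a binary with
// LIBYUV_DISABLE_AVX2=1 in the environment clears kCpuHasAVX2 in
// GetCpuFlags() below, so the AVX2 row functions are never selected; setting
// the variable to "0" has no effect, since only non-zero values disable.
// Under NaCl the stub below always reports the variable as unset.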
+static LIBYUV_BOOL TestEnv(const char*) { + return LIBYUV_FALSE; +} +#endif + +static SAFEBUFFERS int GetCpuFlags(void) { + int cpu_info = 0; +#if !defined(__pnacl__) && !defined(__CLR_VER) && \ + (defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || \ + defined(_M_IX86)) + int cpu_info0[4] = {0, 0, 0, 0}; + int cpu_info1[4] = {0, 0, 0, 0}; + int cpu_info7[4] = {0, 0, 0, 0}; + CpuId(0, 0, cpu_info0); + CpuId(1, 0, cpu_info1); + if (cpu_info0[0] >= 7) { + CpuId(7, 0, cpu_info7); + } + cpu_info = kCpuHasX86 | ((cpu_info1[3] & 0x04000000) ? kCpuHasSSE2 : 0) | + ((cpu_info1[2] & 0x00000200) ? kCpuHasSSSE3 : 0) | + ((cpu_info1[2] & 0x00080000) ? kCpuHasSSE41 : 0) | + ((cpu_info1[2] & 0x00100000) ? kCpuHasSSE42 : 0) | + ((cpu_info7[1] & 0x00000200) ? kCpuHasERMS : 0); + + // AVX requires OS saves YMM registers. + if (((cpu_info1[2] & 0x1c000000) == 0x1c000000) && // AVX and OSXSave + ((GetXCR0() & 6) == 6)) { // Test OS saves YMM registers + cpu_info |= kCpuHasAVX | ((cpu_info7[1] & 0x00000020) ? kCpuHasAVX2 : 0) | + ((cpu_info1[2] & 0x00001000) ? kCpuHasFMA3 : 0) | + ((cpu_info1[2] & 0x20000000) ? kCpuHasF16C : 0); + + // Detect AVX512bw + if ((GetXCR0() & 0xe0) == 0xe0) { + cpu_info |= (cpu_info7[1] & 0x40000000) ? kCpuHasAVX512BW : 0; + cpu_info |= (cpu_info7[1] & 0x80000000) ? kCpuHasAVX512VL : 0; + cpu_info |= (cpu_info7[2] & 0x00000002) ? kCpuHasAVX512VBMI : 0; + cpu_info |= (cpu_info7[2] & 0x00000040) ? kCpuHasAVX512VBMI2 : 0; + cpu_info |= (cpu_info7[2] & 0x00001000) ? kCpuHasAVX512VBITALG : 0; + cpu_info |= (cpu_info7[2] & 0x00004000) ? kCpuHasAVX512VPOPCNTDQ : 0; + cpu_info |= (cpu_info7[2] & 0x00000100) ? kCpuHasGFNI : 0; + } + } + + // TODO(fbarchard): Consider moving these to gtest + // Environment variable overrides for testing. + if (TestEnv("LIBYUV_DISABLE_X86")) { + cpu_info &= ~kCpuHasX86; + } + if (TestEnv("LIBYUV_DISABLE_SSE2")) { + cpu_info &= ~kCpuHasSSE2; + } + if (TestEnv("LIBYUV_DISABLE_SSSE3")) { + cpu_info &= ~kCpuHasSSSE3; + } + if (TestEnv("LIBYUV_DISABLE_SSE41")) { + cpu_info &= ~kCpuHasSSE41; + } + if (TestEnv("LIBYUV_DISABLE_SSE42")) { + cpu_info &= ~kCpuHasSSE42; + } + if (TestEnv("LIBYUV_DISABLE_AVX")) { + cpu_info &= ~kCpuHasAVX; + } + if (TestEnv("LIBYUV_DISABLE_AVX2")) { + cpu_info &= ~kCpuHasAVX2; + } + if (TestEnv("LIBYUV_DISABLE_ERMS")) { + cpu_info &= ~kCpuHasERMS; + } + if (TestEnv("LIBYUV_DISABLE_FMA3")) { + cpu_info &= ~kCpuHasFMA3; + } + if (TestEnv("LIBYUV_DISABLE_F16C")) { + cpu_info &= ~kCpuHasF16C; + } + if (TestEnv("LIBYUV_DISABLE_AVX512BW")) { + cpu_info &= ~kCpuHasAVX512BW; + } + +#endif +#if defined(__mips__) && defined(__linux__) +#if defined(__mips_msa) + cpu_info = MipsCpuCaps("/proc/cpuinfo", " msa"); +#endif + cpu_info |= kCpuHasMIPS; + if (getenv("LIBYUV_DISABLE_MSA")) { + cpu_info &= ~kCpuHasMSA; + } +#endif +#if defined(__arm__) || defined(__aarch64__) +// gcc -mfpu=neon defines __ARM_NEON__ +// __ARM_NEON__ generates code that requires Neon. NaCL also requires Neon. +// For Linux, /proc/cpuinfo can be tested but without that assume Neon. +#if defined(__ARM_NEON__) || defined(__native_client__) || !defined(__linux__) + cpu_info = kCpuHasNEON; +// For aarch64(arm64), /proc/cpuinfo's feature is not complete, e.g. no neon +// flag in it. +// So for aarch64, neon enabling is hard coded here. +#endif +#if defined(__aarch64__) + cpu_info = kCpuHasNEON; +#else + // Linux arm parse text file for neon detect. 
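  // ArmCpuCaps() scans /proc/cpuinfo for a "Features" line containing
  // " neon" (32-bit ARM) or " asimd" (AArch64); a typical 32-bit line looks
  // roughly like "Features : half thumb fastmult vfp edsp neon vfpv3".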
+ cpu_info = ArmCpuCaps("/proc/cpuinfo"); +#endif + cpu_info |= kCpuHasARM; + if (TestEnv("LIBYUV_DISABLE_NEON")) { + cpu_info &= ~kCpuHasNEON; + } +#endif // __arm__ + if (TestEnv("LIBYUV_DISABLE_ASM")) { + cpu_info = 0; + } + cpu_info |= kCpuInitialized; + return cpu_info; +} + +// Note that use of this function is not thread safe. +LIBYUV_API +int MaskCpuFlags(int enable_flags) { + int cpu_info = GetCpuFlags() & enable_flags; +#ifdef __ATOMIC_RELAXED + __atomic_store_n(&cpu_info_, cpu_info, __ATOMIC_RELAXED); +#else + cpu_info_ = cpu_info; +#endif + return cpu_info; +} + +LIBYUV_API +int InitCpuFlags(void) { + return MaskCpuFlags(-1); +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/media/libyuv/libyuv/source/mjpeg_decoder.cc b/media/libyuv/libyuv/source/mjpeg_decoder.cc new file mode 100644 index 0000000000..38d528dd05 --- /dev/null +++ b/media/libyuv/libyuv/source/mjpeg_decoder.cc @@ -0,0 +1,576 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/mjpeg_decoder.h" + +#ifdef HAVE_JPEG +#include <assert.h> + +#if !defined(__pnacl__) && !defined(__CLR_VER) && \ + !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR) +// Must be included before jpeglib. +#include <setjmp.h> +#define HAVE_SETJMP + +#if defined(_MSC_VER) +// disable warning 4324: structure was padded due to __declspec(align()) +#pragma warning(disable : 4324) +#endif + +#endif +struct FILE; // For jpeglib.h. + +// C++ build requires extern C for jpeg internals. +#ifdef __cplusplus +extern "C" { +#endif + +#include <jpeglib.h> + +#ifdef __cplusplus +} // extern "C" +#endif + +#include "libyuv/planar_functions.h" // For CopyPlane(). + +namespace libyuv { + +#ifdef HAVE_SETJMP +struct SetJmpErrorMgr { + jpeg_error_mgr base; // Must be at the top + jmp_buf setjmp_buffer; +}; +#endif + +const int MJpegDecoder::kColorSpaceUnknown = JCS_UNKNOWN; +const int MJpegDecoder::kColorSpaceGrayscale = JCS_GRAYSCALE; +const int MJpegDecoder::kColorSpaceRgb = JCS_RGB; +const int MJpegDecoder::kColorSpaceYCbCr = JCS_YCbCr; +const int MJpegDecoder::kColorSpaceCMYK = JCS_CMYK; +const int MJpegDecoder::kColorSpaceYCCK = JCS_YCCK; + +// Methods that are passed to jpeglib. +boolean fill_input_buffer(jpeg_decompress_struct* cinfo); +void init_source(jpeg_decompress_struct* cinfo); +void skip_input_data(jpeg_decompress_struct* cinfo, long num_bytes); // NOLINT +void term_source(jpeg_decompress_struct* cinfo); +void ErrorHandler(jpeg_common_struct* cinfo); +void OutputHandler(jpeg_common_struct* cinfo); + +MJpegDecoder::MJpegDecoder() + : has_scanline_padding_(LIBYUV_FALSE), + num_outbufs_(0), + scanlines_(NULL), + scanlines_sizes_(NULL), + databuf_(NULL), + databuf_strides_(NULL) { + decompress_struct_ = new jpeg_decompress_struct; + source_mgr_ = new jpeg_source_mgr; +#ifdef HAVE_SETJMP + error_mgr_ = new SetJmpErrorMgr; + decompress_struct_->err = jpeg_std_error(&error_mgr_->base); + // Override standard exit()-based error handler. 
+ error_mgr_->base.error_exit = &ErrorHandler; +#ifndef DEBUG_MJPEG + error_mgr_->base.output_message = &OutputHandler; +#endif +#endif + decompress_struct_->client_data = NULL; + source_mgr_->init_source = &init_source; + source_mgr_->fill_input_buffer = &fill_input_buffer; + source_mgr_->skip_input_data = &skip_input_data; + source_mgr_->resync_to_restart = &jpeg_resync_to_restart; + source_mgr_->term_source = &term_source; + jpeg_create_decompress(decompress_struct_); + decompress_struct_->src = source_mgr_; + buf_vec_.buffers = &buf_; + buf_vec_.len = 1; +} + +MJpegDecoder::~MJpegDecoder() { + jpeg_destroy_decompress(decompress_struct_); + delete decompress_struct_; + delete source_mgr_; +#ifdef HAVE_SETJMP + delete error_mgr_; +#endif + DestroyOutputBuffers(); +} + +LIBYUV_BOOL MJpegDecoder::LoadFrame(const uint8_t* src, size_t src_len) { + if (!ValidateJpeg(src, src_len)) { + return LIBYUV_FALSE; + } + + buf_.data = src; + buf_.len = static_cast<int>(src_len); + buf_vec_.pos = 0; + decompress_struct_->client_data = &buf_vec_; +#ifdef HAVE_SETJMP + if (setjmp(error_mgr_->setjmp_buffer)) { + // We called jpeg_read_header, it experienced an error, and we called + // longjmp() and rewound the stack to here. Return error. + return LIBYUV_FALSE; + } +#endif + if (jpeg_read_header(decompress_struct_, TRUE) != JPEG_HEADER_OK) { + // ERROR: Bad MJPEG header + return LIBYUV_FALSE; + } + AllocOutputBuffers(GetNumComponents()); + for (int i = 0; i < num_outbufs_; ++i) { + int scanlines_size = GetComponentScanlinesPerImcuRow(i); + if (scanlines_sizes_[i] != scanlines_size) { + if (scanlines_[i]) { + delete scanlines_[i]; + } + scanlines_[i] = new uint8_t*[scanlines_size]; + scanlines_sizes_[i] = scanlines_size; + } + + // We allocate padding for the final scanline to pad it up to DCTSIZE bytes + // to avoid memory errors, since jpeglib only reads full MCUs blocks. For + // the preceding scanlines, the padding is not needed/wanted because the + // following addresses will already be valid (they are the initial bytes of + // the next scanline) and will be overwritten when jpeglib writes out that + // next scanline. + int databuf_stride = GetComponentStride(i); + int databuf_size = scanlines_size * databuf_stride; + if (databuf_strides_[i] != databuf_stride) { + if (databuf_[i]) { + delete databuf_[i]; + } + databuf_[i] = new uint8_t[databuf_size]; + databuf_strides_[i] = databuf_stride; + } + + if (GetComponentStride(i) != GetComponentWidth(i)) { + has_scanline_padding_ = LIBYUV_TRUE; + } + } + return LIBYUV_TRUE; +} + +static int DivideAndRoundUp(int numerator, int denominator) { + return (numerator + denominator - 1) / denominator; +} + +static int DivideAndRoundDown(int numerator, int denominator) { + return numerator / denominator; +} + +// Returns width of the last loaded frame. +int MJpegDecoder::GetWidth() { + return decompress_struct_->image_width; +} + +// Returns height of the last loaded frame. +int MJpegDecoder::GetHeight() { + return decompress_struct_->image_height; +} + +// Returns format of the last loaded frame. The return value is one of the +// kColorSpace* constants. +int MJpegDecoder::GetColorSpace() { + return decompress_struct_->jpeg_color_space; +} + +// Number of color components in the color space. +int MJpegDecoder::GetNumComponents() { + return decompress_struct_->num_components; +} + +// Sample factors of the n-th component. 
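// Worked example for a 4:2:0 JPEG: the Y component has h_samp_factor =
// v_samp_factor = 2 and both chroma components have 1, so max_h_samp_factor
// and max_v_samp_factor are 2. GetHorizSubSampFactor()/GetVertSubSampFactor()
// below then return 1 for Y and 2 for Cb/Cr, i.e. the chroma planes are half
// the width and half the height of the luma plane.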
+int MJpegDecoder::GetHorizSampFactor(int component) { + return decompress_struct_->comp_info[component].h_samp_factor; +} + +int MJpegDecoder::GetVertSampFactor(int component) { + return decompress_struct_->comp_info[component].v_samp_factor; +} + +int MJpegDecoder::GetHorizSubSampFactor(int component) { + return decompress_struct_->max_h_samp_factor / GetHorizSampFactor(component); +} + +int MJpegDecoder::GetVertSubSampFactor(int component) { + return decompress_struct_->max_v_samp_factor / GetVertSampFactor(component); +} + +int MJpegDecoder::GetImageScanlinesPerImcuRow() { + return decompress_struct_->max_v_samp_factor * DCTSIZE; +} + +int MJpegDecoder::GetComponentScanlinesPerImcuRow(int component) { + int vs = GetVertSubSampFactor(component); + return DivideAndRoundUp(GetImageScanlinesPerImcuRow(), vs); +} + +int MJpegDecoder::GetComponentWidth(int component) { + int hs = GetHorizSubSampFactor(component); + return DivideAndRoundUp(GetWidth(), hs); +} + +int MJpegDecoder::GetComponentHeight(int component) { + int vs = GetVertSubSampFactor(component); + return DivideAndRoundUp(GetHeight(), vs); +} + +// Get width in bytes padded out to a multiple of DCTSIZE +int MJpegDecoder::GetComponentStride(int component) { + return (GetComponentWidth(component) + DCTSIZE - 1) & ~(DCTSIZE - 1); +} + +int MJpegDecoder::GetComponentSize(int component) { + return GetComponentWidth(component) * GetComponentHeight(component); +} + +LIBYUV_BOOL MJpegDecoder::UnloadFrame() { +#ifdef HAVE_SETJMP + if (setjmp(error_mgr_->setjmp_buffer)) { + // We called jpeg_abort_decompress, it experienced an error, and we called + // longjmp() and rewound the stack to here. Return error. + return LIBYUV_FALSE; + } +#endif + jpeg_abort_decompress(decompress_struct_); + return LIBYUV_TRUE; +} + +// TODO(fbarchard): Allow rectangle to be specified: x, y, width, height. +LIBYUV_BOOL MJpegDecoder::DecodeToBuffers(uint8_t** planes, + int dst_width, + int dst_height) { + if (dst_width != GetWidth() || dst_height > GetHeight()) { + // ERROR: Bad dimensions + return LIBYUV_FALSE; + } +#ifdef HAVE_SETJMP + if (setjmp(error_mgr_->setjmp_buffer)) { + // We called into jpeglib, it experienced an error sometime during this + // function call, and we called longjmp() and rewound the stack to here. + // Return error. + return LIBYUV_FALSE; + } +#endif + if (!StartDecode()) { + return LIBYUV_FALSE; + } + SetScanlinePointers(databuf_); + int lines_left = dst_height; + // Compute amount of lines to skip to implement vertical crop. + // TODO(fbarchard): Ensure skip is a multiple of maximum component + // subsample. ie 2 + int skip = (GetHeight() - dst_height) / 2; + if (skip > 0) { + // There is no API to skip lines in the output data, so we read them + // into the temp buffer. + while (skip >= GetImageScanlinesPerImcuRow()) { + if (!DecodeImcuRow()) { + FinishDecode(); + return LIBYUV_FALSE; + } + skip -= GetImageScanlinesPerImcuRow(); + } + if (skip > 0) { + // Have a partial iMCU row left over to skip. Must read it and then + // copy the parts we want into the destination. 
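      // Example: for a 4:2:0 frame, GetImageScanlinesPerImcuRow() is
      // 2 * DCTSIZE = 16. With 10 lines still to skip, the Y plane
      // (subsample factor 1) skips 10 rows and copies the remaining 6,
      // while each chroma plane (subsample factor 2) skips 5 rows and
      // copies 3.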
+ if (!DecodeImcuRow()) { + FinishDecode(); + return LIBYUV_FALSE; + } + for (int i = 0; i < num_outbufs_; ++i) { + // TODO(fbarchard): Compute skip to avoid this + assert(skip % GetVertSubSampFactor(i) == 0); + int rows_to_skip = DivideAndRoundDown(skip, GetVertSubSampFactor(i)); + int scanlines_to_copy = + GetComponentScanlinesPerImcuRow(i) - rows_to_skip; + int data_to_skip = rows_to_skip * GetComponentStride(i); + CopyPlane(databuf_[i] + data_to_skip, GetComponentStride(i), planes[i], + GetComponentWidth(i), GetComponentWidth(i), + scanlines_to_copy); + planes[i] += scanlines_to_copy * GetComponentWidth(i); + } + lines_left -= (GetImageScanlinesPerImcuRow() - skip); + } + } + + // Read full MCUs but cropped horizontally + for (; lines_left > GetImageScanlinesPerImcuRow(); + lines_left -= GetImageScanlinesPerImcuRow()) { + if (!DecodeImcuRow()) { + FinishDecode(); + return LIBYUV_FALSE; + } + for (int i = 0; i < num_outbufs_; ++i) { + int scanlines_to_copy = GetComponentScanlinesPerImcuRow(i); + CopyPlane(databuf_[i], GetComponentStride(i), planes[i], + GetComponentWidth(i), GetComponentWidth(i), scanlines_to_copy); + planes[i] += scanlines_to_copy * GetComponentWidth(i); + } + } + + if (lines_left > 0) { + // Have a partial iMCU row left over to decode. + if (!DecodeImcuRow()) { + FinishDecode(); + return LIBYUV_FALSE; + } + for (int i = 0; i < num_outbufs_; ++i) { + int scanlines_to_copy = + DivideAndRoundUp(lines_left, GetVertSubSampFactor(i)); + CopyPlane(databuf_[i], GetComponentStride(i), planes[i], + GetComponentWidth(i), GetComponentWidth(i), scanlines_to_copy); + planes[i] += scanlines_to_copy * GetComponentWidth(i); + } + } + return FinishDecode(); +} + +LIBYUV_BOOL MJpegDecoder::DecodeToCallback(CallbackFunction fn, + void* opaque, + int dst_width, + int dst_height) { + if (dst_width != GetWidth() || dst_height > GetHeight()) { + // ERROR: Bad dimensions + return LIBYUV_FALSE; + } +#ifdef HAVE_SETJMP + if (setjmp(error_mgr_->setjmp_buffer)) { + // We called into jpeglib, it experienced an error sometime during this + // function call, and we called longjmp() and rewound the stack to here. + // Return error. + return LIBYUV_FALSE; + } +#endif + if (!StartDecode()) { + return LIBYUV_FALSE; + } + SetScanlinePointers(databuf_); + int lines_left = dst_height; + // TODO(fbarchard): Compute amount of lines to skip to implement vertical crop + int skip = (GetHeight() - dst_height) / 2; + if (skip > 0) { + while (skip >= GetImageScanlinesPerImcuRow()) { + if (!DecodeImcuRow()) { + FinishDecode(); + return LIBYUV_FALSE; + } + skip -= GetImageScanlinesPerImcuRow(); + } + if (skip > 0) { + // Have a partial iMCU row left over to skip. + if (!DecodeImcuRow()) { + FinishDecode(); + return LIBYUV_FALSE; + } + for (int i = 0; i < num_outbufs_; ++i) { + // TODO(fbarchard): Compute skip to avoid this + assert(skip % GetVertSubSampFactor(i) == 0); + int rows_to_skip = DivideAndRoundDown(skip, GetVertSubSampFactor(i)); + int data_to_skip = rows_to_skip * GetComponentStride(i); + // Change our own data buffer pointers so we can pass them to the + // callback. + databuf_[i] += data_to_skip; + } + int scanlines_to_copy = GetImageScanlinesPerImcuRow() - skip; + (*fn)(opaque, databuf_, databuf_strides_, scanlines_to_copy); + // Now change them back. 
+ for (int i = 0; i < num_outbufs_; ++i) { + int rows_to_skip = DivideAndRoundDown(skip, GetVertSubSampFactor(i)); + int data_to_skip = rows_to_skip * GetComponentStride(i); + databuf_[i] -= data_to_skip; + } + lines_left -= scanlines_to_copy; + } + } + // Read full MCUs until we get to the crop point. + for (; lines_left >= GetImageScanlinesPerImcuRow(); + lines_left -= GetImageScanlinesPerImcuRow()) { + if (!DecodeImcuRow()) { + FinishDecode(); + return LIBYUV_FALSE; + } + (*fn)(opaque, databuf_, databuf_strides_, GetImageScanlinesPerImcuRow()); + } + if (lines_left > 0) { + // Have a partial iMCU row left over to decode. + if (!DecodeImcuRow()) { + FinishDecode(); + return LIBYUV_FALSE; + } + (*fn)(opaque, databuf_, databuf_strides_, lines_left); + } + return FinishDecode(); +} + +void init_source(j_decompress_ptr cinfo) { + fill_input_buffer(cinfo); +} + +boolean fill_input_buffer(j_decompress_ptr cinfo) { + BufferVector* buf_vec = reinterpret_cast<BufferVector*>(cinfo->client_data); + if (buf_vec->pos >= buf_vec->len) { + assert(0 && "No more data"); + // ERROR: No more data + return FALSE; + } + cinfo->src->next_input_byte = buf_vec->buffers[buf_vec->pos].data; + cinfo->src->bytes_in_buffer = buf_vec->buffers[buf_vec->pos].len; + ++buf_vec->pos; + return TRUE; +} + +void skip_input_data(j_decompress_ptr cinfo, long num_bytes) { // NOLINT + cinfo->src->next_input_byte += num_bytes; +} + +void term_source(j_decompress_ptr cinfo) { + (void)cinfo; // Nothing to do. +} + +#ifdef HAVE_SETJMP +void ErrorHandler(j_common_ptr cinfo) { +// This is called when a jpeglib command experiences an error. Unfortunately +// jpeglib's error handling model is not very flexible, because it expects the +// error handler to not return--i.e., it wants the program to terminate. To +// recover from errors we use setjmp() as shown in their example. setjmp() is +// C's implementation for the "call with current continuation" functionality +// seen in some functional programming languages. +// A formatted message can be output, but is unsafe for release. +#ifdef DEBUG + char buf[JMSG_LENGTH_MAX]; + (*cinfo->err->format_message)(cinfo, buf); +// ERROR: Error in jpeglib: buf +#endif + + SetJmpErrorMgr* mgr = reinterpret_cast<SetJmpErrorMgr*>(cinfo->err); + // This rewinds the call stack to the point of the corresponding setjmp() + // and causes it to return (for a second time) with value 1. + longjmp(mgr->setjmp_buffer, 1); +} + +#ifndef DEBUG_MJPEG +// Suppress fprintf warnings. +void OutputHandler(j_common_ptr cinfo) { + (void)cinfo; +} +#endif +#endif // HAVE_SETJMP + +void MJpegDecoder::AllocOutputBuffers(int num_outbufs) { + if (num_outbufs != num_outbufs_) { + // We could perhaps optimize this case to resize the output buffers without + // necessarily having to delete and recreate each one, but it's not worth + // it. 
+ DestroyOutputBuffers(); + + scanlines_ = new uint8_t**[num_outbufs]; + scanlines_sizes_ = new int[num_outbufs]; + databuf_ = new uint8_t*[num_outbufs]; + databuf_strides_ = new int[num_outbufs]; + + for (int i = 0; i < num_outbufs; ++i) { + scanlines_[i] = NULL; + scanlines_sizes_[i] = 0; + databuf_[i] = NULL; + databuf_strides_[i] = 0; + } + + num_outbufs_ = num_outbufs; + } +} + +void MJpegDecoder::DestroyOutputBuffers() { + for (int i = 0; i < num_outbufs_; ++i) { + delete[] scanlines_[i]; + delete[] databuf_[i]; + } + delete[] scanlines_; + delete[] databuf_; + delete[] scanlines_sizes_; + delete[] databuf_strides_; + scanlines_ = NULL; + databuf_ = NULL; + scanlines_sizes_ = NULL; + databuf_strides_ = NULL; + num_outbufs_ = 0; +} + +// JDCT_IFAST and do_block_smoothing improve performance substantially. +LIBYUV_BOOL MJpegDecoder::StartDecode() { + decompress_struct_->raw_data_out = TRUE; + decompress_struct_->dct_method = JDCT_IFAST; // JDCT_ISLOW is default + decompress_struct_->dither_mode = JDITHER_NONE; + // Not applicable to 'raw': + decompress_struct_->do_fancy_upsampling = (boolean)(LIBYUV_FALSE); + // Only for buffered mode: + decompress_struct_->enable_2pass_quant = (boolean)(LIBYUV_FALSE); + // Blocky but fast: + decompress_struct_->do_block_smoothing = (boolean)(LIBYUV_FALSE); + + if (!jpeg_start_decompress(decompress_struct_)) { + // ERROR: Couldn't start JPEG decompressor"; + return LIBYUV_FALSE; + } + return LIBYUV_TRUE; +} + +LIBYUV_BOOL MJpegDecoder::FinishDecode() { + // jpeglib considers it an error if we finish without decoding the whole + // image, so we call "abort" rather than "finish". + jpeg_abort_decompress(decompress_struct_); + return LIBYUV_TRUE; +} + +void MJpegDecoder::SetScanlinePointers(uint8_t** data) { + for (int i = 0; i < num_outbufs_; ++i) { + uint8_t* data_i = data[i]; + for (int j = 0; j < scanlines_sizes_[i]; ++j) { + scanlines_[i][j] = data_i; + data_i += GetComponentStride(i); + } + } +} + +inline LIBYUV_BOOL MJpegDecoder::DecodeImcuRow() { + return (unsigned int)(GetImageScanlinesPerImcuRow()) == + jpeg_read_raw_data(decompress_struct_, scanlines_, + GetImageScanlinesPerImcuRow()); +} + +// The helper function which recognizes the jpeg sub-sampling type. +JpegSubsamplingType MJpegDecoder::JpegSubsamplingTypeHelper( + int* subsample_x, + int* subsample_y, + int number_of_components) { + if (number_of_components == 3) { // Color images. + if (subsample_x[0] == 1 && subsample_y[0] == 1 && subsample_x[1] == 2 && + subsample_y[1] == 2 && subsample_x[2] == 2 && subsample_y[2] == 2) { + return kJpegYuv420; + } + if (subsample_x[0] == 1 && subsample_y[0] == 1 && subsample_x[1] == 2 && + subsample_y[1] == 1 && subsample_x[2] == 2 && subsample_y[2] == 1) { + return kJpegYuv422; + } + if (subsample_x[0] == 1 && subsample_y[0] == 1 && subsample_x[1] == 1 && + subsample_y[1] == 1 && subsample_x[2] == 1 && subsample_y[2] == 1) { + return kJpegYuv444; + } + } else if (number_of_components == 1) { // Grey-scale images. + if (subsample_x[0] == 1 && subsample_y[0] == 1) { + return kJpegYuv400; + } + } + return kJpegUnknown; +} + +} // namespace libyuv +#endif // HAVE_JPEG diff --git a/media/libyuv/libyuv/source/mjpeg_validate.cc b/media/libyuv/libyuv/source/mjpeg_validate.cc new file mode 100644 index 0000000000..80c2cc0cb9 --- /dev/null +++ b/media/libyuv/libyuv/source/mjpeg_validate.cc @@ -0,0 +1,70 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. 
+ * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/mjpeg_decoder.h" + +#include <string.h> // For memchr. + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// Helper function to scan for EOI marker (0xff 0xd9). +static LIBYUV_BOOL ScanEOI(const uint8_t* sample, size_t sample_size) { + if (sample_size >= 2) { + const uint8_t* end = sample + sample_size - 1; + const uint8_t* it = sample; + while (it < end) { + // TODO(fbarchard): scan for 0xd9 instead. + it = (const uint8_t*)(memchr(it, 0xff, end - it)); + if (it == NULL) { + break; + } + if (it[1] == 0xd9) { + return LIBYUV_TRUE; // Success: Valid jpeg. + } + ++it; // Skip over current 0xff. + } + } + // ERROR: Invalid jpeg end code not found. Size sample_size + return LIBYUV_FALSE; +} + +// Helper function to validate the jpeg appears intact. +LIBYUV_BOOL ValidateJpeg(const uint8_t* sample, size_t sample_size) { + // Maximum size that ValidateJpeg will consider valid. + const size_t kMaxJpegSize = 0x7fffffffull; + const size_t kBackSearchSize = 1024; + if (sample_size < 64 || sample_size > kMaxJpegSize || !sample) { + // ERROR: Invalid jpeg size: sample_size + return LIBYUV_FALSE; + } + if (sample[0] != 0xff || sample[1] != 0xd8) { // SOI marker + // ERROR: Invalid jpeg initial start code + return LIBYUV_FALSE; + } + + // Look for the End Of Image (EOI) marker near the end of the buffer. + if (sample_size > kBackSearchSize) { + if (ScanEOI(sample + sample_size - kBackSearchSize, kBackSearchSize)) { + return LIBYUV_TRUE; // Success: Valid jpeg. + } + // Reduce search size for forward search. + sample_size = sample_size - kBackSearchSize + 1; + } + // Step over SOI marker and scan for EOI. + return ScanEOI(sample + 2, sample_size - 2); +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/media/libyuv/libyuv/source/planar_functions.cc b/media/libyuv/libyuv/source/planar_functions.cc new file mode 100644 index 0000000000..5eae3f763a --- /dev/null +++ b/media/libyuv/libyuv/source/planar_functions.cc @@ -0,0 +1,3587 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/planar_functions.h" + +#include <string.h> // for memset() + +#include "libyuv/cpu_id.h" +#ifdef HAVE_JPEG +#include "libyuv/mjpeg_decoder.h" +#endif +#include "libyuv/row.h" +#include "libyuv/scale_row.h" // for ScaleRowDown2 + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// Copy a plane of data +LIBYUV_API +void CopyPlane(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_y, + int dst_stride_y, + int width, + int height) { + int y; + void (*CopyRow)(const uint8_t* src, uint8_t* dst, int width) = CopyRow_C; + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_y = dst_y + (height - 1) * dst_stride_y; + dst_stride_y = -dst_stride_y; + } + // Coalesce rows. 
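  // Example: a 640x480 plane stored contiguously (stride == width == 640 on
  // both source and destination) is rewritten below as a single row of
  // 640 * 480 = 307200 bytes, so CopyRow is called once instead of 480 times.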
+ if (src_stride_y == width && dst_stride_y == width) { + width *= height; + height = 1; + src_stride_y = dst_stride_y = 0; + } + // Nothing to do. + if (src_y == dst_y && src_stride_y == dst_stride_y) { + return; + } + +#if defined(HAS_COPYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2; + } +#endif +#if defined(HAS_COPYROW_AVX) + if (TestCpuFlag(kCpuHasAVX)) { + CopyRow = IS_ALIGNED(width, 64) ? CopyRow_AVX : CopyRow_Any_AVX; + } +#endif +#if defined(HAS_COPYROW_ERMS) + if (TestCpuFlag(kCpuHasERMS)) { + CopyRow = CopyRow_ERMS; + } +#endif +#if defined(HAS_COPYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON; + } +#endif + + // Copy plane + for (y = 0; y < height; ++y) { + CopyRow(src_y, dst_y, width); + src_y += src_stride_y; + dst_y += dst_stride_y; + } +} + +// TODO(fbarchard): Consider support for negative height. +// TODO(fbarchard): Consider stride measured in bytes. +LIBYUV_API +void CopyPlane_16(const uint16_t* src_y, + int src_stride_y, + uint16_t* dst_y, + int dst_stride_y, + int width, + int height) { + int y; + void (*CopyRow)(const uint16_t* src, uint16_t* dst, int width) = CopyRow_16_C; + // Coalesce rows. + if (src_stride_y == width && dst_stride_y == width) { + width *= height; + height = 1; + src_stride_y = dst_stride_y = 0; + } +#if defined(HAS_COPYROW_16_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32)) { + CopyRow = CopyRow_16_SSE2; + } +#endif +#if defined(HAS_COPYROW_16_ERMS) + if (TestCpuFlag(kCpuHasERMS)) { + CopyRow = CopyRow_16_ERMS; + } +#endif +#if defined(HAS_COPYROW_16_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) { + CopyRow = CopyRow_16_NEON; + } +#endif + + // Copy plane + for (y = 0; y < height; ++y) { + CopyRow(src_y, dst_y, width); + src_y += src_stride_y; + dst_y += dst_stride_y; + } +} + +// Convert a plane of 16 bit data to 8 bit +LIBYUV_API +void Convert16To8Plane(const uint16_t* src_y, + int src_stride_y, + uint8_t* dst_y, + int dst_stride_y, + int scale, // 16384 for 10 bits + int width, + int height) { + int y; + void (*Convert16To8Row)(const uint16_t* src_y, uint8_t* dst_y, int scale, + int width) = Convert16To8Row_C; + + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_y = dst_y + (height - 1) * dst_stride_y; + dst_stride_y = -dst_stride_y; + } + // Coalesce rows. + if (src_stride_y == width && dst_stride_y == width) { + width *= height; + height = 1; + src_stride_y = dst_stride_y = 0; + } +#if defined(HAS_CONVERT16TO8ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + Convert16To8Row = Convert16To8Row_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + Convert16To8Row = Convert16To8Row_SSSE3; + } + } +#endif +#if defined(HAS_CONVERT16TO8ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + Convert16To8Row = Convert16To8Row_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + Convert16To8Row = Convert16To8Row_AVX2; + } + } +#endif + + // Convert plane + for (y = 0; y < height; ++y) { + Convert16To8Row(src_y, dst_y, scale, width); + src_y += src_stride_y; + dst_y += dst_stride_y; + } +} + +// Convert a plane of 8 bit data to 16 bit +LIBYUV_API +void Convert8To16Plane(const uint8_t* src_y, + int src_stride_y, + uint16_t* dst_y, + int dst_stride_y, + int scale, // 16384 for 10 bits + int width, + int height) { + int y; + void (*Convert8To16Row)(const uint8_t* src_y, uint16_t* dst_y, int scale, + int width) = Convert8To16Row_C; + + // Negative height means invert the image. 
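  // This is the library-wide flip convention: the destination pointer is
  // moved to its last row and the destination stride is negated, so output
  // rows are written bottom-to-top while the source is still read
  // top-to-bottom.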
+ if (height < 0) { + height = -height; + dst_y = dst_y + (height - 1) * dst_stride_y; + dst_stride_y = -dst_stride_y; + } + // Coalesce rows. + if (src_stride_y == width && dst_stride_y == width) { + width *= height; + height = 1; + src_stride_y = dst_stride_y = 0; + } +#if defined(HAS_CONVERT8TO16ROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + Convert8To16Row = Convert8To16Row_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + Convert8To16Row = Convert8To16Row_SSE2; + } + } +#endif +#if defined(HAS_CONVERT8TO16ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + Convert8To16Row = Convert8To16Row_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + Convert8To16Row = Convert8To16Row_AVX2; + } + } +#endif + + // Convert plane + for (y = 0; y < height; ++y) { + Convert8To16Row(src_y, dst_y, scale, width); + src_y += src_stride_y; + dst_y += dst_stride_y; + } +} + +// Copy I422. +LIBYUV_API +int I422Copy(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + int halfwidth = (width + 1) >> 1; + if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (height - 1) * src_stride_u; + src_v = src_v + (height - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + + if (dst_y) { + CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + } + CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, height); + CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, height); + return 0; +} + +// Copy I444. +LIBYUV_API +int I444Copy(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (height - 1) * src_stride_u; + src_v = src_v + (height - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + + if (dst_y) { + CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + } + CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, width, height); + CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, width, height); + return 0; +} + +// Copy I400. +LIBYUV_API +int I400ToI400(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_y, + int dst_stride_y, + int width, + int height) { + if (!src_y || !dst_y || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_stride_y = -src_stride_y; + } + CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + return 0; +} + +// Convert I420 to I400. 
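Before the I420-to-I400 conversion below, note how the copy functions above size their chroma: I422Copy copies chroma planes that are half width but full height (halfwidth = (width + 1) >> 1), I444Copy copies full-resolution chroma, and a NULL dst_y skips the luma copy in both. Illustrative sizes for tightly packed buffers, assuming the width and height parameters above:

  int halfwidth = (width + 1) >> 1;
  size_t i422_y_size  = (size_t)width * height;      // luma: full resolution
  size_t i422_uv_size = (size_t)halfwidth * height;  // I422 chroma: half width, full height
  size_t i444_uv_size = (size_t)width * height;      // I444 chroma: full resolution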
+LIBYUV_API +int I420ToI400(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + int width, + int height) { + (void)src_u; + (void)src_stride_u; + (void)src_v; + (void)src_stride_v; + if (!src_y || !dst_y || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_stride_y = -src_stride_y; + } + + CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + return 0; +} + +// Support function for NV12 etc UV channels. +// Width and height are plane sizes (typically half pixel width). +LIBYUV_API +void SplitUVPlane(const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + int y; + void (*SplitUVRow)(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, + int width) = SplitUVRow_C; + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_u = dst_u + (height - 1) * dst_stride_u; + dst_v = dst_v + (height - 1) * dst_stride_v; + dst_stride_u = -dst_stride_u; + dst_stride_v = -dst_stride_v; + } + // Coalesce rows. + if (src_stride_uv == width * 2 && dst_stride_u == width && + dst_stride_v == width) { + width *= height; + height = 1; + src_stride_uv = dst_stride_u = dst_stride_v = 0; + } +#if defined(HAS_SPLITUVROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + SplitUVRow = SplitUVRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + SplitUVRow = SplitUVRow_SSE2; + } + } +#endif +#if defined(HAS_SPLITUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + SplitUVRow = SplitUVRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + SplitUVRow = SplitUVRow_AVX2; + } + } +#endif +#if defined(HAS_SPLITUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + SplitUVRow = SplitUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + SplitUVRow = SplitUVRow_NEON; + } + } +#endif +#if defined(HAS_SPLITUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + SplitUVRow = SplitUVRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + SplitUVRow = SplitUVRow_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + // Copy a row of UV. + SplitUVRow(src_uv, dst_u, dst_v, width); + dst_u += dst_stride_u; + dst_v += dst_stride_v; + src_uv += src_stride_uv; + } +} + +LIBYUV_API +void MergeUVPlane(const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_uv, + int dst_stride_uv, + int width, + int height) { + int y; + void (*MergeUVRow)(const uint8_t* src_u, const uint8_t* src_v, + uint8_t* dst_uv, int width) = MergeUVRow_C; + // Coalesce rows. + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_uv = dst_uv + (height - 1) * dst_stride_uv; + dst_stride_uv = -dst_stride_uv; + } + // Coalesce rows. 
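SplitUVPlane above and MergeUVPlane here are the chroma halves of converting between biplanar NV12/NV21 and planar I420: the biplanar formats keep U and V interleaved in a single plane, while the planar formats keep them separate. A hedged sketch of the NV12 to I420 chroma step (plane pointers are placeholders; NV21 interleaves V first, so its U and V destinations would be swapped):

  int halfwidth  = (width + 1) >> 1;   // 4:2:0 chroma plane dimensions
  int halfheight = (height + 1) >> 1;
  SplitUVPlane(src_uv, src_stride_uv,  // interleaved UVUVUV... plane
               dst_u, dst_stride_u,
               dst_v, dst_stride_v,
               halfwidth, halfheight);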
+ if (src_stride_u == width && src_stride_v == width && + dst_stride_uv == width * 2) { + width *= height; + height = 1; + src_stride_u = src_stride_v = dst_stride_uv = 0; + } +#if defined(HAS_MERGEUVROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + MergeUVRow = MergeUVRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + MergeUVRow = MergeUVRow_SSE2; + } + } +#endif +#if defined(HAS_MERGEUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + MergeUVRow = MergeUVRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + MergeUVRow = MergeUVRow_AVX2; + } + } +#endif +#if defined(HAS_MERGEUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + MergeUVRow = MergeUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + MergeUVRow = MergeUVRow_NEON; + } + } +#endif +#if defined(HAS_MERGEUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + MergeUVRow = MergeUVRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + MergeUVRow = MergeUVRow_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + // Merge a row of U and V into a row of UV. + MergeUVRow(src_u, src_v, dst_uv, width); + src_u += src_stride_u; + src_v += src_stride_v; + dst_uv += dst_stride_uv; + } +} + +// Support function for NV12 etc RGB channels. +// Width and height are plane sizes (typically half pixel width). +LIBYUV_API +void SplitRGBPlane(const uint8_t* src_rgb, + int src_stride_rgb, + uint8_t* dst_r, + int dst_stride_r, + uint8_t* dst_g, + int dst_stride_g, + uint8_t* dst_b, + int dst_stride_b, + int width, + int height) { + int y; + void (*SplitRGBRow)(const uint8_t* src_rgb, uint8_t* dst_r, uint8_t* dst_g, + uint8_t* dst_b, int width) = SplitRGBRow_C; + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_r = dst_r + (height - 1) * dst_stride_r; + dst_g = dst_g + (height - 1) * dst_stride_g; + dst_b = dst_b + (height - 1) * dst_stride_b; + dst_stride_r = -dst_stride_r; + dst_stride_g = -dst_stride_g; + dst_stride_b = -dst_stride_b; + } + // Coalesce rows. + if (src_stride_rgb == width * 3 && dst_stride_r == width && + dst_stride_g == width && dst_stride_b == width) { + width *= height; + height = 1; + src_stride_rgb = dst_stride_r = dst_stride_g = dst_stride_b = 0; + } +#if defined(HAS_SPLITRGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + SplitRGBRow = SplitRGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + SplitRGBRow = SplitRGBRow_SSSE3; + } + } +#endif +#if defined(HAS_SPLITRGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + SplitRGBRow = SplitRGBRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + SplitRGBRow = SplitRGBRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + // Copy a row of RGB. + SplitRGBRow(src_rgb, dst_r, dst_g, dst_b, width); + dst_r += dst_stride_r; + dst_g += dst_stride_g; + dst_b += dst_stride_b; + src_rgb += src_stride_rgb; + } +} + +LIBYUV_API +void MergeRGBPlane(const uint8_t* src_r, + int src_stride_r, + const uint8_t* src_g, + int src_stride_g, + const uint8_t* src_b, + int src_stride_b, + uint8_t* dst_rgb, + int dst_stride_rgb, + int width, + int height) { + int y; + void (*MergeRGBRow)(const uint8_t* src_r, const uint8_t* src_g, + const uint8_t* src_b, uint8_t* dst_rgb, int width) = + MergeRGBRow_C; + // Coalesce rows. + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_rgb = dst_rgb + (height - 1) * dst_stride_rgb; + dst_stride_rgb = -dst_stride_rgb; + } + // Coalesce rows. 
+ if (src_stride_r == width && src_stride_g == width && src_stride_b == width && + dst_stride_rgb == width * 3) { + width *= height; + height = 1; + src_stride_r = src_stride_g = src_stride_b = dst_stride_rgb = 0; + } +#if defined(HAS_MERGERGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + MergeRGBRow = MergeRGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + MergeRGBRow = MergeRGBRow_SSSE3; + } + } +#endif +#if defined(HAS_MERGERGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + MergeRGBRow = MergeRGBRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + MergeRGBRow = MergeRGBRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + // Merge a row of U and V into a row of RGB. + MergeRGBRow(src_r, src_g, src_b, dst_rgb, width); + src_r += src_stride_r; + src_g += src_stride_g; + src_b += src_stride_b; + dst_rgb += dst_stride_rgb; + } +} + +// Mirror a plane of data. +void MirrorPlane(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_y, + int dst_stride_y, + int width, + int height) { + int y; + void (*MirrorRow)(const uint8_t* src, uint8_t* dst, int width) = MirrorRow_C; + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_stride_y = -src_stride_y; + } +#if defined(HAS_MIRRORROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + MirrorRow = MirrorRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + MirrorRow = MirrorRow_NEON; + } + } +#endif +#if defined(HAS_MIRRORROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + MirrorRow = MirrorRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + MirrorRow = MirrorRow_SSSE3; + } + } +#endif +#if defined(HAS_MIRRORROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + MirrorRow = MirrorRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + MirrorRow = MirrorRow_AVX2; + } + } +#endif +#if defined(HAS_MIRRORROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + MirrorRow = MirrorRow_Any_MSA; + if (IS_ALIGNED(width, 64)) { + MirrorRow = MirrorRow_MSA; + } + } +#endif + + // Mirror plane + for (y = 0; y < height; ++y) { + MirrorRow(src_y, dst_y, width); + src_y += src_stride_y; + dst_y += dst_stride_y; + } +} + +// Convert YUY2 to I422. +LIBYUV_API +int YUY2ToI422(const uint8_t* src_yuy2, + int src_stride_yuy2, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + int y; + void (*YUY2ToUV422Row)(const uint8_t* src_yuy2, uint8_t* dst_u, + uint8_t* dst_v, int width) = YUY2ToUV422Row_C; + void (*YUY2ToYRow)(const uint8_t* src_yuy2, uint8_t* dst_y, int width) = + YUY2ToYRow_C; + if (!src_yuy2 || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2; + src_stride_yuy2 = -src_stride_yuy2; + } + // Coalesce rows. 
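YUY2 packs 4:2:2 video as four bytes per two pixels in the order Y0 U Y1 V, which is why the stride test just below compares src_stride_yuy2 against width * 2. A plain-C fragment in the spirit of YUY2ToYRow_C / YUY2ToUV422Row_C, assuming src, dst_y, dst_u and dst_v point at the start of one row (illustrative only; an odd trailing pixel would need the small tail the real row functions provide):

  for (int x = 0; x < width - 1; x += 2) {
    dst_y[x]     = src[0];  // Y0
    dst_u[x / 2] = src[1];  // U shared by the pixel pair
    dst_y[x + 1] = src[2];  // Y1
    dst_v[x / 2] = src[3];  // V shared by the pixel pair
    src += 4;
  }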
+ if (src_stride_yuy2 == width * 2 && dst_stride_y == width && + dst_stride_u * 2 == width && dst_stride_v * 2 == width && + width * height <= 32768) { + width *= height; + height = 1; + src_stride_yuy2 = dst_stride_y = dst_stride_u = dst_stride_v = 0; + } +#if defined(HAS_YUY2TOYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + YUY2ToUV422Row = YUY2ToUV422Row_Any_SSE2; + YUY2ToYRow = YUY2ToYRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + YUY2ToUV422Row = YUY2ToUV422Row_SSE2; + YUY2ToYRow = YUY2ToYRow_SSE2; + } + } +#endif +#if defined(HAS_YUY2TOYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + YUY2ToUV422Row = YUY2ToUV422Row_Any_AVX2; + YUY2ToYRow = YUY2ToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + YUY2ToUV422Row = YUY2ToUV422Row_AVX2; + YUY2ToYRow = YUY2ToYRow_AVX2; + } + } +#endif +#if defined(HAS_YUY2TOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + YUY2ToYRow = YUY2ToYRow_Any_NEON; + YUY2ToUV422Row = YUY2ToUV422Row_Any_NEON; + if (IS_ALIGNED(width, 16)) { + YUY2ToYRow = YUY2ToYRow_NEON; + YUY2ToUV422Row = YUY2ToUV422Row_NEON; + } + } +#endif +#if defined(HAS_YUY2TOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + YUY2ToYRow = YUY2ToYRow_Any_MSA; + YUY2ToUV422Row = YUY2ToUV422Row_Any_MSA; + if (IS_ALIGNED(width, 32)) { + YUY2ToYRow = YUY2ToYRow_MSA; + YUY2ToUV422Row = YUY2ToUV422Row_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + YUY2ToUV422Row(src_yuy2, dst_u, dst_v, width); + YUY2ToYRow(src_yuy2, dst_y, width); + src_yuy2 += src_stride_yuy2; + dst_y += dst_stride_y; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + return 0; +} + +// Convert UYVY to I422. +LIBYUV_API +int UYVYToI422(const uint8_t* src_uyvy, + int src_stride_uyvy, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + int y; + void (*UYVYToUV422Row)(const uint8_t* src_uyvy, uint8_t* dst_u, + uint8_t* dst_v, int width) = UYVYToUV422Row_C; + void (*UYVYToYRow)(const uint8_t* src_uyvy, uint8_t* dst_y, int width) = + UYVYToYRow_C; + if (!src_uyvy || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy; + src_stride_uyvy = -src_stride_uyvy; + } + // Coalesce rows. 
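UYVYToI422 here is the same de-interleave with the bytes rotated by one position; for reference:

  //   YUY2 pixel pair: Y0 U0 Y1 V0   (luma at even byte offsets)
  //   UYVY pixel pair: U0 Y0 V0 Y1   (luma at odd byte offsets)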
+ if (src_stride_uyvy == width * 2 && dst_stride_y == width && + dst_stride_u * 2 == width && dst_stride_v * 2 == width && + width * height <= 32768) { + width *= height; + height = 1; + src_stride_uyvy = dst_stride_y = dst_stride_u = dst_stride_v = 0; + } +#if defined(HAS_UYVYTOYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + UYVYToUV422Row = UYVYToUV422Row_Any_SSE2; + UYVYToYRow = UYVYToYRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + UYVYToUV422Row = UYVYToUV422Row_SSE2; + UYVYToYRow = UYVYToYRow_SSE2; + } + } +#endif +#if defined(HAS_UYVYTOYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + UYVYToUV422Row = UYVYToUV422Row_Any_AVX2; + UYVYToYRow = UYVYToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + UYVYToUV422Row = UYVYToUV422Row_AVX2; + UYVYToYRow = UYVYToYRow_AVX2; + } + } +#endif +#if defined(HAS_UYVYTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + UYVYToYRow = UYVYToYRow_Any_NEON; + UYVYToUV422Row = UYVYToUV422Row_Any_NEON; + if (IS_ALIGNED(width, 16)) { + UYVYToYRow = UYVYToYRow_NEON; + UYVYToUV422Row = UYVYToUV422Row_NEON; + } + } +#endif +#if defined(HAS_UYVYTOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + UYVYToYRow = UYVYToYRow_Any_MSA; + UYVYToUV422Row = UYVYToUV422Row_Any_MSA; + if (IS_ALIGNED(width, 32)) { + UYVYToYRow = UYVYToYRow_MSA; + UYVYToUV422Row = UYVYToUV422Row_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + UYVYToUV422Row(src_uyvy, dst_u, dst_v, width); + UYVYToYRow(src_uyvy, dst_y, width); + src_uyvy += src_stride_uyvy; + dst_y += dst_stride_y; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + return 0; +} + +// Convert YUY2 to Y. +LIBYUV_API +int YUY2ToY(const uint8_t* src_yuy2, + int src_stride_yuy2, + uint8_t* dst_y, + int dst_stride_y, + int width, + int height) { + int y; + void (*YUY2ToYRow)(const uint8_t* src_yuy2, uint8_t* dst_y, int width) = + YUY2ToYRow_C; + if (!src_yuy2 || !dst_y || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2; + src_stride_yuy2 = -src_stride_yuy2; + } + // Coalesce rows. + if (src_stride_yuy2 == width * 2 && dst_stride_y == width) { + width *= height; + height = 1; + src_stride_yuy2 = dst_stride_y = 0; + } +#if defined(HAS_YUY2TOYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + YUY2ToYRow = YUY2ToYRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + YUY2ToYRow = YUY2ToYRow_SSE2; + } + } +#endif +#if defined(HAS_YUY2TOYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + YUY2ToYRow = YUY2ToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + YUY2ToYRow = YUY2ToYRow_AVX2; + } + } +#endif +#if defined(HAS_YUY2TOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + YUY2ToYRow = YUY2ToYRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + YUY2ToYRow = YUY2ToYRow_NEON; + } + } +#endif +#if defined(HAS_YUY2TOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + YUY2ToYRow = YUY2ToYRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + YUY2ToYRow = YUY2ToYRow_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + YUY2ToYRow(src_yuy2, dst_y, width); + src_yuy2 += src_stride_yuy2; + dst_y += dst_stride_y; + } + return 0; +} + +// Mirror I400 with optional flipping +LIBYUV_API +int I400Mirror(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_y, + int dst_stride_y, + int width, + int height) { + if (!src_y || !dst_y || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_stride_y = -src_stride_y; + } + + MirrorPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + return 0; +} + +// Mirror I420 with optional flipping +LIBYUV_API +int I420Mirror(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + if (!src_y || !src_u || !src_v || !dst_y || !dst_u || !dst_v || width <= 0 || + height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + halfheight = (height + 1) >> 1; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (halfheight - 1) * src_stride_u; + src_v = src_v + (halfheight - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + + if (dst_y) { + MirrorPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + } + MirrorPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight); + MirrorPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight); + return 0; +} + +// ARGB mirror. +LIBYUV_API +int ARGBMirror(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + int y; + void (*ARGBMirrorRow)(const uint8_t* src, uint8_t* dst, int width) = + ARGBMirrorRow_C; + if (!src_argb || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } +#if defined(HAS_ARGBMIRRORROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBMirrorRow = ARGBMirrorRow_Any_NEON; + if (IS_ALIGNED(width, 4)) { + ARGBMirrorRow = ARGBMirrorRow_NEON; + } + } +#endif +#if defined(HAS_ARGBMIRRORROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ARGBMirrorRow = ARGBMirrorRow_Any_SSE2; + if (IS_ALIGNED(width, 4)) { + ARGBMirrorRow = ARGBMirrorRow_SSE2; + } + } +#endif +#if defined(HAS_ARGBMIRRORROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBMirrorRow = ARGBMirrorRow_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBMirrorRow = ARGBMirrorRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBMIRRORROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBMirrorRow = ARGBMirrorRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBMirrorRow = ARGBMirrorRow_MSA; + } + } +#endif + + // Mirror plane + for (y = 0; y < height; ++y) { + ARGBMirrorRow(src_argb, dst_argb, width); + src_argb += src_stride_argb; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Get a blender that optimized for the CPU and pixel count. +// As there are 6 blenders to choose from, the caller should try to use +// the same blend function for all pixels if possible. 
+LIBYUV_API +ARGBBlendRow GetARGBBlend() { + void (*ARGBBlendRow)(const uint8_t* src_argb, const uint8_t* src_argb1, + uint8_t* dst_argb, int width) = ARGBBlendRow_C; +#if defined(HAS_ARGBBLENDROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBBlendRow = ARGBBlendRow_SSSE3; + return ARGBBlendRow; + } +#endif +#if defined(HAS_ARGBBLENDROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBBlendRow = ARGBBlendRow_NEON; + } +#endif +#if defined(HAS_ARGBBLENDROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBBlendRow = ARGBBlendRow_MSA; + } +#endif + return ARGBBlendRow; +} + +// Alpha Blend 2 ARGB images and store to destination. +LIBYUV_API +int ARGBBlend(const uint8_t* src_argb0, + int src_stride_argb0, + const uint8_t* src_argb1, + int src_stride_argb1, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + int y; + void (*ARGBBlendRow)(const uint8_t* src_argb, const uint8_t* src_argb1, + uint8_t* dst_argb, int width) = GetARGBBlend(); + if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } + // Coalesce rows. + if (src_stride_argb0 == width * 4 && src_stride_argb1 == width * 4 && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0; + } + + for (y = 0; y < height; ++y) { + ARGBBlendRow(src_argb0, src_argb1, dst_argb, width); + src_argb0 += src_stride_argb0; + src_argb1 += src_stride_argb1; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Alpha Blend plane and store to destination. +LIBYUV_API +int BlendPlane(const uint8_t* src_y0, + int src_stride_y0, + const uint8_t* src_y1, + int src_stride_y1, + const uint8_t* alpha, + int alpha_stride, + uint8_t* dst_y, + int dst_stride_y, + int width, + int height) { + int y; + void (*BlendPlaneRow)(const uint8_t* src0, const uint8_t* src1, + const uint8_t* alpha, uint8_t* dst, int width) = + BlendPlaneRow_C; + if (!src_y0 || !src_y1 || !alpha || !dst_y || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_y = dst_y + (height - 1) * dst_stride_y; + dst_stride_y = -dst_stride_y; + } + + // Coalesce rows for Y plane. + if (src_stride_y0 == width && src_stride_y1 == width && + alpha_stride == width && dst_stride_y == width) { + width *= height; + height = 1; + src_stride_y0 = src_stride_y1 = alpha_stride = dst_stride_y = 0; + } + +#if defined(HAS_BLENDPLANEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + BlendPlaneRow = BlendPlaneRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + BlendPlaneRow = BlendPlaneRow_SSSE3; + } + } +#endif +#if defined(HAS_BLENDPLANEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + BlendPlaneRow = BlendPlaneRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + BlendPlaneRow = BlendPlaneRow_AVX2; + } + } +#endif + + for (y = 0; y < height; ++y) { + BlendPlaneRow(src_y0, src_y1, alpha, dst_y, width); + src_y0 += src_stride_y0; + src_y1 += src_stride_y1; + alpha += alpha_stride; + dst_y += dst_stride_y; + } + return 0; +} + +#define MAXTWIDTH 2048 +// Alpha Blend YUV images and store to destination. 
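The ARGB blenders above implement the pre-attenuated form of the blend equation, p = f + (1 - a) * b, spelled out later in this file above ARGBAttenuate; the foreground therefore appears to be expected already premultiplied by its alpha, so a typical call pairs ARGBBlend with ARGBAttenuate. A hedged sketch, with frame pointers as placeholders and the premultiplied foreground passed as the first source:

  // Premultiply the foreground, then composite it over the background.
  ARGBAttenuate(fg, fg_stride, fg_premul, fg_premul_stride, width, height);
  ARGBBlend(fg_premul, fg_premul_stride, bg, bg_stride,
            dst, dst_stride, width, height);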
+LIBYUV_API +int I420Blend(const uint8_t* src_y0, + int src_stride_y0, + const uint8_t* src_u0, + int src_stride_u0, + const uint8_t* src_v0, + int src_stride_v0, + const uint8_t* src_y1, + int src_stride_y1, + const uint8_t* src_u1, + int src_stride_u1, + const uint8_t* src_v1, + int src_stride_v1, + const uint8_t* alpha, + int alpha_stride, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + int y; + // Half width/height for UV. + int halfwidth = (width + 1) >> 1; + void (*BlendPlaneRow)(const uint8_t* src0, const uint8_t* src1, + const uint8_t* alpha, uint8_t* dst, int width) = + BlendPlaneRow_C; + void (*ScaleRowDown2)(const uint8_t* src_ptr, ptrdiff_t src_stride, + uint8_t* dst_ptr, int dst_width) = ScaleRowDown2Box_C; + if (!src_y0 || !src_u0 || !src_v0 || !src_y1 || !src_u1 || !src_v1 || + !alpha || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } + + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_y = dst_y + (height - 1) * dst_stride_y; + dst_stride_y = -dst_stride_y; + } + + // Blend Y plane. + BlendPlane(src_y0, src_stride_y0, src_y1, src_stride_y1, alpha, alpha_stride, + dst_y, dst_stride_y, width, height); + +#if defined(HAS_BLENDPLANEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + BlendPlaneRow = BlendPlaneRow_Any_SSSE3; + if (IS_ALIGNED(halfwidth, 8)) { + BlendPlaneRow = BlendPlaneRow_SSSE3; + } + } +#endif +#if defined(HAS_BLENDPLANEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + BlendPlaneRow = BlendPlaneRow_Any_AVX2; + if (IS_ALIGNED(halfwidth, 32)) { + BlendPlaneRow = BlendPlaneRow_AVX2; + } + } +#endif + if (!IS_ALIGNED(width, 2)) { + ScaleRowDown2 = ScaleRowDown2Box_Odd_C; + } +#if defined(HAS_SCALEROWDOWN2_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ScaleRowDown2 = ScaleRowDown2Box_Odd_NEON; + if (IS_ALIGNED(width, 2)) { + ScaleRowDown2 = ScaleRowDown2Box_Any_NEON; + if (IS_ALIGNED(halfwidth, 16)) { + ScaleRowDown2 = ScaleRowDown2Box_NEON; + } + } + } +#endif +#if defined(HAS_SCALEROWDOWN2_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ScaleRowDown2 = ScaleRowDown2Box_Odd_SSSE3; + if (IS_ALIGNED(width, 2)) { + ScaleRowDown2 = ScaleRowDown2Box_Any_SSSE3; + if (IS_ALIGNED(halfwidth, 16)) { + ScaleRowDown2 = ScaleRowDown2Box_SSSE3; + } + } + } +#endif +#if defined(HAS_SCALEROWDOWN2_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ScaleRowDown2 = ScaleRowDown2Box_Odd_AVX2; + if (IS_ALIGNED(width, 2)) { + ScaleRowDown2 = ScaleRowDown2Box_Any_AVX2; + if (IS_ALIGNED(halfwidth, 32)) { + ScaleRowDown2 = ScaleRowDown2Box_AVX2; + } + } + } +#endif + + // Row buffer for intermediate alpha pixels. + align_buffer_64(halfalpha, halfwidth); + for (y = 0; y < height; y += 2) { + // last row of odd height image use 1 row of alpha instead of 2. + if (y == (height - 1)) { + alpha_stride = 0; + } + // Subsample 2 rows of UV to half width and half height. + ScaleRowDown2(alpha, alpha_stride, halfalpha, halfwidth); + alpha += alpha_stride * 2; + BlendPlaneRow(src_u0, src_u1, halfalpha, dst_u, halfwidth); + BlendPlaneRow(src_v0, src_v1, halfalpha, dst_v, halfwidth); + src_u0 += src_stride_u0; + src_u1 += src_stride_u1; + dst_u += dst_stride_u; + src_v0 += src_stride_v0; + src_v1 += src_stride_v1; + dst_v += dst_stride_v; + } + free_aligned_buffer_64(halfalpha); + return 0; +} + +// Multiply 2 ARGB images and store to destination. 
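I420Blend above blends the full-resolution Y plane directly, then box-filters each 2x2 block of the alpha plane down to half resolution (ScaleRowDown2Box into the halfalpha row buffer) and uses that to blend the half-size U and V rows; for odd-height images the last pass reuses a single alpha row by setting alpha_stride to 0. Rough arithmetic for one 2x2 alpha block:

  alpha block {255, 255, 0, 0}  ->  box average of about (255 + 255 + 0 + 0) / 4 = 128
  so the single co-sited U/V sample blends the two sources roughly evenly.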
+LIBYUV_API +int ARGBMultiply(const uint8_t* src_argb0, + int src_stride_argb0, + const uint8_t* src_argb1, + int src_stride_argb1, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + int y; + void (*ARGBMultiplyRow)(const uint8_t* src0, const uint8_t* src1, + uint8_t* dst, int width) = ARGBMultiplyRow_C; + if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } + // Coalesce rows. + if (src_stride_argb0 == width * 4 && src_stride_argb1 == width * 4 && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0; + } +#if defined(HAS_ARGBMULTIPLYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ARGBMultiplyRow = ARGBMultiplyRow_Any_SSE2; + if (IS_ALIGNED(width, 4)) { + ARGBMultiplyRow = ARGBMultiplyRow_SSE2; + } + } +#endif +#if defined(HAS_ARGBMULTIPLYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBMultiplyRow = ARGBMultiplyRow_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBMultiplyRow = ARGBMultiplyRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBMULTIPLYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBMultiplyRow = ARGBMultiplyRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBMultiplyRow = ARGBMultiplyRow_NEON; + } + } +#endif +#if defined(HAS_ARGBMULTIPLYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBMultiplyRow = ARGBMultiplyRow_Any_MSA; + if (IS_ALIGNED(width, 4)) { + ARGBMultiplyRow = ARGBMultiplyRow_MSA; + } + } +#endif + + // Multiply plane + for (y = 0; y < height; ++y) { + ARGBMultiplyRow(src_argb0, src_argb1, dst_argb, width); + src_argb0 += src_stride_argb0; + src_argb1 += src_stride_argb1; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Add 2 ARGB images and store to destination. +LIBYUV_API +int ARGBAdd(const uint8_t* src_argb0, + int src_stride_argb0, + const uint8_t* src_argb1, + int src_stride_argb1, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + int y; + void (*ARGBAddRow)(const uint8_t* src0, const uint8_t* src1, uint8_t* dst, + int width) = ARGBAddRow_C; + if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } + // Coalesce rows. 
+ if (src_stride_argb0 == width * 4 && src_stride_argb1 == width * 4 && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0; + } +#if defined(HAS_ARGBADDROW_SSE2) && (defined(_MSC_VER) && !defined(__clang__)) + if (TestCpuFlag(kCpuHasSSE2)) { + ARGBAddRow = ARGBAddRow_SSE2; + } +#endif +#if defined(HAS_ARGBADDROW_SSE2) && !(defined(_MSC_VER) && !defined(__clang__)) + if (TestCpuFlag(kCpuHasSSE2)) { + ARGBAddRow = ARGBAddRow_Any_SSE2; + if (IS_ALIGNED(width, 4)) { + ARGBAddRow = ARGBAddRow_SSE2; + } + } +#endif +#if defined(HAS_ARGBADDROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBAddRow = ARGBAddRow_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBAddRow = ARGBAddRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBADDROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBAddRow = ARGBAddRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBAddRow = ARGBAddRow_NEON; + } + } +#endif +#if defined(HAS_ARGBADDROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBAddRow = ARGBAddRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + ARGBAddRow = ARGBAddRow_MSA; + } + } +#endif + + // Add plane + for (y = 0; y < height; ++y) { + ARGBAddRow(src_argb0, src_argb1, dst_argb, width); + src_argb0 += src_stride_argb0; + src_argb1 += src_stride_argb1; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Subtract 2 ARGB images and store to destination. +LIBYUV_API +int ARGBSubtract(const uint8_t* src_argb0, + int src_stride_argb0, + const uint8_t* src_argb1, + int src_stride_argb1, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + int y; + void (*ARGBSubtractRow)(const uint8_t* src0, const uint8_t* src1, + uint8_t* dst, int width) = ARGBSubtractRow_C; + if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } + // Coalesce rows. 
+ if (src_stride_argb0 == width * 4 && src_stride_argb1 == width * 4 && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0; + } +#if defined(HAS_ARGBSUBTRACTROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ARGBSubtractRow = ARGBSubtractRow_Any_SSE2; + if (IS_ALIGNED(width, 4)) { + ARGBSubtractRow = ARGBSubtractRow_SSE2; + } + } +#endif +#if defined(HAS_ARGBSUBTRACTROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBSubtractRow = ARGBSubtractRow_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBSubtractRow = ARGBSubtractRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBSUBTRACTROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBSubtractRow = ARGBSubtractRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBSubtractRow = ARGBSubtractRow_NEON; + } + } +#endif +#if defined(HAS_ARGBSUBTRACTROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBSubtractRow = ARGBSubtractRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + ARGBSubtractRow = ARGBSubtractRow_MSA; + } + } +#endif + + // Subtract plane + for (y = 0; y < height; ++y) { + ARGBSubtractRow(src_argb0, src_argb1, dst_argb, width); + src_argb0 += src_stride_argb0; + src_argb1 += src_stride_argb1; + dst_argb += dst_stride_argb; + } + return 0; +} +// Convert I422 to RGBA with matrix +static int I422ToRGBAMatrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgba, + int dst_stride_rgba, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*I422ToRGBARow)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I422ToRGBARow_C; + if (!src_y || !src_u || !src_v || !dst_rgba || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_rgba = dst_rgba + (height - 1) * dst_stride_rgba; + dst_stride_rgba = -dst_stride_rgba; + } +#if defined(HAS_I422TORGBAROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I422ToRGBARow = I422ToRGBARow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToRGBARow = I422ToRGBARow_SSSE3; + } + } +#endif +#if defined(HAS_I422TORGBAROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToRGBARow = I422ToRGBARow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I422ToRGBARow = I422ToRGBARow_AVX2; + } + } +#endif +#if defined(HAS_I422TORGBAROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToRGBARow = I422ToRGBARow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToRGBARow = I422ToRGBARow_NEON; + } + } +#endif +#if defined(HAS_I422TORGBAROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToRGBARow = I422ToRGBARow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I422ToRGBARow = I422ToRGBARow_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + I422ToRGBARow(src_y, src_u, src_v, dst_rgba, yuvconstants, width); + dst_rgba += dst_stride_rgba; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + return 0; +} + +// Convert I422 to RGBA. +LIBYUV_API +int I422ToRGBA(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgba, + int dst_stride_rgba, + int width, + int height) { + return I422ToRGBAMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_rgba, dst_stride_rgba, + &kYuvI601Constants, width, height); +} + +// Convert I422 to BGRA. 
+LIBYUV_API +int I422ToBGRA(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_bgra, + int dst_stride_bgra, + int width, + int height) { + return I422ToRGBAMatrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_bgra, dst_stride_bgra, + &kYvuI601Constants, // Use Yvu matrix + width, height); +} + +// Convert NV12 to RGB565. +LIBYUV_API +int NV12ToRGB565(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_rgb565, + int dst_stride_rgb565, + int width, + int height) { + int y; + void (*NV12ToRGB565Row)( + const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = NV12ToRGB565Row_C; + if (!src_y || !src_uv || !dst_rgb565 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565; + dst_stride_rgb565 = -dst_stride_rgb565; + } +#if defined(HAS_NV12TORGB565ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + NV12ToRGB565Row = NV12ToRGB565Row_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + NV12ToRGB565Row = NV12ToRGB565Row_SSSE3; + } + } +#endif +#if defined(HAS_NV12TORGB565ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + NV12ToRGB565Row = NV12ToRGB565Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + NV12ToRGB565Row = NV12ToRGB565Row_AVX2; + } + } +#endif +#if defined(HAS_NV12TORGB565ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + NV12ToRGB565Row = NV12ToRGB565Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + NV12ToRGB565Row = NV12ToRGB565Row_NEON; + } + } +#endif +#if defined(HAS_NV12TORGB565ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + NV12ToRGB565Row = NV12ToRGB565Row_Any_MSA; + if (IS_ALIGNED(width, 8)) { + NV12ToRGB565Row = NV12ToRGB565Row_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + NV12ToRGB565Row(src_y, src_uv, dst_rgb565, &kYuvI601Constants, width); + dst_rgb565 += dst_stride_rgb565; + src_y += src_stride_y; + if (y & 1) { + src_uv += src_stride_uv; + } + } + return 0; +} + +// Convert RAW to RGB24. +LIBYUV_API +int RAWToRGB24(const uint8_t* src_raw, + int src_stride_raw, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + int width, + int height) { + int y; + void (*RAWToRGB24Row)(const uint8_t* src_rgb, uint8_t* dst_rgb24, int width) = + RAWToRGB24Row_C; + if (!src_raw || !dst_rgb24 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_raw = src_raw + (height - 1) * src_stride_raw; + src_stride_raw = -src_stride_raw; + } + // Coalesce rows. 
+ if (src_stride_raw == width * 3 && dst_stride_rgb24 == width * 3) { + width *= height; + height = 1; + src_stride_raw = dst_stride_rgb24 = 0; + } +#if defined(HAS_RAWTORGB24ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + RAWToRGB24Row = RAWToRGB24Row_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + RAWToRGB24Row = RAWToRGB24Row_SSSE3; + } + } +#endif +#if defined(HAS_RAWTORGB24ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + RAWToRGB24Row = RAWToRGB24Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + RAWToRGB24Row = RAWToRGB24Row_NEON; + } + } +#endif +#if defined(HAS_RAWTORGB24ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + RAWToRGB24Row = RAWToRGB24Row_Any_MSA; + if (IS_ALIGNED(width, 16)) { + RAWToRGB24Row = RAWToRGB24Row_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + RAWToRGB24Row(src_raw, dst_rgb24, width); + src_raw += src_stride_raw; + dst_rgb24 += dst_stride_rgb24; + } + return 0; +} + +LIBYUV_API +void SetPlane(uint8_t* dst_y, + int dst_stride_y, + int width, + int height, + uint32_t value) { + int y; + void (*SetRow)(uint8_t * dst, uint8_t value, int width) = SetRow_C; + if (height < 0) { + height = -height; + dst_y = dst_y + (height - 1) * dst_stride_y; + dst_stride_y = -dst_stride_y; + } + // Coalesce rows. + if (dst_stride_y == width) { + width *= height; + height = 1; + dst_stride_y = 0; + } +#if defined(HAS_SETROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + SetRow = SetRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + SetRow = SetRow_NEON; + } + } +#endif +#if defined(HAS_SETROW_X86) + if (TestCpuFlag(kCpuHasX86)) { + SetRow = SetRow_Any_X86; + if (IS_ALIGNED(width, 4)) { + SetRow = SetRow_X86; + } + } +#endif +#if defined(HAS_SETROW_ERMS) + if (TestCpuFlag(kCpuHasERMS)) { + SetRow = SetRow_ERMS; + } +#endif +#if defined(HAS_SETROW_MSA) + if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 16)) { + SetRow = SetRow_MSA; + } +#endif + + // Set plane + for (y = 0; y < height; ++y) { + SetRow(dst_y, value, width); + dst_y += dst_stride_y; + } +} + +// Draw a rectangle into I420 +LIBYUV_API +int I420Rect(uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int x, + int y, + int width, + int height, + int value_y, + int value_u, + int value_v) { + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + uint8_t* start_y = dst_y + y * dst_stride_y + x; + uint8_t* start_u = dst_u + (y / 2) * dst_stride_u + (x / 2); + uint8_t* start_v = dst_v + (y / 2) * dst_stride_v + (x / 2); + if (!dst_y || !dst_u || !dst_v || width <= 0 || height == 0 || x < 0 || + y < 0 || value_y < 0 || value_y > 255 || value_u < 0 || value_u > 255 || + value_v < 0 || value_v > 255) { + return -1; + } + + SetPlane(start_y, dst_stride_y, width, height, value_y); + SetPlane(start_u, dst_stride_u, halfwidth, halfheight, value_u); + SetPlane(start_v, dst_stride_v, halfwidth, halfheight, value_v); + return 0; +} + +// Draw a rectangle into ARGB +LIBYUV_API +int ARGBRect(uint8_t* dst_argb, + int dst_stride_argb, + int dst_x, + int dst_y, + int width, + int height, + uint32_t value) { + int y; + void (*ARGBSetRow)(uint8_t * dst_argb, uint32_t value, int width) = + ARGBSetRow_C; + if (!dst_argb || width <= 0 || height == 0 || dst_x < 0 || dst_y < 0) { + return -1; + } + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } + dst_argb += dst_y * dst_stride_argb + dst_x * 4; + // Coalesce rows. 
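SetPlane, I420Rect and ARGBRect above are the fill primitives; in I420Rect the chroma rectangle starts at (x/2, y/2) with half-resolution width and height, and 128 is the neutral chroma value. A hedged sketch that paints a mid-gray w x h rectangle at (x, y), with plane pointers as placeholders:

  I420Rect(dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v, dst_stride_v,
           x, y, w, h,
           128 /* value_y: mid gray       */,
           128 /* value_u: neutral chroma */,
           128 /* value_v: neutral chroma */);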
+ if (dst_stride_argb == width * 4) { + width *= height; + height = 1; + dst_stride_argb = 0; + } + +#if defined(HAS_ARGBSETROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBSetRow = ARGBSetRow_Any_NEON; + if (IS_ALIGNED(width, 4)) { + ARGBSetRow = ARGBSetRow_NEON; + } + } +#endif +#if defined(HAS_ARGBSETROW_X86) + if (TestCpuFlag(kCpuHasX86)) { + ARGBSetRow = ARGBSetRow_X86; + } +#endif +#if defined(HAS_ARGBSETROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBSetRow = ARGBSetRow_Any_MSA; + if (IS_ALIGNED(width, 4)) { + ARGBSetRow = ARGBSetRow_MSA; + } + } +#endif + + // Set plane + for (y = 0; y < height; ++y) { + ARGBSetRow(dst_argb, value, width); + dst_argb += dst_stride_argb; + } + return 0; +} + +// Convert unattentuated ARGB to preattenuated ARGB. +// An unattenutated ARGB alpha blend uses the formula +// p = a * f + (1 - a) * b +// where +// p is output pixel +// f is foreground pixel +// b is background pixel +// a is alpha value from foreground pixel +// An preattenutated ARGB alpha blend uses the formula +// p = f + (1 - a) * b +// where +// f is foreground pixel premultiplied by alpha + +LIBYUV_API +int ARGBAttenuate(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + int y; + void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb, + int width) = ARGBAttenuateRow_C; + if (!src_argb || !dst_argb || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_argb = dst_stride_argb = 0; + } +#if defined(HAS_ARGBATTENUATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3; + if (IS_ALIGNED(width, 4)) { + ARGBAttenuateRow = ARGBAttenuateRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_NEON; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGBAttenuateRow(src_argb, dst_argb, width); + src_argb += src_stride_argb; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Convert preattentuated ARGB to unattenuated ARGB. +LIBYUV_API +int ARGBUnattenuate(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + int y; + void (*ARGBUnattenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb, + int width) = ARGBUnattenuateRow_C; + if (!src_argb || !dst_argb || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. 
+ if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_argb = dst_stride_argb = 0; + } +#if defined(HAS_ARGBUNATTENUATEROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ARGBUnattenuateRow = ARGBUnattenuateRow_Any_SSE2; + if (IS_ALIGNED(width, 4)) { + ARGBUnattenuateRow = ARGBUnattenuateRow_SSE2; + } + } +#endif +#if defined(HAS_ARGBUNATTENUATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBUnattenuateRow = ARGBUnattenuateRow_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBUnattenuateRow = ARGBUnattenuateRow_AVX2; + } + } +#endif + // TODO(fbarchard): Neon version. + + for (y = 0; y < height; ++y) { + ARGBUnattenuateRow(src_argb, dst_argb, width); + src_argb += src_stride_argb; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Convert ARGB to Grayed ARGB. +LIBYUV_API +int ARGBGrayTo(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + int y; + void (*ARGBGrayRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) = + ARGBGrayRow_C; + if (!src_argb || !dst_argb || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_argb = dst_stride_argb = 0; + } +#if defined(HAS_ARGBGRAYROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) { + ARGBGrayRow = ARGBGrayRow_SSSE3; + } +#endif +#if defined(HAS_ARGBGRAYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { + ARGBGrayRow = ARGBGrayRow_NEON; + } +#endif +#if defined(HAS_ARGBGRAYROW_MSA) + if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) { + ARGBGrayRow = ARGBGrayRow_MSA; + } +#endif + + for (y = 0; y < height; ++y) { + ARGBGrayRow(src_argb, dst_argb, width); + src_argb += src_stride_argb; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Make a rectangle of ARGB gray scale. +LIBYUV_API +int ARGBGray(uint8_t* dst_argb, + int dst_stride_argb, + int dst_x, + int dst_y, + int width, + int height) { + int y; + void (*ARGBGrayRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) = + ARGBGrayRow_C; + uint8_t* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; + if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0) { + return -1; + } + // Coalesce rows. + if (dst_stride_argb == width * 4) { + width *= height; + height = 1; + dst_stride_argb = 0; + } +#if defined(HAS_ARGBGRAYROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) { + ARGBGrayRow = ARGBGrayRow_SSSE3; + } +#endif +#if defined(HAS_ARGBGRAYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { + ARGBGrayRow = ARGBGrayRow_NEON; + } +#endif +#if defined(HAS_ARGBGRAYROW_MSA) + if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) { + ARGBGrayRow = ARGBGrayRow_MSA; + } +#endif + + for (y = 0; y < height; ++y) { + ARGBGrayRow(dst, dst, width); + dst += dst_stride_argb; + } + return 0; +} + +// Make a rectangle of ARGB Sepia tone. +LIBYUV_API +int ARGBSepia(uint8_t* dst_argb, + int dst_stride_argb, + int dst_x, + int dst_y, + int width, + int height) { + int y; + void (*ARGBSepiaRow)(uint8_t * dst_argb, int width) = ARGBSepiaRow_C; + uint8_t* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; + if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0) { + return -1; + } + // Coalesce rows. 
+ if (dst_stride_argb == width * 4) { + width *= height; + height = 1; + dst_stride_argb = 0; + } +#if defined(HAS_ARGBSEPIAROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) { + ARGBSepiaRow = ARGBSepiaRow_SSSE3; + } +#endif +#if defined(HAS_ARGBSEPIAROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { + ARGBSepiaRow = ARGBSepiaRow_NEON; + } +#endif +#if defined(HAS_ARGBSEPIAROW_MSA) + if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) { + ARGBSepiaRow = ARGBSepiaRow_MSA; + } +#endif + + for (y = 0; y < height; ++y) { + ARGBSepiaRow(dst, width); + dst += dst_stride_argb; + } + return 0; +} + +// Apply a 4x4 matrix to each ARGB pixel. +// Note: Normally for shading, but can be used to swizzle or invert. +LIBYUV_API +int ARGBColorMatrix(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + const int8_t* matrix_argb, + int width, + int height) { + int y; + void (*ARGBColorMatrixRow)(const uint8_t* src_argb, uint8_t* dst_argb, + const int8_t* matrix_argb, int width) = + ARGBColorMatrixRow_C; + if (!src_argb || !dst_argb || !matrix_argb || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_argb = dst_stride_argb = 0; + } +#if defined(HAS_ARGBCOLORMATRIXROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) { + ARGBColorMatrixRow = ARGBColorMatrixRow_SSSE3; + } +#endif +#if defined(HAS_ARGBCOLORMATRIXROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { + ARGBColorMatrixRow = ARGBColorMatrixRow_NEON; + } +#endif +#if defined(HAS_ARGBCOLORMATRIXROW_MSA) + if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) { + ARGBColorMatrixRow = ARGBColorMatrixRow_MSA; + } +#endif + for (y = 0; y < height; ++y) { + ARGBColorMatrixRow(src_argb, dst_argb, matrix_argb, width); + src_argb += src_stride_argb; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Apply a 4x3 matrix to each ARGB pixel. +// Deprecated. +LIBYUV_API +int RGBColorMatrix(uint8_t* dst_argb, + int dst_stride_argb, + const int8_t* matrix_rgb, + int dst_x, + int dst_y, + int width, + int height) { + SIMD_ALIGNED(int8_t matrix_argb[16]); + uint8_t* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; + if (!dst_argb || !matrix_rgb || width <= 0 || height <= 0 || dst_x < 0 || + dst_y < 0) { + return -1; + } + + // Convert 4x3 7 bit matrix to 4x4 6 bit matrix. + matrix_argb[0] = matrix_rgb[0] / 2; + matrix_argb[1] = matrix_rgb[1] / 2; + matrix_argb[2] = matrix_rgb[2] / 2; + matrix_argb[3] = matrix_rgb[3] / 2; + matrix_argb[4] = matrix_rgb[4] / 2; + matrix_argb[5] = matrix_rgb[5] / 2; + matrix_argb[6] = matrix_rgb[6] / 2; + matrix_argb[7] = matrix_rgb[7] / 2; + matrix_argb[8] = matrix_rgb[8] / 2; + matrix_argb[9] = matrix_rgb[9] / 2; + matrix_argb[10] = matrix_rgb[10] / 2; + matrix_argb[11] = matrix_rgb[11] / 2; + matrix_argb[14] = matrix_argb[13] = matrix_argb[12] = 0; + matrix_argb[15] = 64; // 1.0 + + return ARGBColorMatrix((const uint8_t*)(dst), dst_stride_argb, dst, + dst_stride_argb, &matrix_argb[0], width, height); +} + +// Apply a color table each ARGB pixel. +// Table contains 256 ARGB values. 
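In RGBColorMatrix above, the deprecated 4x3 interface takes signed 7-bit fixed-point coefficients; halving each one converts it to the 6-bit scale ARGBColorMatrix uses, where 64 represents 1.0 per the comment in the code, and the appended alpha row {0, 0, 0, 64} passes the destination alpha through unchanged. As a small worked instance, an input coefficient of 126 becomes 63, roughly 0.98 on the 6-bit scale.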
+LIBYUV_API +int ARGBColorTable(uint8_t* dst_argb, + int dst_stride_argb, + const uint8_t* table_argb, + int dst_x, + int dst_y, + int width, + int height) { + int y; + void (*ARGBColorTableRow)(uint8_t * dst_argb, const uint8_t* table_argb, + int width) = ARGBColorTableRow_C; + uint8_t* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; + if (!dst_argb || !table_argb || width <= 0 || height <= 0 || dst_x < 0 || + dst_y < 0) { + return -1; + } + // Coalesce rows. + if (dst_stride_argb == width * 4) { + width *= height; + height = 1; + dst_stride_argb = 0; + } +#if defined(HAS_ARGBCOLORTABLEROW_X86) + if (TestCpuFlag(kCpuHasX86)) { + ARGBColorTableRow = ARGBColorTableRow_X86; + } +#endif + for (y = 0; y < height; ++y) { + ARGBColorTableRow(dst, table_argb, width); + dst += dst_stride_argb; + } + return 0; +} + +// Apply a color table each ARGB pixel but preserve destination alpha. +// Table contains 256 ARGB values. +LIBYUV_API +int RGBColorTable(uint8_t* dst_argb, + int dst_stride_argb, + const uint8_t* table_argb, + int dst_x, + int dst_y, + int width, + int height) { + int y; + void (*RGBColorTableRow)(uint8_t * dst_argb, const uint8_t* table_argb, + int width) = RGBColorTableRow_C; + uint8_t* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; + if (!dst_argb || !table_argb || width <= 0 || height <= 0 || dst_x < 0 || + dst_y < 0) { + return -1; + } + // Coalesce rows. + if (dst_stride_argb == width * 4) { + width *= height; + height = 1; + dst_stride_argb = 0; + } +#if defined(HAS_RGBCOLORTABLEROW_X86) + if (TestCpuFlag(kCpuHasX86)) { + RGBColorTableRow = RGBColorTableRow_X86; + } +#endif + for (y = 0; y < height; ++y) { + RGBColorTableRow(dst, table_argb, width); + dst += dst_stride_argb; + } + return 0; +} + +// ARGBQuantize is used to posterize art. +// e.g. rgb / qvalue * qvalue + qvalue / 2 +// But the low levels implement efficiently with 3 parameters, and could be +// used for other high level operations. +// dst_argb[0] = (b * scale >> 16) * interval_size + interval_offset; +// where scale is 1 / interval_size as a fixed point value. +// The divide is replaces with a multiply by reciprocal fixed point multiply. +// Caveat - although SSE2 saturates, the C function does not and should be used +// with care if doing anything but quantization. +LIBYUV_API +int ARGBQuantize(uint8_t* dst_argb, + int dst_stride_argb, + int scale, + int interval_size, + int interval_offset, + int dst_x, + int dst_y, + int width, + int height) { + int y; + void (*ARGBQuantizeRow)(uint8_t * dst_argb, int scale, int interval_size, + int interval_offset, int width) = ARGBQuantizeRow_C; + uint8_t* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; + if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0 || + interval_size < 1 || interval_size > 255) { + return -1; + } + // Coalesce rows. 
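A worked instance of the quantize formula documented above, with illustrative parameters: posterizing to steps of 32 would use interval_size = 32, interval_offset = 16 and scale = 65536 / 32 = 2048, so a channel value of 100 maps to:

  (100 * 2048) >> 16 = 3       // fixed-point divide by the interval size
  3 * 32 + 16        = 112     // snapped near the middle of its bucket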
+ if (dst_stride_argb == width * 4) { + width *= height; + height = 1; + dst_stride_argb = 0; + } +#if defined(HAS_ARGBQUANTIZEROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4)) { + ARGBQuantizeRow = ARGBQuantizeRow_SSE2; + } +#endif +#if defined(HAS_ARGBQUANTIZEROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { + ARGBQuantizeRow = ARGBQuantizeRow_NEON; + } +#endif +#if defined(HAS_ARGBQUANTIZEROW_MSA) + if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) { + ARGBQuantizeRow = ARGBQuantizeRow_MSA; + } +#endif + for (y = 0; y < height; ++y) { + ARGBQuantizeRow(dst, scale, interval_size, interval_offset, width); + dst += dst_stride_argb; + } + return 0; +} + +// Computes table of cumulative sum for image where the value is the sum +// of all values above and to the left of the entry. Used by ARGBBlur. +LIBYUV_API +int ARGBComputeCumulativeSum(const uint8_t* src_argb, + int src_stride_argb, + int32_t* dst_cumsum, + int dst_stride32_cumsum, + int width, + int height) { + int y; + void (*ComputeCumulativeSumRow)(const uint8_t* row, int32_t* cumsum, + const int32_t* previous_cumsum, int width) = + ComputeCumulativeSumRow_C; + int32_t* previous_cumsum = dst_cumsum; + if (!dst_cumsum || !src_argb || width <= 0 || height <= 0) { + return -1; + } +#if defined(HAS_CUMULATIVESUMTOAVERAGEROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ComputeCumulativeSumRow = ComputeCumulativeSumRow_SSE2; + } +#endif + memset(dst_cumsum, 0, width * sizeof(dst_cumsum[0]) * 4); // 4 int per pixel. + for (y = 0; y < height; ++y) { + ComputeCumulativeSumRow(src_argb, dst_cumsum, previous_cumsum, width); + previous_cumsum = dst_cumsum; + dst_cumsum += dst_stride32_cumsum; + src_argb += src_stride_argb; + } + return 0; +} + +// Blur ARGB image. +// Caller should allocate CumulativeSum table of width * height * 16 bytes +// aligned to 16 byte boundary. height can be radius * 2 + 2 to save memory +// as the buffer is treated as circular. +LIBYUV_API +int ARGBBlur(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int32_t* dst_cumsum, + int dst_stride32_cumsum, + int width, + int height, + int radius) { + int y; + void (*ComputeCumulativeSumRow)(const uint8_t* row, int32_t* cumsum, + const int32_t* previous_cumsum, int width) = + ComputeCumulativeSumRow_C; + void (*CumulativeSumToAverageRow)( + const int32_t* topleft, const int32_t* botleft, int width, int area, + uint8_t* dst, int count) = CumulativeSumToAverageRow_C; + int32_t* cumsum_bot_row; + int32_t* max_cumsum_bot_row; + int32_t* cumsum_top_row; + + if (!src_argb || !dst_argb || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + if (radius > height) { + radius = height; + } + if (radius > (width / 2 - 1)) { + radius = width / 2 - 1; + } + if (radius <= 0) { + return -1; + } +#if defined(HAS_CUMULATIVESUMTOAVERAGEROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ComputeCumulativeSumRow = ComputeCumulativeSumRow_SSE2; + CumulativeSumToAverageRow = CumulativeSumToAverageRow_SSE2; + } +#endif + // Compute enough CumulativeSum for first row to be blurred. After this + // one row of CumulativeSum is updated at a time. 
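The blur itself is a classic summed-area-table box filter: each output pixel averages a window roughly (2 * radius + 1) pixels on a side, clipped at the image edges, and the window sum comes from four corner lookups in the cumulative table instead of re-adding every pixel. In outline (the mapping onto CumulativeSumToAverageRow's arguments is approximate):

  sum(x0..x1, y0..y1) = C[y1][x1] - C[y1][x0] - C[y0][x1] + C[y0][x0]
  average             = sum / area

This is also why only radius * 2 + 2 cumulative rows need to stay live in the circular buffer, as the comment above ARGBBlur notes.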
+ ARGBComputeCumulativeSum(src_argb, src_stride_argb, dst_cumsum, + dst_stride32_cumsum, width, radius); + + src_argb = src_argb + radius * src_stride_argb; + cumsum_bot_row = &dst_cumsum[(radius - 1) * dst_stride32_cumsum]; + + max_cumsum_bot_row = &dst_cumsum[(radius * 2 + 2) * dst_stride32_cumsum]; + cumsum_top_row = &dst_cumsum[0]; + + for (y = 0; y < height; ++y) { + int top_y = ((y - radius - 1) >= 0) ? (y - radius - 1) : 0; + int bot_y = ((y + radius) < height) ? (y + radius) : (height - 1); + int area = radius * (bot_y - top_y); + int boxwidth = radius * 4; + int x; + int n; + + // Increment cumsum_top_row pointer with circular buffer wrap around. + if (top_y) { + cumsum_top_row += dst_stride32_cumsum; + if (cumsum_top_row >= max_cumsum_bot_row) { + cumsum_top_row = dst_cumsum; + } + } + // Increment cumsum_bot_row pointer with circular buffer wrap around and + // then fill in a row of CumulativeSum. + if ((y + radius) < height) { + const int32_t* prev_cumsum_bot_row = cumsum_bot_row; + cumsum_bot_row += dst_stride32_cumsum; + if (cumsum_bot_row >= max_cumsum_bot_row) { + cumsum_bot_row = dst_cumsum; + } + ComputeCumulativeSumRow(src_argb, cumsum_bot_row, prev_cumsum_bot_row, + width); + src_argb += src_stride_argb; + } + + // Left clipped. + for (x = 0; x < radius + 1; ++x) { + CumulativeSumToAverageRow(cumsum_top_row, cumsum_bot_row, boxwidth, area, + &dst_argb[x * 4], 1); + area += (bot_y - top_y); + boxwidth += 4; + } + + // Middle unclipped. + n = (width - 1) - radius - x + 1; + CumulativeSumToAverageRow(cumsum_top_row, cumsum_bot_row, boxwidth, area, + &dst_argb[x * 4], n); + + // Right clipped. + for (x += n; x <= width - 1; ++x) { + area -= (bot_y - top_y); + boxwidth -= 4; + CumulativeSumToAverageRow(cumsum_top_row + (x - radius - 1) * 4, + cumsum_bot_row + (x - radius - 1) * 4, boxwidth, + area, &dst_argb[x * 4], 1); + } + dst_argb += dst_stride_argb; + } + return 0; +} + +// Multiply ARGB image by a specified ARGB value. +LIBYUV_API +int ARGBShade(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height, + uint32_t value) { + int y; + void (*ARGBShadeRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width, + uint32_t value) = ARGBShadeRow_C; + if (!src_argb || !dst_argb || width <= 0 || height == 0 || value == 0u) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_argb = dst_stride_argb = 0; + } +#if defined(HAS_ARGBSHADEROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4)) { + ARGBShadeRow = ARGBShadeRow_SSE2; + } +#endif +#if defined(HAS_ARGBSHADEROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { + ARGBShadeRow = ARGBShadeRow_NEON; + } +#endif +#if defined(HAS_ARGBSHADEROW_MSA) + if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 4)) { + ARGBShadeRow = ARGBShadeRow_MSA; + } +#endif + + for (y = 0; y < height; ++y) { + ARGBShadeRow(src_argb, dst_argb, width, value); + src_argb += src_stride_argb; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Interpolate 2 planes by specified amount (0 to 255). 
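+// Illustrative usage (not in the original libyuv source; buffer names are
+// hypothetical): interpolation == 0 returns src0, 255 is (nearly) src1, and
+// 128 gives an even blend. e.g. cross-fading two 320x240 Y planes halfway:
+//   InterpolatePlane(frame_a, 320, frame_b, 320, out, 320, 320, 240, 128);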
+LIBYUV_API +int InterpolatePlane(const uint8_t* src0, + int src_stride0, + const uint8_t* src1, + int src_stride1, + uint8_t* dst, + int dst_stride, + int width, + int height, + int interpolation) { + int y; + void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) = InterpolateRow_C; + if (!src0 || !src1 || !dst || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst = dst + (height - 1) * dst_stride; + dst_stride = -dst_stride; + } + // Coalesce rows. + if (src_stride0 == width && src_stride1 == width && dst_stride == width) { + width *= height; + height = 1; + src_stride0 = src_stride1 = dst_stride = 0; + } +#if defined(HAS_INTERPOLATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + InterpolateRow = InterpolateRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + InterpolateRow = InterpolateRow_SSSE3; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + InterpolateRow = InterpolateRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + InterpolateRow = InterpolateRow_AVX2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + InterpolateRow = InterpolateRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + InterpolateRow = InterpolateRow_NEON; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + InterpolateRow = InterpolateRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + InterpolateRow = InterpolateRow_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + InterpolateRow(dst, src0, src1 - src0, width, interpolation); + src0 += src_stride0; + src1 += src_stride1; + dst += dst_stride; + } + return 0; +} + +// Interpolate 2 ARGB images by specified amount (0 to 255). +LIBYUV_API +int ARGBInterpolate(const uint8_t* src_argb0, + int src_stride_argb0, + const uint8_t* src_argb1, + int src_stride_argb1, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height, + int interpolation) { + return InterpolatePlane(src_argb0, src_stride_argb0, src_argb1, + src_stride_argb1, dst_argb, dst_stride_argb, + width * 4, height, interpolation); +} + +// Interpolate 2 YUV images by specified amount (0 to 255). +LIBYUV_API +int I420Interpolate(const uint8_t* src0_y, + int src0_stride_y, + const uint8_t* src0_u, + int src0_stride_u, + const uint8_t* src0_v, + int src0_stride_v, + const uint8_t* src1_y, + int src1_stride_y, + const uint8_t* src1_u, + int src1_stride_u, + const uint8_t* src1_v, + int src1_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height, + int interpolation) { + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + if (!src0_y || !src0_u || !src0_v || !src1_y || !src1_u || !src1_v || + !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } + InterpolatePlane(src0_y, src0_stride_y, src1_y, src1_stride_y, dst_y, + dst_stride_y, width, height, interpolation); + InterpolatePlane(src0_u, src0_stride_u, src1_u, src1_stride_u, dst_u, + dst_stride_u, halfwidth, halfheight, interpolation); + InterpolatePlane(src0_v, src0_stride_v, src1_v, src1_stride_v, dst_v, + dst_stride_v, halfwidth, halfheight, interpolation); + return 0; +} + +// Shuffle ARGB channel order. e.g. BGRA to ARGB. 
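+// Illustrative usage (not in the original libyuv source; the table name is
+// hypothetical): the shuffler is a table of source byte indices applied to
+// every 4-byte pixel, normally repeated to 16 bytes for the SIMD paths.
+// Reversing each pixel's bytes converts BGRA to ARGB:
+//   static const uint8_t kBGRAToARGB[16] = {3,  2,  1, 0, 7,  6,  5,  4,
+//                                           11, 10, 9, 8, 15, 14, 13, 12};
+//   ARGBShuffle(src_bgra, width * 4, dst_argb, width * 4, kBGRAToARGB,
+//               width, height);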
+LIBYUV_API +int ARGBShuffle(const uint8_t* src_bgra, + int src_stride_bgra, + uint8_t* dst_argb, + int dst_stride_argb, + const uint8_t* shuffler, + int width, + int height) { + int y; + void (*ARGBShuffleRow)(const uint8_t* src_bgra, uint8_t* dst_argb, + const uint8_t* shuffler, int width) = ARGBShuffleRow_C; + if (!src_bgra || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_bgra = src_bgra + (height - 1) * src_stride_bgra; + src_stride_bgra = -src_stride_bgra; + } + // Coalesce rows. + if (src_stride_bgra == width * 4 && dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_bgra = dst_stride_argb = 0; + } +#if defined(HAS_ARGBSHUFFLEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBShuffleRow = ARGBShuffleRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + ARGBShuffleRow = ARGBShuffleRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBSHUFFLEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBShuffleRow = ARGBShuffleRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + ARGBShuffleRow = ARGBShuffleRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBSHUFFLEROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBShuffleRow = ARGBShuffleRow_Any_NEON; + if (IS_ALIGNED(width, 4)) { + ARGBShuffleRow = ARGBShuffleRow_NEON; + } + } +#endif +#if defined(HAS_ARGBSHUFFLEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBShuffleRow = ARGBShuffleRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + ARGBShuffleRow = ARGBShuffleRow_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGBShuffleRow(src_bgra, dst_argb, shuffler, width); + src_bgra += src_stride_bgra; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Sobel ARGB effect. +static int ARGBSobelize(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height, + void (*SobelRow)(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst, + int width)) { + int y; + void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_g, int width) = + ARGBToYJRow_C; + void (*SobelYRow)(const uint8_t* src_y0, const uint8_t* src_y1, + uint8_t* dst_sobely, int width) = SobelYRow_C; + void (*SobelXRow)(const uint8_t* src_y0, const uint8_t* src_y1, + const uint8_t* src_y2, uint8_t* dst_sobely, int width) = + SobelXRow_C; + const int kEdge = 16; // Extra pixels at start of row for extrude/align. + if (!src_argb || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
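+  // Illustrative note (not in the original libyuv source): this convention is
+  // shared by most functions in this file, e.g. calling ARGBSobel with
+  // -height reads the source bottom-up and so emits a vertically flipped
+  // result without a separate flip pass.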
+ if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + +#if defined(HAS_ARGBTOYJROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToYJRow = ARGBToYJRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToYJRow = ARGBToYJRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYJROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToYJRow = ARGBToYJRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToYJRow = ARGBToYJRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOYJROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToYJRow = ARGBToYJRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToYJRow = ARGBToYJRow_NEON; + } + } +#endif +#if defined(HAS_ARGBTOYJROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToYJRow = ARGBToYJRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToYJRow = ARGBToYJRow_MSA; + } + } +#endif + +#if defined(HAS_SOBELYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + SobelYRow = SobelYRow_SSE2; + } +#endif +#if defined(HAS_SOBELYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + SobelYRow = SobelYRow_NEON; + } +#endif +#if defined(HAS_SOBELYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + SobelYRow = SobelYRow_MSA; + } +#endif +#if defined(HAS_SOBELXROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + SobelXRow = SobelXRow_SSE2; + } +#endif +#if defined(HAS_SOBELXROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + SobelXRow = SobelXRow_NEON; + } +#endif +#if defined(HAS_SOBELXROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + SobelXRow = SobelXRow_MSA; + } +#endif + { + // 3 rows with edges before/after. + const int kRowSize = (width + kEdge + 31) & ~31; + align_buffer_64(rows, kRowSize * 2 + (kEdge + kRowSize * 3 + kEdge)); + uint8_t* row_sobelx = rows; + uint8_t* row_sobely = rows + kRowSize; + uint8_t* row_y = rows + kRowSize * 2; + + // Convert first row. + uint8_t* row_y0 = row_y + kEdge; + uint8_t* row_y1 = row_y0 + kRowSize; + uint8_t* row_y2 = row_y1 + kRowSize; + ARGBToYJRow(src_argb, row_y0, width); + row_y0[-1] = row_y0[0]; + memset(row_y0 + width, row_y0[width - 1], 16); // Extrude 16 for valgrind. + ARGBToYJRow(src_argb, row_y1, width); + row_y1[-1] = row_y1[0]; + memset(row_y1 + width, row_y1[width - 1], 16); + memset(row_y2 + width, 0, 16); + + for (y = 0; y < height; ++y) { + // Convert next row of ARGB to G. + if (y < (height - 1)) { + src_argb += src_stride_argb; + } + ARGBToYJRow(src_argb, row_y2, width); + row_y2[-1] = row_y2[0]; + row_y2[width] = row_y2[width - 1]; + + SobelXRow(row_y0 - 1, row_y1 - 1, row_y2 - 1, row_sobelx, width); + SobelYRow(row_y0 - 1, row_y2 - 1, row_sobely, width); + SobelRow(row_sobelx, row_sobely, dst_argb, width); + + // Cycle thru circular queue of 3 row_y buffers. + { + uint8_t* row_yt = row_y0; + row_y0 = row_y1; + row_y1 = row_y2; + row_y2 = row_yt; + } + + dst_argb += dst_stride_argb; + } + free_aligned_buffer_64(rows); + } + return 0; +} + +// Sobel ARGB effect. 
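+// Illustrative sketch (not in the original libyuv source): SobelXRow and
+// SobelYRow apply the usual 3x3 gradient kernels to the luma plane,
+//   gx: {-1, 0, 1, -2, 0, 2, -1, 0, 1}   gy: {-1, -2, -1, 0, 0, 0, 1, 2, 1}
+// and SobelRow then packs the clamped magnitude, roughly
+// min(255, |gx| + |gy|), into the B, G and R channels with alpha set to 255.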
+LIBYUV_API +int ARGBSobel(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + void (*SobelRow)(const uint8_t* src_sobelx, const uint8_t* src_sobely, + uint8_t* dst_argb, int width) = SobelRow_C; +#if defined(HAS_SOBELROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + SobelRow = SobelRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + SobelRow = SobelRow_SSE2; + } + } +#endif +#if defined(HAS_SOBELROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + SobelRow = SobelRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + SobelRow = SobelRow_NEON; + } + } +#endif +#if defined(HAS_SOBELROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + SobelRow = SobelRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + SobelRow = SobelRow_MSA; + } + } +#endif + return ARGBSobelize(src_argb, src_stride_argb, dst_argb, dst_stride_argb, + width, height, SobelRow); +} + +// Sobel ARGB effect with planar output. +LIBYUV_API +int ARGBSobelToPlane(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_y, + int dst_stride_y, + int width, + int height) { + void (*SobelToPlaneRow)(const uint8_t* src_sobelx, const uint8_t* src_sobely, + uint8_t* dst_, int width) = SobelToPlaneRow_C; +#if defined(HAS_SOBELTOPLANEROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + SobelToPlaneRow = SobelToPlaneRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + SobelToPlaneRow = SobelToPlaneRow_SSE2; + } + } +#endif +#if defined(HAS_SOBELTOPLANEROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + SobelToPlaneRow = SobelToPlaneRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + SobelToPlaneRow = SobelToPlaneRow_NEON; + } + } +#endif +#if defined(HAS_SOBELTOPLANEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + SobelToPlaneRow = SobelToPlaneRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + SobelToPlaneRow = SobelToPlaneRow_MSA; + } + } +#endif + return ARGBSobelize(src_argb, src_stride_argb, dst_y, dst_stride_y, width, + height, SobelToPlaneRow); +} + +// SobelXY ARGB effect. +// Similar to Sobel, but also stores Sobel X in R and Sobel Y in B. G = Sobel. +LIBYUV_API +int ARGBSobelXY(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + void (*SobelXYRow)(const uint8_t* src_sobelx, const uint8_t* src_sobely, + uint8_t* dst_argb, int width) = SobelXYRow_C; +#if defined(HAS_SOBELXYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + SobelXYRow = SobelXYRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + SobelXYRow = SobelXYRow_SSE2; + } + } +#endif +#if defined(HAS_SOBELXYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + SobelXYRow = SobelXYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + SobelXYRow = SobelXYRow_NEON; + } + } +#endif +#if defined(HAS_SOBELXYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + SobelXYRow = SobelXYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + SobelXYRow = SobelXYRow_MSA; + } + } +#endif + return ARGBSobelize(src_argb, src_stride_argb, dst_argb, dst_stride_argb, + width, height, SobelXYRow); +} + +// Apply a 4x4 polynomial to each ARGB pixel. +LIBYUV_API +int ARGBPolynomial(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + const float* poly, + int width, + int height) { + int y; + void (*ARGBPolynomialRow)(const uint8_t* src_argb, uint8_t* dst_argb, + const float* poly, int width) = ARGBPolynomialRow_C; + if (!src_argb || !dst_argb || !poly || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
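+  // Illustrative note (not in the original libyuv source): poly supplies 16
+  // floats forming a cubic per channel, roughly
+  //   out = c0 + c1 * v + c2 * v * v + c3 * v * v * v
+  // evaluated on the 0..255 byte value, so an identity table uses c0 = 0,
+  // c1 = 1, c2 = c3 = 0 for each channel; see ARGBPolynomialRow_C for the
+  // exact coefficient layout.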
+ if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_argb = dst_stride_argb = 0; + } +#if defined(HAS_ARGBPOLYNOMIALROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 2)) { + ARGBPolynomialRow = ARGBPolynomialRow_SSE2; + } +#endif +#if defined(HAS_ARGBPOLYNOMIALROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && TestCpuFlag(kCpuHasFMA3) && + IS_ALIGNED(width, 2)) { + ARGBPolynomialRow = ARGBPolynomialRow_AVX2; + } +#endif + + for (y = 0; y < height; ++y) { + ARGBPolynomialRow(src_argb, dst_argb, poly, width); + src_argb += src_stride_argb; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Convert plane of 16 bit shorts to half floats. +// Source values are multiplied by scale before storing as half float. +LIBYUV_API +int HalfFloatPlane(const uint16_t* src_y, + int src_stride_y, + uint16_t* dst_y, + int dst_stride_y, + float scale, + int width, + int height) { + int y; + void (*HalfFloatRow)(const uint16_t* src, uint16_t* dst, float scale, + int width) = HalfFloatRow_C; + if (!src_y || !dst_y || width <= 0 || height == 0) { + return -1; + } + src_stride_y >>= 1; + dst_stride_y >>= 1; + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_stride_y = -src_stride_y; + } + // Coalesce rows. + if (src_stride_y == width && dst_stride_y == width) { + width *= height; + height = 1; + src_stride_y = dst_stride_y = 0; + } +#if defined(HAS_HALFFLOATROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + HalfFloatRow = HalfFloatRow_Any_SSE2; + if (IS_ALIGNED(width, 8)) { + HalfFloatRow = HalfFloatRow_SSE2; + } + } +#endif +#if defined(HAS_HALFFLOATROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + HalfFloatRow = HalfFloatRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + HalfFloatRow = HalfFloatRow_AVX2; + } + } +#endif +#if defined(HAS_HALFFLOATROW_F16C) + if (TestCpuFlag(kCpuHasAVX2) && TestCpuFlag(kCpuHasF16C)) { + HalfFloatRow = + (scale == 1.0f) ? HalfFloat1Row_Any_F16C : HalfFloatRow_Any_F16C; + if (IS_ALIGNED(width, 16)) { + HalfFloatRow = (scale == 1.0f) ? HalfFloat1Row_F16C : HalfFloatRow_F16C; + } + } +#endif +#if defined(HAS_HALFFLOATROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + HalfFloatRow = + (scale == 1.0f) ? HalfFloat1Row_Any_NEON : HalfFloatRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + HalfFloatRow = (scale == 1.0f) ? HalfFloat1Row_NEON : HalfFloatRow_NEON; + } + } +#endif +#if defined(HAS_HALFFLOATROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + HalfFloatRow = HalfFloatRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + HalfFloatRow = HalfFloatRow_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + HalfFloatRow(src_y, dst_y, scale, width); + src_y += src_stride_y; + dst_y += dst_stride_y; + } + return 0; +} + +// Convert a buffer of bytes to floats, scale the values and store as floats. 
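+// Illustrative usage (not in the original libyuv source): scale is applied
+// after the byte to float conversion, so 1.0f / 255.0f maps 0..255 onto
+// 0.0..1.0:
+//   ByteToFloat(src_bytes, dst_floats, 1.0f / 255.0f, width);
+// HalfFloatPlane above works the same way, e.g. a scale of 1.0f / 1023.0f
+// normalizes 10-bit samples before they are stored as half floats.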
+LIBYUV_API +int ByteToFloat(const uint8_t* src_y, float* dst_y, float scale, int width) { + void (*ByteToFloatRow)(const uint8_t* src, float* dst, float scale, + int width) = ByteToFloatRow_C; + if (!src_y || !dst_y || width <= 0) { + return -1; + } +#if defined(HAS_BYTETOFLOATROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ByteToFloatRow = ByteToFloatRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ByteToFloatRow = ByteToFloatRow_NEON; + } + } +#endif + + ByteToFloatRow(src_y, dst_y, scale, width); + return 0; +} + +// Apply a lumacolortable to each ARGB pixel. +LIBYUV_API +int ARGBLumaColorTable(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + const uint8_t* luma, + int width, + int height) { + int y; + void (*ARGBLumaColorTableRow)( + const uint8_t* src_argb, uint8_t* dst_argb, int width, + const uint8_t* luma, const uint32_t lumacoeff) = ARGBLumaColorTableRow_C; + if (!src_argb || !dst_argb || !luma || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_argb = dst_stride_argb = 0; + } +#if defined(HAS_ARGBLUMACOLORTABLEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4)) { + ARGBLumaColorTableRow = ARGBLumaColorTableRow_SSSE3; + } +#endif + + for (y = 0; y < height; ++y) { + ARGBLumaColorTableRow(src_argb, dst_argb, width, luma, 0x00264b0f); + src_argb += src_stride_argb; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Copy Alpha from one ARGB image to another. +LIBYUV_API +int ARGBCopyAlpha(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + int y; + void (*ARGBCopyAlphaRow)(const uint8_t* src_argb, uint8_t* dst_argb, + int width) = ARGBCopyAlphaRow_C; + if (!src_argb || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_argb = dst_stride_argb = 0; + } +#if defined(HAS_ARGBCOPYALPHAROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ARGBCopyAlphaRow = ARGBCopyAlphaRow_Any_SSE2; + if (IS_ALIGNED(width, 8)) { + ARGBCopyAlphaRow = ARGBCopyAlphaRow_SSE2; + } + } +#endif +#if defined(HAS_ARGBCOPYALPHAROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBCopyAlphaRow = ARGBCopyAlphaRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + ARGBCopyAlphaRow = ARGBCopyAlphaRow_AVX2; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGBCopyAlphaRow(src_argb, dst_argb, width); + src_argb += src_stride_argb; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Extract just the alpha channel from ARGB. +LIBYUV_API +int ARGBExtractAlpha(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_a, + int dst_stride_a, + int width, + int height) { + if (!src_argb || !dst_a || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb += (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. 
+ if (src_stride_argb == width * 4 && dst_stride_a == width) { + width *= height; + height = 1; + src_stride_argb = dst_stride_a = 0; + } + void (*ARGBExtractAlphaRow)(const uint8_t* src_argb, uint8_t* dst_a, + int width) = ARGBExtractAlphaRow_C; +#if defined(HAS_ARGBEXTRACTALPHAROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ARGBExtractAlphaRow = IS_ALIGNED(width, 8) ? ARGBExtractAlphaRow_SSE2 + : ARGBExtractAlphaRow_Any_SSE2; + } +#endif +#if defined(HAS_ARGBEXTRACTALPHAROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBExtractAlphaRow = IS_ALIGNED(width, 32) ? ARGBExtractAlphaRow_AVX2 + : ARGBExtractAlphaRow_Any_AVX2; + } +#endif +#if defined(HAS_ARGBEXTRACTALPHAROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBExtractAlphaRow = IS_ALIGNED(width, 16) ? ARGBExtractAlphaRow_NEON + : ARGBExtractAlphaRow_Any_NEON; + } +#endif +#if defined(HAS_ARGBEXTRACTALPHAROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBExtractAlphaRow = IS_ALIGNED(width, 16) ? ARGBExtractAlphaRow_MSA + : ARGBExtractAlphaRow_Any_MSA; + } +#endif + + for (int y = 0; y < height; ++y) { + ARGBExtractAlphaRow(src_argb, dst_a, width); + src_argb += src_stride_argb; + dst_a += dst_stride_a; + } + return 0; +} + +// Copy a planar Y channel to the alpha channel of a destination ARGB image. +LIBYUV_API +int ARGBCopyYToAlpha(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + int y; + void (*ARGBCopyYToAlphaRow)(const uint8_t* src_y, uint8_t* dst_argb, + int width) = ARGBCopyYToAlphaRow_C; + if (!src_y || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_stride_y = -src_stride_y; + } + // Coalesce rows. + if (src_stride_y == width && dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_y = dst_stride_argb = 0; + } +#if defined(HAS_ARGBCOPYYTOALPHAROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_Any_SSE2; + if (IS_ALIGNED(width, 8)) { + ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_SSE2; + } + } +#endif +#if defined(HAS_ARGBCOPYYTOALPHAROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_AVX2; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGBCopyYToAlphaRow(src_y, dst_argb, width); + src_y += src_stride_y; + dst_argb += dst_stride_argb; + } + return 0; +} + +// TODO(fbarchard): Consider if width is even Y channel can be split +// directly. A SplitUVRow_Odd function could copy the remaining chroma. + +LIBYUV_API +int YUY2ToNV12(const uint8_t* src_yuy2, + int src_stride_yuy2, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_uv, + int dst_stride_uv, + int width, + int height) { + int y; + int halfwidth = (width + 1) >> 1; + void (*SplitUVRow)(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, + int width) = SplitUVRow_C; + void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) = InterpolateRow_C; + if (!src_yuy2 || !dst_y || !dst_uv || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
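+  // Illustrative sketch of the approach (not in the original libyuv source):
+  // a YUY2 row stores Y0 U0 Y1 V0 for each pair of pixels. SplitUVRow below
+  // separates it into a Y row and an interleaved UV row, and the UV rows of
+  // two consecutive source rows are averaged with InterpolateRow(..., 128) to
+  // form one NV12 UV row, giving the 2x2 chroma subsampling NV12 expects.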
+ if (height < 0) { + height = -height; + src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2; + src_stride_yuy2 = -src_stride_yuy2; + } +#if defined(HAS_SPLITUVROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + SplitUVRow = SplitUVRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + SplitUVRow = SplitUVRow_SSE2; + } + } +#endif +#if defined(HAS_SPLITUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + SplitUVRow = SplitUVRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + SplitUVRow = SplitUVRow_AVX2; + } + } +#endif +#if defined(HAS_SPLITUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + SplitUVRow = SplitUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + SplitUVRow = SplitUVRow_NEON; + } + } +#endif +#if defined(HAS_SPLITUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + SplitUVRow = SplitUVRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + SplitUVRow = SplitUVRow_MSA; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + InterpolateRow = InterpolateRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + InterpolateRow = InterpolateRow_SSSE3; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + InterpolateRow = InterpolateRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + InterpolateRow = InterpolateRow_AVX2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + InterpolateRow = InterpolateRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + InterpolateRow = InterpolateRow_NEON; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + InterpolateRow = InterpolateRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + InterpolateRow = InterpolateRow_MSA; + } + } +#endif + + { + int awidth = halfwidth * 2; + // row of y and 2 rows of uv + align_buffer_64(rows, awidth * 3); + + for (y = 0; y < height - 1; y += 2) { + // Split Y from UV. + SplitUVRow(src_yuy2, rows, rows + awidth, awidth); + memcpy(dst_y, rows, width); + SplitUVRow(src_yuy2 + src_stride_yuy2, rows, rows + awidth * 2, awidth); + memcpy(dst_y + dst_stride_y, rows, width); + InterpolateRow(dst_uv, rows + awidth, awidth, awidth, 128); + src_yuy2 += src_stride_yuy2 * 2; + dst_y += dst_stride_y * 2; + dst_uv += dst_stride_uv; + } + if (height & 1) { + // Split Y from UV. + SplitUVRow(src_yuy2, rows, dst_uv, awidth); + memcpy(dst_y, rows, width); + } + free_aligned_buffer_64(rows); + } + return 0; +} + +LIBYUV_API +int UYVYToNV12(const uint8_t* src_uyvy, + int src_stride_uyvy, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_uv, + int dst_stride_uv, + int width, + int height) { + int y; + int halfwidth = (width + 1) >> 1; + void (*SplitUVRow)(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, + int width) = SplitUVRow_C; + void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) = InterpolateRow_C; + if (!src_uyvy || !dst_y || !dst_uv || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
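+  // Illustrative note (not in the original libyuv source): UYVY stores
+  // U0 Y0 V0 Y1, i.e. chroma first, so the SplitUVRow calls below swap their
+  // two output pointers relative to YUY2ToNV12: the first output receives the
+  // UV bytes and the second receives the Y bytes.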
+ if (height < 0) { + height = -height; + src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy; + src_stride_uyvy = -src_stride_uyvy; + } +#if defined(HAS_SPLITUVROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + SplitUVRow = SplitUVRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + SplitUVRow = SplitUVRow_SSE2; + } + } +#endif +#if defined(HAS_SPLITUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + SplitUVRow = SplitUVRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + SplitUVRow = SplitUVRow_AVX2; + } + } +#endif +#if defined(HAS_SPLITUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + SplitUVRow = SplitUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + SplitUVRow = SplitUVRow_NEON; + } + } +#endif +#if defined(HAS_SPLITUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + SplitUVRow = SplitUVRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + SplitUVRow = SplitUVRow_MSA; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + InterpolateRow = InterpolateRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + InterpolateRow = InterpolateRow_SSSE3; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + InterpolateRow = InterpolateRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + InterpolateRow = InterpolateRow_AVX2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + InterpolateRow = InterpolateRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + InterpolateRow = InterpolateRow_NEON; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + InterpolateRow = InterpolateRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + InterpolateRow = InterpolateRow_MSA; + } + } +#endif + + { + int awidth = halfwidth * 2; + // row of y and 2 rows of uv + align_buffer_64(rows, awidth * 3); + + for (y = 0; y < height - 1; y += 2) { + // Split Y from UV. + SplitUVRow(src_uyvy, rows + awidth, rows, awidth); + memcpy(dst_y, rows, width); + SplitUVRow(src_uyvy + src_stride_uyvy, rows + awidth * 2, rows, awidth); + memcpy(dst_y + dst_stride_y, rows, width); + InterpolateRow(dst_uv, rows + awidth, awidth, awidth, 128); + src_uyvy += src_stride_uyvy * 2; + dst_y += dst_stride_y * 2; + dst_uv += dst_stride_uv; + } + if (height & 1) { + // Split Y from UV. + SplitUVRow(src_uyvy, dst_uv, rows, awidth); + memcpy(dst_y, rows, width); + } + free_aligned_buffer_64(rows); + } + return 0; +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/media/libyuv/libyuv/source/rotate.cc b/media/libyuv/libyuv/source/rotate.cc new file mode 100644 index 0000000000..f2bed85b75 --- /dev/null +++ b/media/libyuv/libyuv/source/rotate.cc @@ -0,0 +1,514 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "libyuv/rotate.h" + +#include "libyuv/convert.h" +#include "libyuv/cpu_id.h" +#include "libyuv/planar_functions.h" +#include "libyuv/rotate_row.h" +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +LIBYUV_API +void TransposePlane(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width, + int height) { + int i = height; +#if defined(HAS_TRANSPOSEWX16_MSA) + void (*TransposeWx16)(const uint8_t* src, int src_stride, uint8_t* dst, + int dst_stride, int width) = TransposeWx16_C; +#else + void (*TransposeWx8)(const uint8_t* src, int src_stride, uint8_t* dst, + int dst_stride, int width) = TransposeWx8_C; +#endif +#if defined(HAS_TRANSPOSEWX8_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + TransposeWx8 = TransposeWx8_NEON; + } +#endif +#if defined(HAS_TRANSPOSEWX8_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + TransposeWx8 = TransposeWx8_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + TransposeWx8 = TransposeWx8_SSSE3; + } + } +#endif +#if defined(HAS_TRANSPOSEWX8_FAST_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + TransposeWx8 = TransposeWx8_Fast_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + TransposeWx8 = TransposeWx8_Fast_SSSE3; + } + } +#endif +#if defined(HAS_TRANSPOSEWX16_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + TransposeWx16 = TransposeWx16_Any_MSA; + if (IS_ALIGNED(width, 16)) { + TransposeWx16 = TransposeWx16_MSA; + } + } +#endif + +#if defined(HAS_TRANSPOSEWX16_MSA) + // Work across the source in 16x16 tiles + while (i >= 16) { + TransposeWx16(src, src_stride, dst, dst_stride, width); + src += 16 * src_stride; // Go down 16 rows. + dst += 16; // Move over 16 columns. + i -= 16; + } +#else + // Work across the source in 8x8 tiles + while (i >= 8) { + TransposeWx8(src, src_stride, dst, dst_stride, width); + src += 8 * src_stride; // Go down 8 rows. + dst += 8; // Move over 8 columns. + i -= 8; + } +#endif + + if (i > 0) { + TransposeWxH_C(src, src_stride, dst, dst_stride, width, i); + } +} + +LIBYUV_API +void RotatePlane90(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width, + int height) { + // Rotate by 90 is a transpose with the source read + // from bottom to top. So set the source pointer to the end + // of the buffer and flip the sign of the source stride. + src += src_stride * (height - 1); + src_stride = -src_stride; + TransposePlane(src, src_stride, dst, dst_stride, width, height); +} + +LIBYUV_API +void RotatePlane270(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width, + int height) { + // Rotate by 270 is a transpose with the destination written + // from bottom to top. So set the destination pointer to the end + // of the buffer and flip the sign of the destination stride. + dst += dst_stride * (width - 1); + dst_stride = -dst_stride; + TransposePlane(src, src_stride, dst, dst_stride, width, height); +} + +LIBYUV_API +void RotatePlane180(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width, + int height) { + // Swap first and last row and mirror the content. Uses a temporary row. 
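+  // Illustrative note (not in the original libyuv source): a 180 degree
+  // rotation is a vertical flip combined with a horizontal mirror. e.g. for
+  // height 5 the loop below pairs rows 0<->4 and 1<->3, mirroring each row as
+  // it is copied, and the middle row 2 is written twice, harmlessly.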
+  align_buffer_64(row, width);
+  const uint8_t* src_bot = src + src_stride * (height - 1);
+  uint8_t* dst_bot = dst + dst_stride * (height - 1);
+  int half_height = (height + 1) >> 1;
+  int y;
+  void (*MirrorRow)(const uint8_t* src, uint8_t* dst, int width) = MirrorRow_C;
+  void (*CopyRow)(const uint8_t* src, uint8_t* dst, int width) = CopyRow_C;
+#if defined(HAS_MIRRORROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    MirrorRow = MirrorRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      MirrorRow = MirrorRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_MIRRORROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    MirrorRow = MirrorRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      MirrorRow = MirrorRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_MIRRORROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    MirrorRow = MirrorRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      MirrorRow = MirrorRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_MIRRORROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    MirrorRow = MirrorRow_Any_MSA;
+    if (IS_ALIGNED(width, 64)) {
+      MirrorRow = MirrorRow_MSA;
+    }
+  }
+#endif
+#if defined(HAS_COPYROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
+  }
+#endif
+#if defined(HAS_COPYROW_AVX)
+  if (TestCpuFlag(kCpuHasAVX)) {
+    CopyRow = IS_ALIGNED(width, 64) ? CopyRow_AVX : CopyRow_Any_AVX;
+  }
+#endif
+#if defined(HAS_COPYROW_ERMS)
+  if (TestCpuFlag(kCpuHasERMS)) {
+    CopyRow = CopyRow_ERMS;
+  }
+#endif
+#if defined(HAS_COPYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
+  }
+#endif
+
+  // Odd height will harmlessly mirror the middle row twice.
+  for (y = 0; y < half_height; ++y) {
+    MirrorRow(src, row, width);  // Mirror first row into a buffer
+    src += src_stride;
+    MirrorRow(src_bot, dst, width);  // Mirror last row into first row
+    dst += dst_stride;
+    CopyRow(row, dst_bot, width);  // Copy first mirrored row into last
+    src_bot -= src_stride;
+    dst_bot -= dst_stride;
+  }
+  free_aligned_buffer_64(row);
+}
+
+LIBYUV_API
+void TransposeUV(const uint8_t* src,
+                 int src_stride,
+                 uint8_t* dst_a,
+                 int dst_stride_a,
+                 uint8_t* dst_b,
+                 int dst_stride_b,
+                 int width,
+                 int height) {
+  int i = height;
+#if defined(HAS_TRANSPOSEUVWX16_MSA)
+  void (*TransposeUVWx16)(const uint8_t* src, int src_stride, uint8_t* dst_a,
+                          int dst_stride_a, uint8_t* dst_b, int dst_stride_b,
+                          int width) = TransposeUVWx16_C;
+#else
+  void (*TransposeUVWx8)(const uint8_t* src, int src_stride, uint8_t* dst_a,
+                         int dst_stride_a, uint8_t* dst_b, int dst_stride_b,
+                         int width) = TransposeUVWx8_C;
+#endif
+#if defined(HAS_TRANSPOSEUVWX8_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    TransposeUVWx8 = TransposeUVWx8_NEON;
+  }
+#endif
+#if defined(HAS_TRANSPOSEUVWX8_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    TransposeUVWx8 = TransposeUVWx8_Any_SSE2;
+    if (IS_ALIGNED(width, 8)) {
+      TransposeUVWx8 = TransposeUVWx8_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_TRANSPOSEUVWX16_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    TransposeUVWx16 = TransposeUVWx16_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      TransposeUVWx16 = TransposeUVWx16_MSA;
+    }
+  }
+#endif
+
+#if defined(HAS_TRANSPOSEUVWX16_MSA)
+  // Work through the source in 16x16 tiles.
+  while (i >= 16) {
+    TransposeUVWx16(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b,
+                    width);
+    src += 16 * src_stride;  // Go down 16 rows.
+    dst_a += 16;             // Move over 16 columns.
+    dst_b += 16;             // Move over 16 columns.
+    i -= 16;
+  }
+#else
+  // Work through the source in 8x8 tiles.
+ while (i >= 8) { + TransposeUVWx8(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, + width); + src += 8 * src_stride; // Go down 8 rows. + dst_a += 8; // Move over 8 columns. + dst_b += 8; // Move over 8 columns. + i -= 8; + } +#endif + + if (i > 0) { + TransposeUVWxH_C(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, + width, i); + } +} + +LIBYUV_API +void RotateUV90(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width, + int height) { + src += src_stride * (height - 1); + src_stride = -src_stride; + + TransposeUV(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, width, + height); +} + +LIBYUV_API +void RotateUV270(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width, + int height) { + dst_a += dst_stride_a * (width - 1); + dst_b += dst_stride_b * (width - 1); + dst_stride_a = -dst_stride_a; + dst_stride_b = -dst_stride_b; + + TransposeUV(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, width, + height); +} + +// Rotate 180 is a horizontal and vertical flip. +LIBYUV_API +void RotateUV180(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width, + int height) { + int i; + void (*MirrorUVRow)(const uint8_t* src, uint8_t* dst_u, uint8_t* dst_v, + int width) = MirrorUVRow_C; +#if defined(HAS_MIRRORUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { + MirrorUVRow = MirrorUVRow_NEON; + } +#endif +#if defined(HAS_MIRRORUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) { + MirrorUVRow = MirrorUVRow_SSSE3; + } +#endif +#if defined(HAS_MIRRORUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 32)) { + MirrorUVRow = MirrorUVRow_MSA; + } +#endif + + dst_a += dst_stride_a * (height - 1); + dst_b += dst_stride_b * (height - 1); + + for (i = 0; i < height; ++i) { + MirrorUVRow(src, dst_a, dst_b, width); + src += src_stride; + dst_a -= dst_stride_a; + dst_b -= dst_stride_b; + } +} + +LIBYUV_API +int RotatePlane(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width, + int height, + enum RotationMode mode) { + if (!src || width <= 0 || height == 0 || !dst) { + return -1; + } + + // Negative height means invert the image. + if (height < 0) { + height = -height; + src = src + (height - 1) * src_stride; + src_stride = -src_stride; + } + + switch (mode) { + case kRotate0: + // copy frame + CopyPlane(src, src_stride, dst, dst_stride, width, height); + return 0; + case kRotate90: + RotatePlane90(src, src_stride, dst, dst_stride, width, height); + return 0; + case kRotate270: + RotatePlane270(src, src_stride, dst, dst_stride, width, height); + return 0; + case kRotate180: + RotatePlane180(src, src_stride, dst, dst_stride, width, height); + return 0; + default: + break; + } + return -1; +} + +LIBYUV_API +int I420Rotate(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height, + enum RotationMode mode) { + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + if (!src_y || !src_u || !src_v || width <= 0 || height == 0 || !dst_y || + !dst_u || !dst_v) { + return -1; + } + + // Negative height means invert the image. 
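+  // Illustrative note (not in the original libyuv source): for kRotate90 and
+  // kRotate270 the output dimensions are swapped, so the destination planes
+  // must be sized and strided for a height x width image, e.g. rotating
+  // 640x480 I420 by 90 degrees needs a 480x640 destination.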
+ if (height < 0) { + height = -height; + halfheight = (height + 1) >> 1; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (halfheight - 1) * src_stride_u; + src_v = src_v + (halfheight - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + + switch (mode) { + case kRotate0: + // copy frame + return I420Copy(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u, + dst_v, dst_stride_v, width, height); + case kRotate90: + RotatePlane90(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + RotatePlane90(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, + halfheight); + RotatePlane90(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, + halfheight); + return 0; + case kRotate270: + RotatePlane270(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + RotatePlane270(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, + halfheight); + RotatePlane270(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, + halfheight); + return 0; + case kRotate180: + RotatePlane180(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + RotatePlane180(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, + halfheight); + RotatePlane180(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, + halfheight); + return 0; + default: + break; + } + return -1; +} + +LIBYUV_API +int NV12ToI420Rotate(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height, + enum RotationMode mode) { + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + if (!src_y || !src_uv || width <= 0 || height == 0 || !dst_y || !dst_u || + !dst_v) { + return -1; + } + + // Negative height means invert the image. + if (height < 0) { + height = -height; + halfheight = (height + 1) >> 1; + src_y = src_y + (height - 1) * src_stride_y; + src_uv = src_uv + (halfheight - 1) * src_stride_uv; + src_stride_y = -src_stride_y; + src_stride_uv = -src_stride_uv; + } + + switch (mode) { + case kRotate0: + // copy frame + return NV12ToI420(src_y, src_stride_y, src_uv, src_stride_uv, dst_y, + dst_stride_y, dst_u, dst_stride_u, dst_v, dst_stride_v, + width, height); + case kRotate90: + RotatePlane90(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + RotateUV90(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v, + dst_stride_v, halfwidth, halfheight); + return 0; + case kRotate270: + RotatePlane270(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + RotateUV270(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v, + dst_stride_v, halfwidth, halfheight); + return 0; + case kRotate180: + RotatePlane180(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + RotateUV180(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v, + dst_stride_v, halfwidth, halfheight); + return 0; + default: + break; + } + return -1; +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/media/libyuv/libyuv/source/rotate_any.cc b/media/libyuv/libyuv/source/rotate_any.cc new file mode 100644 index 0000000000..c2752e6222 --- /dev/null +++ b/media/libyuv/libyuv/source/rotate_any.cc @@ -0,0 +1,73 @@ +/* + * Copyright 2015 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. 
An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/rotate.h" +#include "libyuv/rotate_row.h" + +#include "libyuv/basic_types.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#define TANY(NAMEANY, TPOS_SIMD, MASK) \ + void NAMEANY(const uint8_t* src, int src_stride, uint8_t* dst, \ + int dst_stride, int width) { \ + int r = width & MASK; \ + int n = width - r; \ + if (n > 0) { \ + TPOS_SIMD(src, src_stride, dst, dst_stride, n); \ + } \ + TransposeWx8_C(src + n, src_stride, dst + n * dst_stride, dst_stride, r); \ + } + +#ifdef HAS_TRANSPOSEWX8_NEON +TANY(TransposeWx8_Any_NEON, TransposeWx8_NEON, 7) +#endif +#ifdef HAS_TRANSPOSEWX8_SSSE3 +TANY(TransposeWx8_Any_SSSE3, TransposeWx8_SSSE3, 7) +#endif +#ifdef HAS_TRANSPOSEWX8_FAST_SSSE3 +TANY(TransposeWx8_Fast_Any_SSSE3, TransposeWx8_Fast_SSSE3, 15) +#endif +#ifdef HAS_TRANSPOSEWX16_MSA +TANY(TransposeWx16_Any_MSA, TransposeWx16_MSA, 15) +#endif +#undef TANY + +#define TUVANY(NAMEANY, TPOS_SIMD, MASK) \ + void NAMEANY(const uint8_t* src, int src_stride, uint8_t* dst_a, \ + int dst_stride_a, uint8_t* dst_b, int dst_stride_b, \ + int width) { \ + int r = width & MASK; \ + int n = width - r; \ + if (n > 0) { \ + TPOS_SIMD(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, n); \ + } \ + TransposeUVWx8_C(src + n * 2, src_stride, dst_a + n * dst_stride_a, \ + dst_stride_a, dst_b + n * dst_stride_b, dst_stride_b, r); \ + } + +#ifdef HAS_TRANSPOSEUVWX8_NEON +TUVANY(TransposeUVWx8_Any_NEON, TransposeUVWx8_NEON, 7) +#endif +#ifdef HAS_TRANSPOSEUVWX8_SSE2 +TUVANY(TransposeUVWx8_Any_SSE2, TransposeUVWx8_SSE2, 7) +#endif +#ifdef HAS_TRANSPOSEUVWX16_MSA +TUVANY(TransposeUVWx16_Any_MSA, TransposeUVWx16_MSA, 7) +#endif +#undef TUVANY + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/media/libyuv/libyuv/source/rotate_argb.cc b/media/libyuv/libyuv/source/rotate_argb.cc new file mode 100644 index 0000000000..5a6e05376f --- /dev/null +++ b/media/libyuv/libyuv/source/rotate_argb.cc @@ -0,0 +1,224 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/rotate.h" + +#include "libyuv/convert.h" +#include "libyuv/cpu_id.h" +#include "libyuv/planar_functions.h" +#include "libyuv/row.h" +#include "libyuv/scale_row.h" /* for ScaleARGBRowDownEven_ */ + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +static void ARGBTranspose(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + int i; + int src_pixel_step = src_stride_argb >> 2; + void (*ScaleARGBRowDownEven)( + const uint8_t* src_argb, ptrdiff_t src_stride_argb, int src_step, + uint8_t* dst_argb, int dst_width) = ScaleARGBRowDownEven_C; +#if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ScaleARGBRowDownEven = ScaleARGBRowDownEven_Any_SSE2; + if (IS_ALIGNED(height, 4)) { // Width of dest. 
+      ScaleARGBRowDownEven = ScaleARGBRowDownEven_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_SCALEARGBROWDOWNEVEN_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ScaleARGBRowDownEven = ScaleARGBRowDownEven_Any_NEON;
+    if (IS_ALIGNED(height, 4)) {  // Width of dest.
+      ScaleARGBRowDownEven = ScaleARGBRowDownEven_NEON;
+    }
+  }
+#endif
+#if defined(HAS_SCALEARGBROWDOWNEVEN_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ScaleARGBRowDownEven = ScaleARGBRowDownEven_Any_MSA;
+    if (IS_ALIGNED(height, 4)) {  // Width of dest.
+      ScaleARGBRowDownEven = ScaleARGBRowDownEven_MSA;
+    }
+  }
+#endif
+
+  for (i = 0; i < width; ++i) {  // column of source to row of dest.
+    ScaleARGBRowDownEven(src_argb, 0, src_pixel_step, dst_argb, height);
+    dst_argb += dst_stride_argb;
+    src_argb += 4;
+  }
+}
+
+void ARGBRotate90(const uint8_t* src_argb,
+                  int src_stride_argb,
+                  uint8_t* dst_argb,
+                  int dst_stride_argb,
+                  int width,
+                  int height) {
+  // Rotate by 90 is an ARGBTranspose with the source read
+  // from bottom to top. So set the source pointer to the end
+  // of the buffer and flip the sign of the source stride.
+  src_argb += src_stride_argb * (height - 1);
+  src_stride_argb = -src_stride_argb;
+  ARGBTranspose(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width,
+                height);
+}
+
+void ARGBRotate270(const uint8_t* src_argb,
+                   int src_stride_argb,
+                   uint8_t* dst_argb,
+                   int dst_stride_argb,
+                   int width,
+                   int height) {
+  // Rotate by 270 is an ARGBTranspose with the destination written
+  // from bottom to top. So set the destination pointer to the end
+  // of the buffer and flip the sign of the destination stride.
+  dst_argb += dst_stride_argb * (width - 1);
+  dst_stride_argb = -dst_stride_argb;
+  ARGBTranspose(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width,
+                height);
+}
+
+void ARGBRotate180(const uint8_t* src_argb,
+                   int src_stride_argb,
+                   uint8_t* dst_argb,
+                   int dst_stride_argb,
+                   int width,
+                   int height) {
+  // Swap first and last row and mirror the content. Uses a temporary row.
+  align_buffer_64(row, width * 4);
+  const uint8_t* src_bot = src_argb + src_stride_argb * (height - 1);
+  uint8_t* dst_bot = dst_argb + dst_stride_argb * (height - 1);
+  int half_height = (height + 1) >> 1;
+  int y;
+  void (*ARGBMirrorRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) =
+      ARGBMirrorRow_C;
+  void (*CopyRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) =
+      CopyRow_C;
+#if defined(HAS_ARGBMIRRORROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBMirrorRow = ARGBMirrorRow_Any_NEON;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBMirrorRow = ARGBMirrorRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_ARGBMIRRORROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ARGBMirrorRow = ARGBMirrorRow_Any_SSE2;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBMirrorRow = ARGBMirrorRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBMIRRORROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBMirrorRow = ARGBMirrorRow_Any_AVX2;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBMirrorRow = ARGBMirrorRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBMIRRORROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBMirrorRow = ARGBMirrorRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBMirrorRow = ARGBMirrorRow_MSA;
+    }
+  }
+#endif
+#if defined(HAS_COPYROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    CopyRow = IS_ALIGNED(width * 4, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
+  }
+#endif
+#if defined(HAS_COPYROW_AVX)
+  if (TestCpuFlag(kCpuHasAVX)) {
+    CopyRow = IS_ALIGNED(width * 4, 64) ?
CopyRow_AVX : CopyRow_Any_AVX; + } +#endif +#if defined(HAS_COPYROW_ERMS) + if (TestCpuFlag(kCpuHasERMS)) { + CopyRow = CopyRow_ERMS; + } +#endif +#if defined(HAS_COPYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + CopyRow = IS_ALIGNED(width * 4, 32) ? CopyRow_NEON : CopyRow_Any_NEON; + } +#endif + + // Odd height will harmlessly mirror the middle row twice. + for (y = 0; y < half_height; ++y) { + ARGBMirrorRow(src_argb, row, width); // Mirror first row into a buffer + ARGBMirrorRow(src_bot, dst_argb, width); // Mirror last row into first row + CopyRow(row, dst_bot, width * 4); // Copy first mirrored row into last + src_argb += src_stride_argb; + dst_argb += dst_stride_argb; + src_bot -= src_stride_argb; + dst_bot -= dst_stride_argb; + } + free_aligned_buffer_64(row); +} + +LIBYUV_API +int ARGBRotate(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height, + enum RotationMode mode) { + if (!src_argb || width <= 0 || height == 0 || !dst_argb) { + return -1; + } + + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + + switch (mode) { + case kRotate0: + // copy frame + return ARGBCopy(src_argb, src_stride_argb, dst_argb, dst_stride_argb, + width, height); + case kRotate90: + ARGBRotate90(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width, + height); + return 0; + case kRotate270: + ARGBRotate270(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width, + height); + return 0; + case kRotate180: + ARGBRotate180(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width, + height); + return 0; + default: + break; + } + return -1; +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/media/libyuv/libyuv/source/rotate_common.cc b/media/libyuv/libyuv/source/rotate_common.cc new file mode 100644 index 0000000000..ff212adebc --- /dev/null +++ b/media/libyuv/libyuv/source/rotate_common.cc @@ -0,0 +1,106 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "libyuv/rotate_row.h" +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +void TransposeWx8_C(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width) { + int i; + for (i = 0; i < width; ++i) { + dst[0] = src[0 * src_stride]; + dst[1] = src[1 * src_stride]; + dst[2] = src[2 * src_stride]; + dst[3] = src[3 * src_stride]; + dst[4] = src[4 * src_stride]; + dst[5] = src[5 * src_stride]; + dst[6] = src[6 * src_stride]; + dst[7] = src[7 * src_stride]; + ++src; + dst += dst_stride; + } +} + +void TransposeUVWx8_C(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width) { + int i; + for (i = 0; i < width; ++i) { + dst_a[0] = src[0 * src_stride + 0]; + dst_b[0] = src[0 * src_stride + 1]; + dst_a[1] = src[1 * src_stride + 0]; + dst_b[1] = src[1 * src_stride + 1]; + dst_a[2] = src[2 * src_stride + 0]; + dst_b[2] = src[2 * src_stride + 1]; + dst_a[3] = src[3 * src_stride + 0]; + dst_b[3] = src[3 * src_stride + 1]; + dst_a[4] = src[4 * src_stride + 0]; + dst_b[4] = src[4 * src_stride + 1]; + dst_a[5] = src[5 * src_stride + 0]; + dst_b[5] = src[5 * src_stride + 1]; + dst_a[6] = src[6 * src_stride + 0]; + dst_b[6] = src[6 * src_stride + 1]; + dst_a[7] = src[7 * src_stride + 0]; + dst_b[7] = src[7 * src_stride + 1]; + src += 2; + dst_a += dst_stride_a; + dst_b += dst_stride_b; + } +} + +void TransposeWxH_C(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width, + int height) { + int i; + for (i = 0; i < width; ++i) { + int j; + for (j = 0; j < height; ++j) { + dst[i * dst_stride + j] = src[j * src_stride + i]; + } + } +} + +void TransposeUVWxH_C(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width, + int height) { + int i; + for (i = 0; i < width * 2; i += 2) { + int j; + for (j = 0; j < height; ++j) { + dst_a[j + ((i >> 1) * dst_stride_a)] = src[i + (j * src_stride)]; + dst_b[j + ((i >> 1) * dst_stride_b)] = src[i + (j * src_stride) + 1]; + } + } +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/media/libyuv/libyuv/source/rotate_gcc.cc b/media/libyuv/libyuv/source/rotate_gcc.cc new file mode 100644 index 0000000000..04e19e29ee --- /dev/null +++ b/media/libyuv/libyuv/source/rotate_gcc.cc @@ -0,0 +1,374 @@ +/* + * Copyright 2015 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/rotate_row.h" +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// This module is for GCC x86 and x64. +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) + +// Transpose 8x8. 32 or 64 bit, but not NaCL for 64 bit. +#if defined(HAS_TRANSPOSEWX8_SSSE3) +void TransposeWx8_SSSE3(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width) { + asm volatile( + // Read in the data from the source pointer. + // First round of bit swap. 
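+      // Illustrative note (not in the original libyuv source): the 8x8 byte
+      // transpose is built from three interleave rounds - punpcklbw on bytes,
+      // punpcklwd on 16-bit pairs, then punpckldq on 32-bit groups - after
+      // which each xmm register holds two output rows, written 8 bytes at a
+      // time with movq.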
+ LABELALIGN + "1: \n" + "movq (%0),%%xmm0 \n" + "movq (%0,%3),%%xmm1 \n" + "lea (%0,%3,2),%0 \n" + "punpcklbw %%xmm1,%%xmm0 \n" + "movq (%0),%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "palignr $0x8,%%xmm1,%%xmm1 \n" + "movq (%0,%3),%%xmm3 \n" + "lea (%0,%3,2),%0 \n" + "punpcklbw %%xmm3,%%xmm2 \n" + "movdqa %%xmm2,%%xmm3 \n" + "movq (%0),%%xmm4 \n" + "palignr $0x8,%%xmm3,%%xmm3 \n" + "movq (%0,%3),%%xmm5 \n" + "lea (%0,%3,2),%0 \n" + "punpcklbw %%xmm5,%%xmm4 \n" + "movdqa %%xmm4,%%xmm5 \n" + "movq (%0),%%xmm6 \n" + "palignr $0x8,%%xmm5,%%xmm5 \n" + "movq (%0,%3),%%xmm7 \n" + "lea (%0,%3,2),%0 \n" + "punpcklbw %%xmm7,%%xmm6 \n" + "neg %3 \n" + "movdqa %%xmm6,%%xmm7 \n" + "lea 0x8(%0,%3,8),%0 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + "neg %3 \n" + // Second round of bit swap. + "punpcklwd %%xmm2,%%xmm0 \n" + "punpcklwd %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "movdqa %%xmm1,%%xmm3 \n" + "palignr $0x8,%%xmm2,%%xmm2 \n" + "palignr $0x8,%%xmm3,%%xmm3 \n" + "punpcklwd %%xmm6,%%xmm4 \n" + "punpcklwd %%xmm7,%%xmm5 \n" + "movdqa %%xmm4,%%xmm6 \n" + "movdqa %%xmm5,%%xmm7 \n" + "palignr $0x8,%%xmm6,%%xmm6 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + // Third round of bit swap. + // Write to the destination pointer. + "punpckldq %%xmm4,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "movdqa %%xmm0,%%xmm4 \n" + "palignr $0x8,%%xmm4,%%xmm4 \n" + "movq %%xmm4,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm6,%%xmm2 \n" + "movdqa %%xmm2,%%xmm6 \n" + "movq %%xmm2,(%1) \n" + "palignr $0x8,%%xmm6,%%xmm6 \n" + "punpckldq %%xmm5,%%xmm1 \n" + "movq %%xmm6,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "movdqa %%xmm1,%%xmm5 \n" + "movq %%xmm1,(%1) \n" + "palignr $0x8,%%xmm5,%%xmm5 \n" + "movq %%xmm5,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm7,%%xmm3 \n" + "movq %%xmm3,(%1) \n" + "movdqa %%xmm3,%%xmm7 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + "sub $0x8,%2 \n" + "movq %%xmm7,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"((intptr_t)(dst_stride)) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} +#endif // defined(HAS_TRANSPOSEWX8_SSSE3) + +// Transpose 16x8. 64 bit +#if defined(HAS_TRANSPOSEWX8_FAST_SSSE3) +void TransposeWx8_Fast_SSSE3(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width) { + asm volatile( + // Read in the data from the source pointer. + // First round of bit swap. 
+ LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu (%0,%3),%%xmm1 \n" + "lea (%0,%3,2),%0 \n" + "movdqa %%xmm0,%%xmm8 \n" + "punpcklbw %%xmm1,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm8 \n" + "movdqu (%0),%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm8,%%xmm9 \n" + "palignr $0x8,%%xmm1,%%xmm1 \n" + "palignr $0x8,%%xmm9,%%xmm9 \n" + "movdqu (%0,%3),%%xmm3 \n" + "lea (%0,%3,2),%0 \n" + "movdqa %%xmm2,%%xmm10 \n" + "punpcklbw %%xmm3,%%xmm2 \n" + "punpckhbw %%xmm3,%%xmm10 \n" + "movdqa %%xmm2,%%xmm3 \n" + "movdqa %%xmm10,%%xmm11 \n" + "movdqu (%0),%%xmm4 \n" + "palignr $0x8,%%xmm3,%%xmm3 \n" + "palignr $0x8,%%xmm11,%%xmm11 \n" + "movdqu (%0,%3),%%xmm5 \n" + "lea (%0,%3,2),%0 \n" + "movdqa %%xmm4,%%xmm12 \n" + "punpcklbw %%xmm5,%%xmm4 \n" + "punpckhbw %%xmm5,%%xmm12 \n" + "movdqa %%xmm4,%%xmm5 \n" + "movdqa %%xmm12,%%xmm13 \n" + "movdqu (%0),%%xmm6 \n" + "palignr $0x8,%%xmm5,%%xmm5 \n" + "palignr $0x8,%%xmm13,%%xmm13 \n" + "movdqu (%0,%3),%%xmm7 \n" + "lea (%0,%3,2),%0 \n" + "movdqa %%xmm6,%%xmm14 \n" + "punpcklbw %%xmm7,%%xmm6 \n" + "punpckhbw %%xmm7,%%xmm14 \n" + "neg %3 \n" + "movdqa %%xmm6,%%xmm7 \n" + "movdqa %%xmm14,%%xmm15 \n" + "lea 0x10(%0,%3,8),%0 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + "palignr $0x8,%%xmm15,%%xmm15 \n" + "neg %3 \n" + // Second round of bit swap. + "punpcklwd %%xmm2,%%xmm0 \n" + "punpcklwd %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "movdqa %%xmm1,%%xmm3 \n" + "palignr $0x8,%%xmm2,%%xmm2 \n" + "palignr $0x8,%%xmm3,%%xmm3 \n" + "punpcklwd %%xmm6,%%xmm4 \n" + "punpcklwd %%xmm7,%%xmm5 \n" + "movdqa %%xmm4,%%xmm6 \n" + "movdqa %%xmm5,%%xmm7 \n" + "palignr $0x8,%%xmm6,%%xmm6 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + "punpcklwd %%xmm10,%%xmm8 \n" + "punpcklwd %%xmm11,%%xmm9 \n" + "movdqa %%xmm8,%%xmm10 \n" + "movdqa %%xmm9,%%xmm11 \n" + "palignr $0x8,%%xmm10,%%xmm10 \n" + "palignr $0x8,%%xmm11,%%xmm11 \n" + "punpcklwd %%xmm14,%%xmm12 \n" + "punpcklwd %%xmm15,%%xmm13 \n" + "movdqa %%xmm12,%%xmm14 \n" + "movdqa %%xmm13,%%xmm15 \n" + "palignr $0x8,%%xmm14,%%xmm14 \n" + "palignr $0x8,%%xmm15,%%xmm15 \n" + // Third round of bit swap. + // Write to the destination pointer. 
+ "punpckldq %%xmm4,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "movdqa %%xmm0,%%xmm4 \n" + "palignr $0x8,%%xmm4,%%xmm4 \n" + "movq %%xmm4,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm6,%%xmm2 \n" + "movdqa %%xmm2,%%xmm6 \n" + "movq %%xmm2,(%1) \n" + "palignr $0x8,%%xmm6,%%xmm6 \n" + "punpckldq %%xmm5,%%xmm1 \n" + "movq %%xmm6,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "movdqa %%xmm1,%%xmm5 \n" + "movq %%xmm1,(%1) \n" + "palignr $0x8,%%xmm5,%%xmm5 \n" + "movq %%xmm5,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm7,%%xmm3 \n" + "movq %%xmm3,(%1) \n" + "movdqa %%xmm3,%%xmm7 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + "movq %%xmm7,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm12,%%xmm8 \n" + "movq %%xmm8,(%1) \n" + "movdqa %%xmm8,%%xmm12 \n" + "palignr $0x8,%%xmm12,%%xmm12 \n" + "movq %%xmm12,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm14,%%xmm10 \n" + "movdqa %%xmm10,%%xmm14 \n" + "movq %%xmm10,(%1) \n" + "palignr $0x8,%%xmm14,%%xmm14 \n" + "punpckldq %%xmm13,%%xmm9 \n" + "movq %%xmm14,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "movdqa %%xmm9,%%xmm13 \n" + "movq %%xmm9,(%1) \n" + "palignr $0x8,%%xmm13,%%xmm13 \n" + "movq %%xmm13,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm15,%%xmm11 \n" + "movq %%xmm11,(%1) \n" + "movdqa %%xmm11,%%xmm15 \n" + "palignr $0x8,%%xmm15,%%xmm15 \n" + "sub $0x10,%2 \n" + "movq %%xmm15,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"((intptr_t)(dst_stride)) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", + "xmm15"); +} +#endif // defined(HAS_TRANSPOSEWX8_FAST_SSSE3) + +// Transpose UV 8x8. 64 bit. +#if defined(HAS_TRANSPOSEUVWX8_SSE2) +void TransposeUVWx8_SSE2(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width) { + asm volatile( + // Read in the data from the source pointer. + // First round of bit swap. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu (%0,%4),%%xmm1 \n" + "lea (%0,%4,2),%0 \n" + "movdqa %%xmm0,%%xmm8 \n" + "punpcklbw %%xmm1,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm8 \n" + "movdqa %%xmm8,%%xmm1 \n" + "movdqu (%0),%%xmm2 \n" + "movdqu (%0,%4),%%xmm3 \n" + "lea (%0,%4,2),%0 \n" + "movdqa %%xmm2,%%xmm8 \n" + "punpcklbw %%xmm3,%%xmm2 \n" + "punpckhbw %%xmm3,%%xmm8 \n" + "movdqa %%xmm8,%%xmm3 \n" + "movdqu (%0),%%xmm4 \n" + "movdqu (%0,%4),%%xmm5 \n" + "lea (%0,%4,2),%0 \n" + "movdqa %%xmm4,%%xmm8 \n" + "punpcklbw %%xmm5,%%xmm4 \n" + "punpckhbw %%xmm5,%%xmm8 \n" + "movdqa %%xmm8,%%xmm5 \n" + "movdqu (%0),%%xmm6 \n" + "movdqu (%0,%4),%%xmm7 \n" + "lea (%0,%4,2),%0 \n" + "movdqa %%xmm6,%%xmm8 \n" + "punpcklbw %%xmm7,%%xmm6 \n" + "neg %4 \n" + "lea 0x10(%0,%4,8),%0 \n" + "punpckhbw %%xmm7,%%xmm8 \n" + "movdqa %%xmm8,%%xmm7 \n" + "neg %4 \n" + // Second round of bit swap. + "movdqa %%xmm0,%%xmm8 \n" + "movdqa %%xmm1,%%xmm9 \n" + "punpckhwd %%xmm2,%%xmm8 \n" + "punpckhwd %%xmm3,%%xmm9 \n" + "punpcklwd %%xmm2,%%xmm0 \n" + "punpcklwd %%xmm3,%%xmm1 \n" + "movdqa %%xmm8,%%xmm2 \n" + "movdqa %%xmm9,%%xmm3 \n" + "movdqa %%xmm4,%%xmm8 \n" + "movdqa %%xmm5,%%xmm9 \n" + "punpckhwd %%xmm6,%%xmm8 \n" + "punpckhwd %%xmm7,%%xmm9 \n" + "punpcklwd %%xmm6,%%xmm4 \n" + "punpcklwd %%xmm7,%%xmm5 \n" + "movdqa %%xmm8,%%xmm6 \n" + "movdqa %%xmm9,%%xmm7 \n" + // Third round of bit swap. + // Write to the destination pointer. 
+ "movdqa %%xmm0,%%xmm8 \n" + "punpckldq %%xmm4,%%xmm0 \n" + "movlpd %%xmm0,(%1) \n" // Write back U channel + "movhpd %%xmm0,(%2) \n" // Write back V channel + "punpckhdq %%xmm4,%%xmm8 \n" + "movlpd %%xmm8,(%1,%5) \n" + "lea (%1,%5,2),%1 \n" + "movhpd %%xmm8,(%2,%6) \n" + "lea (%2,%6,2),%2 \n" + "movdqa %%xmm2,%%xmm8 \n" + "punpckldq %%xmm6,%%xmm2 \n" + "movlpd %%xmm2,(%1) \n" + "movhpd %%xmm2,(%2) \n" + "punpckhdq %%xmm6,%%xmm8 \n" + "movlpd %%xmm8,(%1,%5) \n" + "lea (%1,%5,2),%1 \n" + "movhpd %%xmm8,(%2,%6) \n" + "lea (%2,%6,2),%2 \n" + "movdqa %%xmm1,%%xmm8 \n" + "punpckldq %%xmm5,%%xmm1 \n" + "movlpd %%xmm1,(%1) \n" + "movhpd %%xmm1,(%2) \n" + "punpckhdq %%xmm5,%%xmm8 \n" + "movlpd %%xmm8,(%1,%5) \n" + "lea (%1,%5,2),%1 \n" + "movhpd %%xmm8,(%2,%6) \n" + "lea (%2,%6,2),%2 \n" + "movdqa %%xmm3,%%xmm8 \n" + "punpckldq %%xmm7,%%xmm3 \n" + "movlpd %%xmm3,(%1) \n" + "movhpd %%xmm3,(%2) \n" + "punpckhdq %%xmm7,%%xmm8 \n" + "sub $0x8,%3 \n" + "movlpd %%xmm8,(%1,%5) \n" + "lea (%1,%5,2),%1 \n" + "movhpd %%xmm8,(%2,%6) \n" + "lea (%2,%6,2),%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst_a), // %1 + "+r"(dst_b), // %2 + "+r"(width) // %3 + : "r"((intptr_t)(src_stride)), // %4 + "r"((intptr_t)(dst_stride_a)), // %5 + "r"((intptr_t)(dst_stride_b)) // %6 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7", "xmm8", "xmm9"); +} +#endif // defined(HAS_TRANSPOSEUVWX8_SSE2) +#endif // defined(__x86_64__) || defined(__i386__) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/media/libyuv/libyuv/source/rotate_msa.cc b/media/libyuv/libyuv/source/rotate_msa.cc new file mode 100644 index 0000000000..99bdca65b3 --- /dev/null +++ b/media/libyuv/libyuv/source/rotate_msa.cc @@ -0,0 +1,250 @@ +/* + * Copyright 2016 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "libyuv/rotate_row.h" + +// This module is for GCC MSA +#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) +#include "libyuv/macros_msa.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#define ILVRL_B(in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + out0 = (v16u8)__msa_ilvr_b((v16i8)in1, (v16i8)in0); \ + out1 = (v16u8)__msa_ilvl_b((v16i8)in1, (v16i8)in0); \ + out2 = (v16u8)__msa_ilvr_b((v16i8)in3, (v16i8)in2); \ + out3 = (v16u8)__msa_ilvl_b((v16i8)in3, (v16i8)in2); \ + } + +#define ILVRL_H(in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + out0 = (v16u8)__msa_ilvr_h((v8i16)in1, (v8i16)in0); \ + out1 = (v16u8)__msa_ilvl_h((v8i16)in1, (v8i16)in0); \ + out2 = (v16u8)__msa_ilvr_h((v8i16)in3, (v8i16)in2); \ + out3 = (v16u8)__msa_ilvl_h((v8i16)in3, (v8i16)in2); \ + } + +#define ILVRL_W(in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + out0 = (v16u8)__msa_ilvr_w((v4i32)in1, (v4i32)in0); \ + out1 = (v16u8)__msa_ilvl_w((v4i32)in1, (v4i32)in0); \ + out2 = (v16u8)__msa_ilvr_w((v4i32)in3, (v4i32)in2); \ + out3 = (v16u8)__msa_ilvl_w((v4i32)in3, (v4i32)in2); \ + } + +#define ILVRL_D(in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + out0 = (v16u8)__msa_ilvr_d((v2i64)in1, (v2i64)in0); \ + out1 = (v16u8)__msa_ilvl_d((v2i64)in1, (v2i64)in0); \ + out2 = (v16u8)__msa_ilvr_d((v2i64)in3, (v2i64)in2); \ + out3 = (v16u8)__msa_ilvl_d((v2i64)in3, (v2i64)in2); \ + } + +void TransposeWx16_C(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width) { + TransposeWx8_C(src, src_stride, dst, dst_stride, width); + TransposeWx8_C((src + 8 * src_stride), src_stride, (dst + 8), dst_stride, + width); +} + +void TransposeUVWx16_C(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width) { + TransposeUVWx8_C(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, + width); + TransposeUVWx8_C((src + 8 * src_stride), src_stride, (dst_a + 8), + dst_stride_a, (dst_b + 8), dst_stride_b, width); +} + +void TransposeWx16_MSA(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width) { + int x; + const uint8_t* s; + v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3; + v16u8 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; + v16u8 res0, res1, res2, res3, res4, res5, res6, res7, res8, res9; + + for (x = 0; x < width; x += 16) { + s = src; + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src1 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src2 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src3 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3); + ILVRL_H(vec0, vec2, vec1, vec3, reg0, reg1, reg2, reg3); + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src1 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src2 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src3 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3); + ILVRL_H(vec0, vec2, vec1, vec3, reg4, reg5, reg6, reg7); + ILVRL_W(reg0, reg4, reg1, reg5, res0, res1, res2, res3); + ILVRL_W(reg2, reg6, reg3, reg7, res4, res5, res6, res7); + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src1 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src2 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src3 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + ILVRL_B(src0, src1, src2, 
src3, vec0, vec1, vec2, vec3); + ILVRL_H(vec0, vec2, vec1, vec3, reg0, reg1, reg2, reg3); + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src1 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src2 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src3 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3); + ILVRL_H(vec0, vec2, vec1, vec3, reg4, reg5, reg6, reg7); + res8 = (v16u8)__msa_ilvr_w((v4i32)reg4, (v4i32)reg0); + res9 = (v16u8)__msa_ilvl_w((v4i32)reg4, (v4i32)reg0); + ILVRL_D(res0, res8, res1, res9, dst0, dst1, dst2, dst3); + ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride); + dst += dst_stride * 4; + res8 = (v16u8)__msa_ilvr_w((v4i32)reg5, (v4i32)reg1); + res9 = (v16u8)__msa_ilvl_w((v4i32)reg5, (v4i32)reg1); + ILVRL_D(res2, res8, res3, res9, dst0, dst1, dst2, dst3); + ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride); + dst += dst_stride * 4; + res8 = (v16u8)__msa_ilvr_w((v4i32)reg6, (v4i32)reg2); + res9 = (v16u8)__msa_ilvl_w((v4i32)reg6, (v4i32)reg2); + ILVRL_D(res4, res8, res5, res9, dst0, dst1, dst2, dst3); + ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride); + dst += dst_stride * 4; + res8 = (v16u8)__msa_ilvr_w((v4i32)reg7, (v4i32)reg3); + res9 = (v16u8)__msa_ilvl_w((v4i32)reg7, (v4i32)reg3); + ILVRL_D(res6, res8, res7, res9, dst0, dst1, dst2, dst3); + ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride); + src += 16; + dst += dst_stride * 4; + } +} + +void TransposeUVWx16_MSA(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width) { + int x; + const uint8_t* s; + v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3; + v16u8 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; + v16u8 res0, res1, res2, res3, res4, res5, res6, res7, res8, res9; + + for (x = 0; x < width; x += 8) { + s = src; + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src1 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src2 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src3 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3); + ILVRL_H(vec0, vec2, vec1, vec3, reg0, reg1, reg2, reg3); + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src1 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src2 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src3 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3); + ILVRL_H(vec0, vec2, vec1, vec3, reg4, reg5, reg6, reg7); + ILVRL_W(reg0, reg4, reg1, reg5, res0, res1, res2, res3); + ILVRL_W(reg2, reg6, reg3, reg7, res4, res5, res6, res7); + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src1 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src2 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src3 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3); + ILVRL_H(vec0, vec2, vec1, vec3, reg0, reg1, reg2, reg3); + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src1 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src2 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src3 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3); + ILVRL_H(vec0, vec2, vec1, vec3, reg4, reg5, reg6, reg7); + res8 = (v16u8)__msa_ilvr_w((v4i32)reg4, (v4i32)reg0); + res9 = 
(v16u8)__msa_ilvl_w((v4i32)reg4, (v4i32)reg0); + ILVRL_D(res0, res8, res1, res9, dst0, dst1, dst2, dst3); + ST_UB2(dst0, dst2, dst_a, dst_stride_a); + ST_UB2(dst1, dst3, dst_b, dst_stride_b); + dst_a += dst_stride_a * 2; + dst_b += dst_stride_b * 2; + res8 = (v16u8)__msa_ilvr_w((v4i32)reg5, (v4i32)reg1); + res9 = (v16u8)__msa_ilvl_w((v4i32)reg5, (v4i32)reg1); + ILVRL_D(res2, res8, res3, res9, dst0, dst1, dst2, dst3); + ST_UB2(dst0, dst2, dst_a, dst_stride_a); + ST_UB2(dst1, dst3, dst_b, dst_stride_b); + dst_a += dst_stride_a * 2; + dst_b += dst_stride_b * 2; + res8 = (v16u8)__msa_ilvr_w((v4i32)reg6, (v4i32)reg2); + res9 = (v16u8)__msa_ilvl_w((v4i32)reg6, (v4i32)reg2); + ILVRL_D(res4, res8, res5, res9, dst0, dst1, dst2, dst3); + ST_UB2(dst0, dst2, dst_a, dst_stride_a); + ST_UB2(dst1, dst3, dst_b, dst_stride_b); + dst_a += dst_stride_a * 2; + dst_b += dst_stride_b * 2; + res8 = (v16u8)__msa_ilvr_w((v4i32)reg7, (v4i32)reg3); + res9 = (v16u8)__msa_ilvl_w((v4i32)reg7, (v4i32)reg3); + ILVRL_D(res6, res8, res7, res9, dst0, dst1, dst2, dst3); + ST_UB2(dst0, dst2, dst_a, dst_stride_a); + ST_UB2(dst1, dst3, dst_b, dst_stride_b); + src += 16; + dst_a += dst_stride_a * 2; + dst_b += dst_stride_b * 2; + } +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) diff --git a/media/libyuv/libyuv/source/rotate_neon.cc b/media/libyuv/libyuv/source/rotate_neon.cc new file mode 100644 index 0000000000..fdc0dd476c --- /dev/null +++ b/media/libyuv/libyuv/source/rotate_neon.cc @@ -0,0 +1,416 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/rotate_row.h" +#include "libyuv/row.h" + +#include "libyuv/basic_types.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \ + !defined(__aarch64__) + +static const uvec8 kVTbl4x4Transpose = {0, 4, 8, 12, 1, 5, 9, 13, + 2, 6, 10, 14, 3, 7, 11, 15}; + +void TransposeWx8_NEON(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width) { + const uint8_t* src_temp; + asm volatile( + // loops are on blocks of 8. loop will stop when + // counter gets to or below 0. starting the counter + // at w-8 allow for this + "sub %5, #8 \n" + + // handle 8x8 blocks. 
this should be the majority of the plane + "1: \n" + "mov %0, %1 \n" + + "vld1.8 {d0}, [%0], %2 \n" + "vld1.8 {d1}, [%0], %2 \n" + "vld1.8 {d2}, [%0], %2 \n" + "vld1.8 {d3}, [%0], %2 \n" + "vld1.8 {d4}, [%0], %2 \n" + "vld1.8 {d5}, [%0], %2 \n" + "vld1.8 {d6}, [%0], %2 \n" + "vld1.8 {d7}, [%0] \n" + + "vtrn.8 d1, d0 \n" + "vtrn.8 d3, d2 \n" + "vtrn.8 d5, d4 \n" + "vtrn.8 d7, d6 \n" + + "vtrn.16 d1, d3 \n" + "vtrn.16 d0, d2 \n" + "vtrn.16 d5, d7 \n" + "vtrn.16 d4, d6 \n" + + "vtrn.32 d1, d5 \n" + "vtrn.32 d0, d4 \n" + "vtrn.32 d3, d7 \n" + "vtrn.32 d2, d6 \n" + + "vrev16.8 q0, q0 \n" + "vrev16.8 q1, q1 \n" + "vrev16.8 q2, q2 \n" + "vrev16.8 q3, q3 \n" + + "mov %0, %3 \n" + + "vst1.8 {d1}, [%0], %4 \n" + "vst1.8 {d0}, [%0], %4 \n" + "vst1.8 {d3}, [%0], %4 \n" + "vst1.8 {d2}, [%0], %4 \n" + "vst1.8 {d5}, [%0], %4 \n" + "vst1.8 {d4}, [%0], %4 \n" + "vst1.8 {d7}, [%0], %4 \n" + "vst1.8 {d6}, [%0] \n" + + "add %1, #8 \n" // src += 8 + "add %3, %3, %4, lsl #3 \n" // dst += 8 * dst_stride + "subs %5, #8 \n" // w -= 8 + "bge 1b \n" + + // add 8 back to counter. if the result is 0 there are + // no residuals. + "adds %5, #8 \n" + "beq 4f \n" + + // some residual, so between 1 and 7 lines left to transpose + "cmp %5, #2 \n" + "blt 3f \n" + + "cmp %5, #4 \n" + "blt 2f \n" + + // 4x8 block + "mov %0, %1 \n" + "vld1.32 {d0[0]}, [%0], %2 \n" + "vld1.32 {d0[1]}, [%0], %2 \n" + "vld1.32 {d1[0]}, [%0], %2 \n" + "vld1.32 {d1[1]}, [%0], %2 \n" + "vld1.32 {d2[0]}, [%0], %2 \n" + "vld1.32 {d2[1]}, [%0], %2 \n" + "vld1.32 {d3[0]}, [%0], %2 \n" + "vld1.32 {d3[1]}, [%0] \n" + + "mov %0, %3 \n" + + "vld1.8 {q3}, [%6] \n" + + "vtbl.8 d4, {d0, d1}, d6 \n" + "vtbl.8 d5, {d0, d1}, d7 \n" + "vtbl.8 d0, {d2, d3}, d6 \n" + "vtbl.8 d1, {d2, d3}, d7 \n" + + // TODO(frkoenig): Rework shuffle above to + // write out with 4 instead of 8 writes. 
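+      // d4/d5 now hold bytes 0-3 of the four transposed output rows
+      // (gathered from source rows 0-3); d0/d1 hold bytes 4-7 (from
+      // source rows 4-7), written out 4 bytes per row below.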
+ "vst1.32 {d4[0]}, [%0], %4 \n" + "vst1.32 {d4[1]}, [%0], %4 \n" + "vst1.32 {d5[0]}, [%0], %4 \n" + "vst1.32 {d5[1]}, [%0] \n" + + "add %0, %3, #4 \n" + "vst1.32 {d0[0]}, [%0], %4 \n" + "vst1.32 {d0[1]}, [%0], %4 \n" + "vst1.32 {d1[0]}, [%0], %4 \n" + "vst1.32 {d1[1]}, [%0] \n" + + "add %1, #4 \n" // src += 4 + "add %3, %3, %4, lsl #2 \n" // dst += 4 * dst_stride + "subs %5, #4 \n" // w -= 4 + "beq 4f \n" + + // some residual, check to see if it includes a 2x8 block, + // or less + "cmp %5, #2 \n" + "blt 3f \n" + + // 2x8 block + "2: \n" + "mov %0, %1 \n" + "vld1.16 {d0[0]}, [%0], %2 \n" + "vld1.16 {d1[0]}, [%0], %2 \n" + "vld1.16 {d0[1]}, [%0], %2 \n" + "vld1.16 {d1[1]}, [%0], %2 \n" + "vld1.16 {d0[2]}, [%0], %2 \n" + "vld1.16 {d1[2]}, [%0], %2 \n" + "vld1.16 {d0[3]}, [%0], %2 \n" + "vld1.16 {d1[3]}, [%0] \n" + + "vtrn.8 d0, d1 \n" + + "mov %0, %3 \n" + + "vst1.64 {d0}, [%0], %4 \n" + "vst1.64 {d1}, [%0] \n" + + "add %1, #2 \n" // src += 2 + "add %3, %3, %4, lsl #1 \n" // dst += 2 * dst_stride + "subs %5, #2 \n" // w -= 2 + "beq 4f \n" + + // 1x8 block + "3: \n" + "vld1.8 {d0[0]}, [%1], %2 \n" + "vld1.8 {d0[1]}, [%1], %2 \n" + "vld1.8 {d0[2]}, [%1], %2 \n" + "vld1.8 {d0[3]}, [%1], %2 \n" + "vld1.8 {d0[4]}, [%1], %2 \n" + "vld1.8 {d0[5]}, [%1], %2 \n" + "vld1.8 {d0[6]}, [%1], %2 \n" + "vld1.8 {d0[7]}, [%1] \n" + + "vst1.64 {d0}, [%3] \n" + + "4: \n" + + : "=&r"(src_temp), // %0 + "+r"(src), // %1 + "+r"(src_stride), // %2 + "+r"(dst), // %3 + "+r"(dst_stride), // %4 + "+r"(width) // %5 + : "r"(&kVTbl4x4Transpose) // %6 + : "memory", "cc", "q0", "q1", "q2", "q3"); +} + +static const uvec8 kVTbl4x4TransposeDi = {0, 8, 1, 9, 2, 10, 3, 11, + 4, 12, 5, 13, 6, 14, 7, 15}; + +void TransposeUVWx8_NEON(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width) { + const uint8_t* src_temp; + asm volatile( + // loops are on blocks of 8. loop will stop when + // counter gets to or below 0. starting the counter + // at w-8 allow for this + "sub %7, #8 \n" + + // handle 8x8 blocks. 
this should be the majority of the plane + "1: \n" + "mov %0, %1 \n" + + "vld2.8 {d0, d1}, [%0], %2 \n" + "vld2.8 {d2, d3}, [%0], %2 \n" + "vld2.8 {d4, d5}, [%0], %2 \n" + "vld2.8 {d6, d7}, [%0], %2 \n" + "vld2.8 {d16, d17}, [%0], %2 \n" + "vld2.8 {d18, d19}, [%0], %2 \n" + "vld2.8 {d20, d21}, [%0], %2 \n" + "vld2.8 {d22, d23}, [%0] \n" + + "vtrn.8 q1, q0 \n" + "vtrn.8 q3, q2 \n" + "vtrn.8 q9, q8 \n" + "vtrn.8 q11, q10 \n" + + "vtrn.16 q1, q3 \n" + "vtrn.16 q0, q2 \n" + "vtrn.16 q9, q11 \n" + "vtrn.16 q8, q10 \n" + + "vtrn.32 q1, q9 \n" + "vtrn.32 q0, q8 \n" + "vtrn.32 q3, q11 \n" + "vtrn.32 q2, q10 \n" + + "vrev16.8 q0, q0 \n" + "vrev16.8 q1, q1 \n" + "vrev16.8 q2, q2 \n" + "vrev16.8 q3, q3 \n" + "vrev16.8 q8, q8 \n" + "vrev16.8 q9, q9 \n" + "vrev16.8 q10, q10 \n" + "vrev16.8 q11, q11 \n" + + "mov %0, %3 \n" + + "vst1.8 {d2}, [%0], %4 \n" + "vst1.8 {d0}, [%0], %4 \n" + "vst1.8 {d6}, [%0], %4 \n" + "vst1.8 {d4}, [%0], %4 \n" + "vst1.8 {d18}, [%0], %4 \n" + "vst1.8 {d16}, [%0], %4 \n" + "vst1.8 {d22}, [%0], %4 \n" + "vst1.8 {d20}, [%0] \n" + + "mov %0, %5 \n" + + "vst1.8 {d3}, [%0], %6 \n" + "vst1.8 {d1}, [%0], %6 \n" + "vst1.8 {d7}, [%0], %6 \n" + "vst1.8 {d5}, [%0], %6 \n" + "vst1.8 {d19}, [%0], %6 \n" + "vst1.8 {d17}, [%0], %6 \n" + "vst1.8 {d23}, [%0], %6 \n" + "vst1.8 {d21}, [%0] \n" + + "add %1, #8*2 \n" // src += 8*2 + "add %3, %3, %4, lsl #3 \n" // dst_a += 8 * dst_stride_a + "add %5, %5, %6, lsl #3 \n" // dst_b += 8 * dst_stride_b + "subs %7, #8 \n" // w -= 8 + "bge 1b \n" + + // add 8 back to counter. if the result is 0 there are + // no residuals. + "adds %7, #8 \n" + "beq 4f \n" + + // some residual, so between 1 and 7 lines left to transpose + "cmp %7, #2 \n" + "blt 3f \n" + + "cmp %7, #4 \n" + "blt 2f \n" + + // TODO(frkoenig): Clean this up + // 4x8 block + "mov %0, %1 \n" + "vld1.64 {d0}, [%0], %2 \n" + "vld1.64 {d1}, [%0], %2 \n" + "vld1.64 {d2}, [%0], %2 \n" + "vld1.64 {d3}, [%0], %2 \n" + "vld1.64 {d4}, [%0], %2 \n" + "vld1.64 {d5}, [%0], %2 \n" + "vld1.64 {d6}, [%0], %2 \n" + "vld1.64 {d7}, [%0] \n" + + "vld1.8 {q15}, [%8] \n" + + "vtrn.8 q0, q1 \n" + "vtrn.8 q2, q3 \n" + + "vtbl.8 d16, {d0, d1}, d30 \n" + "vtbl.8 d17, {d0, d1}, d31 \n" + "vtbl.8 d18, {d2, d3}, d30 \n" + "vtbl.8 d19, {d2, d3}, d31 \n" + "vtbl.8 d20, {d4, d5}, d30 \n" + "vtbl.8 d21, {d4, d5}, d31 \n" + "vtbl.8 d22, {d6, d7}, d30 \n" + "vtbl.8 d23, {d6, d7}, d31 \n" + + "mov %0, %3 \n" + + "vst1.32 {d16[0]}, [%0], %4 \n" + "vst1.32 {d16[1]}, [%0], %4 \n" + "vst1.32 {d17[0]}, [%0], %4 \n" + "vst1.32 {d17[1]}, [%0], %4 \n" + + "add %0, %3, #4 \n" + "vst1.32 {d20[0]}, [%0], %4 \n" + "vst1.32 {d20[1]}, [%0], %4 \n" + "vst1.32 {d21[0]}, [%0], %4 \n" + "vst1.32 {d21[1]}, [%0] \n" + + "mov %0, %5 \n" + + "vst1.32 {d18[0]}, [%0], %6 \n" + "vst1.32 {d18[1]}, [%0], %6 \n" + "vst1.32 {d19[0]}, [%0], %6 \n" + "vst1.32 {d19[1]}, [%0], %6 \n" + + "add %0, %5, #4 \n" + "vst1.32 {d22[0]}, [%0], %6 \n" + "vst1.32 {d22[1]}, [%0], %6 \n" + "vst1.32 {d23[0]}, [%0], %6 \n" + "vst1.32 {d23[1]}, [%0] \n" + + "add %1, #4*2 \n" // src += 4 * 2 + "add %3, %3, %4, lsl #2 \n" // dst_a += 4 * + // dst_stride_a + "add %5, %5, %6, lsl #2 \n" // dst_b += 4 * + // dst_stride_b + "subs %7, #4 \n" // w -= 4 + "beq 4f \n" + + // some residual, check to see if it includes a 2x8 block, + // or less + "cmp %7, #2 \n" + "blt 3f \n" + + // 2x8 block + "2: \n" + "mov %0, %1 \n" + "vld2.16 {d0[0], d2[0]}, [%0], %2 \n" + "vld2.16 {d1[0], d3[0]}, [%0], %2 \n" + "vld2.16 {d0[1], d2[1]}, [%0], %2 \n" + "vld2.16 {d1[1], d3[1]}, [%0], %2 \n" + "vld2.16 
{d0[2], d2[2]}, [%0], %2 \n" + "vld2.16 {d1[2], d3[2]}, [%0], %2 \n" + "vld2.16 {d0[3], d2[3]}, [%0], %2 \n" + "vld2.16 {d1[3], d3[3]}, [%0] \n" + + "vtrn.8 d0, d1 \n" + "vtrn.8 d2, d3 \n" + + "mov %0, %3 \n" + + "vst1.64 {d0}, [%0], %4 \n" + "vst1.64 {d2}, [%0] \n" + + "mov %0, %5 \n" + + "vst1.64 {d1}, [%0], %6 \n" + "vst1.64 {d3}, [%0] \n" + + "add %1, #2*2 \n" // src += 2 * 2 + "add %3, %3, %4, lsl #1 \n" // dst_a += 2 * + // dst_stride_a + "add %5, %5, %6, lsl #1 \n" // dst_b += 2 * + // dst_stride_b + "subs %7, #2 \n" // w -= 2 + "beq 4f \n" + + // 1x8 block + "3: \n" + "vld2.8 {d0[0], d1[0]}, [%1], %2 \n" + "vld2.8 {d0[1], d1[1]}, [%1], %2 \n" + "vld2.8 {d0[2], d1[2]}, [%1], %2 \n" + "vld2.8 {d0[3], d1[3]}, [%1], %2 \n" + "vld2.8 {d0[4], d1[4]}, [%1], %2 \n" + "vld2.8 {d0[5], d1[5]}, [%1], %2 \n" + "vld2.8 {d0[6], d1[6]}, [%1], %2 \n" + "vld2.8 {d0[7], d1[7]}, [%1] \n" + + "vst1.64 {d0}, [%3] \n" + "vst1.64 {d1}, [%5] \n" + + "4: \n" + + : "=&r"(src_temp), // %0 + "+r"(src), // %1 + "+r"(src_stride), // %2 + "+r"(dst_a), // %3 + "+r"(dst_stride_a), // %4 + "+r"(dst_b), // %5 + "+r"(dst_stride_b), // %6 + "+r"(width) // %7 + : "r"(&kVTbl4x4TransposeDi) // %8 + : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"); +} +#endif // defined(__ARM_NEON__) && !defined(__aarch64__) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/media/libyuv/libyuv/source/rotate_neon64.cc b/media/libyuv/libyuv/source/rotate_neon64.cc new file mode 100644 index 0000000000..f469baacf6 --- /dev/null +++ b/media/libyuv/libyuv/source/rotate_neon64.cc @@ -0,0 +1,426 @@ +/* + * Copyright 2014 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/rotate_row.h" +#include "libyuv/row.h" + +#include "libyuv/basic_types.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// This module is for GCC Neon armv8 64 bit. +#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) + +static const uvec8 kVTbl4x4Transpose = {0, 4, 8, 12, 1, 5, 9, 13, + 2, 6, 10, 14, 3, 7, 11, 15}; + +void TransposeWx8_NEON(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width) { + const uint8_t* src_temp; + asm volatile( + // loops are on blocks of 8. loop will stop when + // counter gets to or below 0. starting the counter + // at w-8 allow for this + "sub %w3, %w3, #8 \n" + + // handle 8x8 blocks. 
this should be the majority of the plane + "1: \n" + "mov %0, %1 \n" + + "ld1 {v0.8b}, [%0], %5 \n" + "ld1 {v1.8b}, [%0], %5 \n" + "ld1 {v2.8b}, [%0], %5 \n" + "ld1 {v3.8b}, [%0], %5 \n" + "ld1 {v4.8b}, [%0], %5 \n" + "ld1 {v5.8b}, [%0], %5 \n" + "ld1 {v6.8b}, [%0], %5 \n" + "ld1 {v7.8b}, [%0] \n" + + "trn2 v16.8b, v0.8b, v1.8b \n" + "trn1 v17.8b, v0.8b, v1.8b \n" + "trn2 v18.8b, v2.8b, v3.8b \n" + "trn1 v19.8b, v2.8b, v3.8b \n" + "trn2 v20.8b, v4.8b, v5.8b \n" + "trn1 v21.8b, v4.8b, v5.8b \n" + "trn2 v22.8b, v6.8b, v7.8b \n" + "trn1 v23.8b, v6.8b, v7.8b \n" + + "trn2 v3.4h, v17.4h, v19.4h \n" + "trn1 v1.4h, v17.4h, v19.4h \n" + "trn2 v2.4h, v16.4h, v18.4h \n" + "trn1 v0.4h, v16.4h, v18.4h \n" + "trn2 v7.4h, v21.4h, v23.4h \n" + "trn1 v5.4h, v21.4h, v23.4h \n" + "trn2 v6.4h, v20.4h, v22.4h \n" + "trn1 v4.4h, v20.4h, v22.4h \n" + + "trn2 v21.2s, v1.2s, v5.2s \n" + "trn1 v17.2s, v1.2s, v5.2s \n" + "trn2 v20.2s, v0.2s, v4.2s \n" + "trn1 v16.2s, v0.2s, v4.2s \n" + "trn2 v23.2s, v3.2s, v7.2s \n" + "trn1 v19.2s, v3.2s, v7.2s \n" + "trn2 v22.2s, v2.2s, v6.2s \n" + "trn1 v18.2s, v2.2s, v6.2s \n" + + "mov %0, %2 \n" + + "st1 {v17.8b}, [%0], %6 \n" + "st1 {v16.8b}, [%0], %6 \n" + "st1 {v19.8b}, [%0], %6 \n" + "st1 {v18.8b}, [%0], %6 \n" + "st1 {v21.8b}, [%0], %6 \n" + "st1 {v20.8b}, [%0], %6 \n" + "st1 {v23.8b}, [%0], %6 \n" + "st1 {v22.8b}, [%0] \n" + + "add %1, %1, #8 \n" // src += 8 + "add %2, %2, %6, lsl #3 \n" // dst += 8 * dst_stride + "subs %w3, %w3, #8 \n" // w -= 8 + "b.ge 1b \n" + + // add 8 back to counter. if the result is 0 there are + // no residuals. + "adds %w3, %w3, #8 \n" + "b.eq 4f \n" + + // some residual, so between 1 and 7 lines left to transpose + "cmp %w3, #2 \n" + "b.lt 3f \n" + + "cmp %w3, #4 \n" + "b.lt 2f \n" + + // 4x8 block + "mov %0, %1 \n" + "ld1 {v0.s}[0], [%0], %5 \n" + "ld1 {v0.s}[1], [%0], %5 \n" + "ld1 {v0.s}[2], [%0], %5 \n" + "ld1 {v0.s}[3], [%0], %5 \n" + "ld1 {v1.s}[0], [%0], %5 \n" + "ld1 {v1.s}[1], [%0], %5 \n" + "ld1 {v1.s}[2], [%0], %5 \n" + "ld1 {v1.s}[3], [%0] \n" + + "mov %0, %2 \n" + + "ld1 {v2.16b}, [%4] \n" + + "tbl v3.16b, {v0.16b}, v2.16b \n" + "tbl v0.16b, {v1.16b}, v2.16b \n" + + // TODO(frkoenig): Rework shuffle above to + // write out with 4 instead of 8 writes. 
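+      // v3 now holds bytes 0-3 of the four transposed output rows
+      // (gathered from source rows 0-3); v0 holds bytes 4-7 (from
+      // source rows 4-7), written out 4 bytes per row below.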
+ "st1 {v3.s}[0], [%0], %6 \n" + "st1 {v3.s}[1], [%0], %6 \n" + "st1 {v3.s}[2], [%0], %6 \n" + "st1 {v3.s}[3], [%0] \n" + + "add %0, %2, #4 \n" + "st1 {v0.s}[0], [%0], %6 \n" + "st1 {v0.s}[1], [%0], %6 \n" + "st1 {v0.s}[2], [%0], %6 \n" + "st1 {v0.s}[3], [%0] \n" + + "add %1, %1, #4 \n" // src += 4 + "add %2, %2, %6, lsl #2 \n" // dst += 4 * dst_stride + "subs %w3, %w3, #4 \n" // w -= 4 + "b.eq 4f \n" + + // some residual, check to see if it includes a 2x8 block, + // or less + "cmp %w3, #2 \n" + "b.lt 3f \n" + + // 2x8 block + "2: \n" + "mov %0, %1 \n" + "ld1 {v0.h}[0], [%0], %5 \n" + "ld1 {v1.h}[0], [%0], %5 \n" + "ld1 {v0.h}[1], [%0], %5 \n" + "ld1 {v1.h}[1], [%0], %5 \n" + "ld1 {v0.h}[2], [%0], %5 \n" + "ld1 {v1.h}[2], [%0], %5 \n" + "ld1 {v0.h}[3], [%0], %5 \n" + "ld1 {v1.h}[3], [%0] \n" + + "trn2 v2.8b, v0.8b, v1.8b \n" + "trn1 v3.8b, v0.8b, v1.8b \n" + + "mov %0, %2 \n" + + "st1 {v3.8b}, [%0], %6 \n" + "st1 {v2.8b}, [%0] \n" + + "add %1, %1, #2 \n" // src += 2 + "add %2, %2, %6, lsl #1 \n" // dst += 2 * dst_stride + "subs %w3, %w3, #2 \n" // w -= 2 + "b.eq 4f \n" + + // 1x8 block + "3: \n" + "ld1 {v0.b}[0], [%1], %5 \n" + "ld1 {v0.b}[1], [%1], %5 \n" + "ld1 {v0.b}[2], [%1], %5 \n" + "ld1 {v0.b}[3], [%1], %5 \n" + "ld1 {v0.b}[4], [%1], %5 \n" + "ld1 {v0.b}[5], [%1], %5 \n" + "ld1 {v0.b}[6], [%1], %5 \n" + "ld1 {v0.b}[7], [%1] \n" + + "st1 {v0.8b}, [%2] \n" + + "4: \n" + + : "=&r"(src_temp), // %0 + "+r"(src), // %1 + "+r"(dst), // %2 + "+r"(width) // %3 + : "r"(&kVTbl4x4Transpose), // %4 + "r"(static_cast<ptrdiff_t>(src_stride)), // %5 + "r"(static_cast<ptrdiff_t>(dst_stride)) // %6 + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23"); +} + +static const uint8_t kVTbl4x4TransposeDi[32] = { + 0, 16, 32, 48, 2, 18, 34, 50, 4, 20, 36, 52, 6, 22, 38, 54, + 1, 17, 33, 49, 3, 19, 35, 51, 5, 21, 37, 53, 7, 23, 39, 55}; + +void TransposeUVWx8_NEON(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width) { + const uint8_t* src_temp; + asm volatile( + // loops are on blocks of 8. loop will stop when + // counter gets to or below 0. starting the counter + // at w-8 allow for this + "sub %w4, %w4, #8 \n" + + // handle 8x8 blocks. 
this should be the majority of the plane + "1: \n" + "mov %0, %1 \n" + + "ld1 {v0.16b}, [%0], %5 \n" + "ld1 {v1.16b}, [%0], %5 \n" + "ld1 {v2.16b}, [%0], %5 \n" + "ld1 {v3.16b}, [%0], %5 \n" + "ld1 {v4.16b}, [%0], %5 \n" + "ld1 {v5.16b}, [%0], %5 \n" + "ld1 {v6.16b}, [%0], %5 \n" + "ld1 {v7.16b}, [%0] \n" + + "trn1 v16.16b, v0.16b, v1.16b \n" + "trn2 v17.16b, v0.16b, v1.16b \n" + "trn1 v18.16b, v2.16b, v3.16b \n" + "trn2 v19.16b, v2.16b, v3.16b \n" + "trn1 v20.16b, v4.16b, v5.16b \n" + "trn2 v21.16b, v4.16b, v5.16b \n" + "trn1 v22.16b, v6.16b, v7.16b \n" + "trn2 v23.16b, v6.16b, v7.16b \n" + + "trn1 v0.8h, v16.8h, v18.8h \n" + "trn2 v1.8h, v16.8h, v18.8h \n" + "trn1 v2.8h, v20.8h, v22.8h \n" + "trn2 v3.8h, v20.8h, v22.8h \n" + "trn1 v4.8h, v17.8h, v19.8h \n" + "trn2 v5.8h, v17.8h, v19.8h \n" + "trn1 v6.8h, v21.8h, v23.8h \n" + "trn2 v7.8h, v21.8h, v23.8h \n" + + "trn1 v16.4s, v0.4s, v2.4s \n" + "trn2 v17.4s, v0.4s, v2.4s \n" + "trn1 v18.4s, v1.4s, v3.4s \n" + "trn2 v19.4s, v1.4s, v3.4s \n" + "trn1 v20.4s, v4.4s, v6.4s \n" + "trn2 v21.4s, v4.4s, v6.4s \n" + "trn1 v22.4s, v5.4s, v7.4s \n" + "trn2 v23.4s, v5.4s, v7.4s \n" + + "mov %0, %2 \n" + + "st1 {v16.d}[0], [%0], %6 \n" + "st1 {v18.d}[0], [%0], %6 \n" + "st1 {v17.d}[0], [%0], %6 \n" + "st1 {v19.d}[0], [%0], %6 \n" + "st1 {v16.d}[1], [%0], %6 \n" + "st1 {v18.d}[1], [%0], %6 \n" + "st1 {v17.d}[1], [%0], %6 \n" + "st1 {v19.d}[1], [%0] \n" + + "mov %0, %3 \n" + + "st1 {v20.d}[0], [%0], %7 \n" + "st1 {v22.d}[0], [%0], %7 \n" + "st1 {v21.d}[0], [%0], %7 \n" + "st1 {v23.d}[0], [%0], %7 \n" + "st1 {v20.d}[1], [%0], %7 \n" + "st1 {v22.d}[1], [%0], %7 \n" + "st1 {v21.d}[1], [%0], %7 \n" + "st1 {v23.d}[1], [%0] \n" + + "add %1, %1, #16 \n" // src += 8*2 + "add %2, %2, %6, lsl #3 \n" // dst_a += 8 * + // dst_stride_a + "add %3, %3, %7, lsl #3 \n" // dst_b += 8 * + // dst_stride_b + "subs %w4, %w4, #8 \n" // w -= 8 + "b.ge 1b \n" + + // add 8 back to counter. if the result is 0 there are + // no residuals. 
+ "adds %w4, %w4, #8 \n" + "b.eq 4f \n" + + // some residual, so between 1 and 7 lines left to transpose + "cmp %w4, #2 \n" + "b.lt 3f \n" + + "cmp %w4, #4 \n" + "b.lt 2f \n" + + // TODO(frkoenig): Clean this up + // 4x8 block + "mov %0, %1 \n" + "ld1 {v0.8b}, [%0], %5 \n" + "ld1 {v1.8b}, [%0], %5 \n" + "ld1 {v2.8b}, [%0], %5 \n" + "ld1 {v3.8b}, [%0], %5 \n" + "ld1 {v4.8b}, [%0], %5 \n" + "ld1 {v5.8b}, [%0], %5 \n" + "ld1 {v6.8b}, [%0], %5 \n" + "ld1 {v7.8b}, [%0] \n" + + "ld1 {v30.16b}, [%8], #16 \n" + "ld1 {v31.16b}, [%8] \n" + + "tbl v16.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v30.16b \n" + "tbl v17.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v31.16b \n" + "tbl v18.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v30.16b \n" + "tbl v19.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v31.16b \n" + + "mov %0, %2 \n" + + "st1 {v16.s}[0], [%0], %6 \n" + "st1 {v16.s}[1], [%0], %6 \n" + "st1 {v16.s}[2], [%0], %6 \n" + "st1 {v16.s}[3], [%0], %6 \n" + + "add %0, %2, #4 \n" + "st1 {v18.s}[0], [%0], %6 \n" + "st1 {v18.s}[1], [%0], %6 \n" + "st1 {v18.s}[2], [%0], %6 \n" + "st1 {v18.s}[3], [%0] \n" + + "mov %0, %3 \n" + + "st1 {v17.s}[0], [%0], %7 \n" + "st1 {v17.s}[1], [%0], %7 \n" + "st1 {v17.s}[2], [%0], %7 \n" + "st1 {v17.s}[3], [%0], %7 \n" + + "add %0, %3, #4 \n" + "st1 {v19.s}[0], [%0], %7 \n" + "st1 {v19.s}[1], [%0], %7 \n" + "st1 {v19.s}[2], [%0], %7 \n" + "st1 {v19.s}[3], [%0] \n" + + "add %1, %1, #8 \n" // src += 4 * 2 + "add %2, %2, %6, lsl #2 \n" // dst_a += 4 * + // dst_stride_a + "add %3, %3, %7, lsl #2 \n" // dst_b += 4 * + // dst_stride_b + "subs %w4, %w4, #4 \n" // w -= 4 + "b.eq 4f \n" + + // some residual, check to see if it includes a 2x8 block, + // or less + "cmp %w4, #2 \n" + "b.lt 3f \n" + + // 2x8 block + "2: \n" + "mov %0, %1 \n" + "ld2 {v0.h, v1.h}[0], [%0], %5 \n" + "ld2 {v2.h, v3.h}[0], [%0], %5 \n" + "ld2 {v0.h, v1.h}[1], [%0], %5 \n" + "ld2 {v2.h, v3.h}[1], [%0], %5 \n" + "ld2 {v0.h, v1.h}[2], [%0], %5 \n" + "ld2 {v2.h, v3.h}[2], [%0], %5 \n" + "ld2 {v0.h, v1.h}[3], [%0], %5 \n" + "ld2 {v2.h, v3.h}[3], [%0] \n" + + "trn1 v4.8b, v0.8b, v2.8b \n" + "trn2 v5.8b, v0.8b, v2.8b \n" + "trn1 v6.8b, v1.8b, v3.8b \n" + "trn2 v7.8b, v1.8b, v3.8b \n" + + "mov %0, %2 \n" + + "st1 {v4.d}[0], [%0], %6 \n" + "st1 {v6.d}[0], [%0] \n" + + "mov %0, %3 \n" + + "st1 {v5.d}[0], [%0], %7 \n" + "st1 {v7.d}[0], [%0] \n" + + "add %1, %1, #4 \n" // src += 2 * 2 + "add %2, %2, %6, lsl #1 \n" // dst_a += 2 * + // dst_stride_a + "add %3, %3, %7, lsl #1 \n" // dst_b += 2 * + // dst_stride_b + "subs %w4, %w4, #2 \n" // w -= 2 + "b.eq 4f \n" + + // 1x8 block + "3: \n" + "ld2 {v0.b, v1.b}[0], [%1], %5 \n" + "ld2 {v0.b, v1.b}[1], [%1], %5 \n" + "ld2 {v0.b, v1.b}[2], [%1], %5 \n" + "ld2 {v0.b, v1.b}[3], [%1], %5 \n" + "ld2 {v0.b, v1.b}[4], [%1], %5 \n" + "ld2 {v0.b, v1.b}[5], [%1], %5 \n" + "ld2 {v0.b, v1.b}[6], [%1], %5 \n" + "ld2 {v0.b, v1.b}[7], [%1] \n" + + "st1 {v0.d}[0], [%2] \n" + "st1 {v1.d}[0], [%3] \n" + + "4: \n" + + : "=&r"(src_temp), // %0 + "+r"(src), // %1 + "+r"(dst_a), // %2 + "+r"(dst_b), // %3 + "+r"(width) // %4 + : "r"(static_cast<ptrdiff_t>(src_stride)), // %5 + "r"(static_cast<ptrdiff_t>(dst_stride_a)), // %6 + "r"(static_cast<ptrdiff_t>(dst_stride_b)), // %7 + "r"(&kVTbl4x4TransposeDi) // %8 + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v30", "v31"); +} +#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git 
a/media/libyuv/libyuv/source/rotate_win.cc b/media/libyuv/libyuv/source/rotate_win.cc new file mode 100644 index 0000000000..e887dd525c --- /dev/null +++ b/media/libyuv/libyuv/source/rotate_win.cc @@ -0,0 +1,252 @@ +/* + * Copyright 2013 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/rotate_row.h" +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// This module is for 32 bit Visual C x86 and clangcl +#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) + +__declspec(naked) void TransposeWx8_SSSE3(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width) { + __asm { + push edi + push esi + push ebp + mov eax, [esp + 12 + 4] // src + mov edi, [esp + 12 + 8] // src_stride + mov edx, [esp + 12 + 12] // dst + mov esi, [esp + 12 + 16] // dst_stride + mov ecx, [esp + 12 + 20] // width + + // Read in the data from the source pointer. + // First round of bit swap. + align 4 + convertloop: + movq xmm0, qword ptr [eax] + lea ebp, [eax + 8] + movq xmm1, qword ptr [eax + edi] + lea eax, [eax + 2 * edi] + punpcklbw xmm0, xmm1 + movq xmm2, qword ptr [eax] + movdqa xmm1, xmm0 + palignr xmm1, xmm1, 8 + movq xmm3, qword ptr [eax + edi] + lea eax, [eax + 2 * edi] + punpcklbw xmm2, xmm3 + movdqa xmm3, xmm2 + movq xmm4, qword ptr [eax] + palignr xmm3, xmm3, 8 + movq xmm5, qword ptr [eax + edi] + punpcklbw xmm4, xmm5 + lea eax, [eax + 2 * edi] + movdqa xmm5, xmm4 + movq xmm6, qword ptr [eax] + palignr xmm5, xmm5, 8 + movq xmm7, qword ptr [eax + edi] + punpcklbw xmm6, xmm7 + mov eax, ebp + movdqa xmm7, xmm6 + palignr xmm7, xmm7, 8 + // Second round of bit swap. + punpcklwd xmm0, xmm2 + punpcklwd xmm1, xmm3 + movdqa xmm2, xmm0 + movdqa xmm3, xmm1 + palignr xmm2, xmm2, 8 + palignr xmm3, xmm3, 8 + punpcklwd xmm4, xmm6 + punpcklwd xmm5, xmm7 + movdqa xmm6, xmm4 + movdqa xmm7, xmm5 + palignr xmm6, xmm6, 8 + palignr xmm7, xmm7, 8 + // Third round of bit swap. + // Write to the destination pointer. 
+ punpckldq xmm0, xmm4 + movq qword ptr [edx], xmm0 + movdqa xmm4, xmm0 + palignr xmm4, xmm4, 8 + movq qword ptr [edx + esi], xmm4 + lea edx, [edx + 2 * esi] + punpckldq xmm2, xmm6 + movdqa xmm6, xmm2 + palignr xmm6, xmm6, 8 + movq qword ptr [edx], xmm2 + punpckldq xmm1, xmm5 + movq qword ptr [edx + esi], xmm6 + lea edx, [edx + 2 * esi] + movdqa xmm5, xmm1 + movq qword ptr [edx], xmm1 + palignr xmm5, xmm5, 8 + punpckldq xmm3, xmm7 + movq qword ptr [edx + esi], xmm5 + lea edx, [edx + 2 * esi] + movq qword ptr [edx], xmm3 + movdqa xmm7, xmm3 + palignr xmm7, xmm7, 8 + sub ecx, 8 + movq qword ptr [edx + esi], xmm7 + lea edx, [edx + 2 * esi] + jg convertloop + + pop ebp + pop esi + pop edi + ret + } +} + +__declspec(naked) void TransposeUVWx8_SSE2(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int w) { + __asm { + push ebx + push esi + push edi + push ebp + mov eax, [esp + 16 + 4] // src + mov edi, [esp + 16 + 8] // src_stride + mov edx, [esp + 16 + 12] // dst_a + mov esi, [esp + 16 + 16] // dst_stride_a + mov ebx, [esp + 16 + 20] // dst_b + mov ebp, [esp + 16 + 24] // dst_stride_b + mov ecx, esp + sub esp, 4 + 16 + and esp, ~15 + mov [esp + 16], ecx + mov ecx, [ecx + 16 + 28] // w + + align 4 + // Read in the data from the source pointer. + // First round of bit swap. + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + edi] + lea eax, [eax + 2 * edi] + movdqa xmm7, xmm0 // use xmm7 as temp register. + punpcklbw xmm0, xmm1 + punpckhbw xmm7, xmm1 + movdqa xmm1, xmm7 + movdqu xmm2, [eax] + movdqu xmm3, [eax + edi] + lea eax, [eax + 2 * edi] + movdqa xmm7, xmm2 + punpcklbw xmm2, xmm3 + punpckhbw xmm7, xmm3 + movdqa xmm3, xmm7 + movdqu xmm4, [eax] + movdqu xmm5, [eax + edi] + lea eax, [eax + 2 * edi] + movdqa xmm7, xmm4 + punpcklbw xmm4, xmm5 + punpckhbw xmm7, xmm5 + movdqa xmm5, xmm7 + movdqu xmm6, [eax] + movdqu xmm7, [eax + edi] + lea eax, [eax + 2 * edi] + movdqu [esp], xmm5 // backup xmm5 + neg edi + movdqa xmm5, xmm6 // use xmm5 as temp register. + punpcklbw xmm6, xmm7 + punpckhbw xmm5, xmm7 + movdqa xmm7, xmm5 + lea eax, [eax + 8 * edi + 16] + neg edi + // Second round of bit swap. + movdqa xmm5, xmm0 + punpcklwd xmm0, xmm2 + punpckhwd xmm5, xmm2 + movdqa xmm2, xmm5 + movdqa xmm5, xmm1 + punpcklwd xmm1, xmm3 + punpckhwd xmm5, xmm3 + movdqa xmm3, xmm5 + movdqa xmm5, xmm4 + punpcklwd xmm4, xmm6 + punpckhwd xmm5, xmm6 + movdqa xmm6, xmm5 + movdqu xmm5, [esp] // restore xmm5 + movdqu [esp], xmm6 // backup xmm6 + movdqa xmm6, xmm5 // use xmm6 as temp register. + punpcklwd xmm5, xmm7 + punpckhwd xmm6, xmm7 + movdqa xmm7, xmm6 + + // Third round of bit swap. + // Write to the destination pointer. + movdqa xmm6, xmm0 + punpckldq xmm0, xmm4 + punpckhdq xmm6, xmm4 + movdqa xmm4, xmm6 + movdqu xmm6, [esp] // restore xmm6 + movlpd qword ptr [edx], xmm0 + movhpd qword ptr [ebx], xmm0 + movlpd qword ptr [edx + esi], xmm4 + lea edx, [edx + 2 * esi] + movhpd qword ptr [ebx + ebp], xmm4 + lea ebx, [ebx + 2 * ebp] + movdqa xmm0, xmm2 // use xmm0 as the temp register. + punpckldq xmm2, xmm6 + movlpd qword ptr [edx], xmm2 + movhpd qword ptr [ebx], xmm2 + punpckhdq xmm0, xmm6 + movlpd qword ptr [edx + esi], xmm0 + lea edx, [edx + 2 * esi] + movhpd qword ptr [ebx + ebp], xmm0 + lea ebx, [ebx + 2 * ebp] + movdqa xmm0, xmm1 // use xmm0 as the temp register. 
+    punpckldq  xmm1, xmm5
+    movlpd     qword ptr [edx], xmm1
+    movhpd     qword ptr [ebx], xmm1
+    punpckhdq  xmm0, xmm5
+    movlpd     qword ptr [edx + esi], xmm0
+    lea        edx, [edx + 2 * esi]
+    movhpd     qword ptr [ebx + ebp], xmm0
+    lea        ebx, [ebx + 2 * ebp]
+    movdqa     xmm0, xmm3  // use xmm0 as the temp register.
+    punpckldq  xmm3, xmm7
+    movlpd     qword ptr [edx], xmm3
+    movhpd     qword ptr [ebx], xmm3
+    punpckhdq  xmm0, xmm7
+    sub        ecx, 8
+    movlpd     qword ptr [edx + esi], xmm0
+    lea        edx, [edx + 2 * esi]
+    movhpd     qword ptr [ebx + ebp], xmm0
+    lea        ebx, [ebx + 2 * ebp]
+    jg         convertloop
+
+    mov        esp, [esp + 16]
+    pop        ebp
+    pop        edi
+    pop        esi
+    pop        ebx
+    ret
+  }
+}
+
+#endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/media/libyuv/libyuv/source/row_any.cc b/media/libyuv/libyuv/source/row_any.cc
new file mode 100644
index 0000000000..cc5914dd29
--- /dev/null
+++ b/media/libyuv/libyuv/source/row_any.cc
@@ -0,0 +1,1189 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+#include <string.h>  // For memset.
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// memset for temp is meant to clear the source buffer (not dest) so that
+// SIMD that reads a full multiple of 16 bytes will not trigger msan errors.
+// memset is not needed for production, as the garbage values are processed but
+// not used, although there may be edge cases for subsampling.
+// The size of the buffer is based on the largest read, which can be inferred
+// by the source type (e.g. ARGB) and the mask (last parameter), or by
+// examining the source code for how much the source pointers are advanced.
+
+// Subsampled source needs to be increased by 1 if not even.
+#define SS(width, shift) (((width) + (1 << (shift)) - 1) >> (shift)) + +// Any 4 planes to 1 with yuvconstants +#define ANY41C(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \ + void NAMEANY(const uint8_t* y_buf, const uint8_t* u_buf, \ + const uint8_t* v_buf, const uint8_t* a_buf, uint8_t* dst_ptr, \ + const struct YuvConstants* yuvconstants, int width) { \ + SIMD_ALIGNED(uint8_t temp[64 * 5]); \ + memset(temp, 0, 64 * 4); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(y_buf, u_buf, v_buf, a_buf, dst_ptr, yuvconstants, n); \ + } \ + memcpy(temp, y_buf + n, r); \ + memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ + memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ + memcpy(temp + 192, a_buf + n, r); \ + ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, temp + 256, \ + yuvconstants, MASK + 1); \ + memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 256, \ + SS(r, DUVSHIFT) * BPP); \ + } + +#ifdef HAS_I422ALPHATOARGBROW_SSSE3 +ANY41C(I422AlphaToARGBRow_Any_SSSE3, I422AlphaToARGBRow_SSSE3, 1, 0, 4, 7) +#endif +#ifdef HAS_I422ALPHATOARGBROW_AVX2 +ANY41C(I422AlphaToARGBRow_Any_AVX2, I422AlphaToARGBRow_AVX2, 1, 0, 4, 15) +#endif +#ifdef HAS_I422ALPHATOARGBROW_NEON +ANY41C(I422AlphaToARGBRow_Any_NEON, I422AlphaToARGBRow_NEON, 1, 0, 4, 7) +#endif +#ifdef HAS_I422ALPHATOARGBROW_MSA +ANY41C(I422AlphaToARGBRow_Any_MSA, I422AlphaToARGBRow_MSA, 1, 0, 4, 7) +#endif +#undef ANY41C + +// Any 3 planes to 1. +#define ANY31(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \ + void NAMEANY(const uint8_t* y_buf, const uint8_t* u_buf, \ + const uint8_t* v_buf, uint8_t* dst_ptr, int width) { \ + SIMD_ALIGNED(uint8_t temp[64 * 4]); \ + memset(temp, 0, 64 * 3); /* for YUY2 and msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, n); \ + } \ + memcpy(temp, y_buf + n, r); \ + memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ + memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ + ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, MASK + 1); \ + memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 192, \ + SS(r, DUVSHIFT) * BPP); \ + } + +// Merge functions. +#ifdef HAS_MERGERGBROW_SSSE3 +ANY31(MergeRGBRow_Any_SSSE3, MergeRGBRow_SSSE3, 0, 0, 3, 15) +#endif +#ifdef HAS_MERGERGBROW_NEON +ANY31(MergeRGBRow_Any_NEON, MergeRGBRow_NEON, 0, 0, 3, 15) +#endif +#ifdef HAS_I422TOYUY2ROW_SSE2 +ANY31(I422ToYUY2Row_Any_SSE2, I422ToYUY2Row_SSE2, 1, 1, 4, 15) +ANY31(I422ToUYVYRow_Any_SSE2, I422ToUYVYRow_SSE2, 1, 1, 4, 15) +#endif +#ifdef HAS_I422TOYUY2ROW_AVX2 +ANY31(I422ToYUY2Row_Any_AVX2, I422ToYUY2Row_AVX2, 1, 1, 4, 31) +ANY31(I422ToUYVYRow_Any_AVX2, I422ToUYVYRow_AVX2, 1, 1, 4, 31) +#endif +#ifdef HAS_I422TOYUY2ROW_NEON +ANY31(I422ToYUY2Row_Any_NEON, I422ToYUY2Row_NEON, 1, 1, 4, 15) +#endif +#ifdef HAS_I422TOYUY2ROW_MSA +ANY31(I422ToYUY2Row_Any_MSA, I422ToYUY2Row_MSA, 1, 1, 4, 31) +#endif +#ifdef HAS_I422TOUYVYROW_NEON +ANY31(I422ToUYVYRow_Any_NEON, I422ToUYVYRow_NEON, 1, 1, 4, 15) +#endif +#ifdef HAS_I422TOUYVYROW_MSA +ANY31(I422ToUYVYRow_Any_MSA, I422ToUYVYRow_MSA, 1, 1, 4, 31) +#endif +#ifdef HAS_BLENDPLANEROW_AVX2 +ANY31(BlendPlaneRow_Any_AVX2, BlendPlaneRow_AVX2, 0, 0, 1, 31) +#endif +#ifdef HAS_BLENDPLANEROW_SSSE3 +ANY31(BlendPlaneRow_Any_SSSE3, BlendPlaneRow_SSSE3, 0, 0, 1, 7) +#endif +#undef ANY31 + +// Note that odd width replication includes 444 due to implementation +// on arm that subsamples 444 to 422 internally. 
+// Any 3 planes to 1 with yuvconstants +#define ANY31C(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \ + void NAMEANY(const uint8_t* y_buf, const uint8_t* u_buf, \ + const uint8_t* v_buf, uint8_t* dst_ptr, \ + const struct YuvConstants* yuvconstants, int width) { \ + SIMD_ALIGNED(uint8_t temp[64 * 4]); \ + memset(temp, 0, 64 * 3); /* for YUY2 and msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, yuvconstants, n); \ + } \ + memcpy(temp, y_buf + n, r); \ + memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ + memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ + if (width & 1) { \ + temp[64 + SS(r, UVSHIFT)] = temp[64 + SS(r, UVSHIFT) - 1]; \ + temp[128 + SS(r, UVSHIFT)] = temp[128 + SS(r, UVSHIFT) - 1]; \ + } \ + ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, yuvconstants, MASK + 1); \ + memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 192, \ + SS(r, DUVSHIFT) * BPP); \ + } + +#ifdef HAS_I422TOARGBROW_SSSE3 +ANY31C(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_SSSE3, 1, 0, 4, 7) +#endif +#ifdef HAS_I422TOAR30ROW_SSSE3 +ANY31C(I422ToAR30Row_Any_SSSE3, I422ToAR30Row_SSSE3, 1, 0, 4, 7) +#endif +#ifdef HAS_I422TOAR30ROW_AVX2 +ANY31C(I422ToAR30Row_Any_AVX2, I422ToAR30Row_AVX2, 1, 0, 4, 15) +#endif +#ifdef HAS_I444TOARGBROW_SSSE3 +ANY31C(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_SSSE3, 0, 0, 4, 7) +ANY31C(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_SSSE3, 1, 0, 4, 7) +ANY31C(I422ToARGB4444Row_Any_SSSE3, I422ToARGB4444Row_SSSE3, 1, 0, 2, 7) +ANY31C(I422ToARGB1555Row_Any_SSSE3, I422ToARGB1555Row_SSSE3, 1, 0, 2, 7) +ANY31C(I422ToRGB565Row_Any_SSSE3, I422ToRGB565Row_SSSE3, 1, 0, 2, 7) +ANY31C(I422ToRGB24Row_Any_SSSE3, I422ToRGB24Row_SSSE3, 1, 0, 3, 7) +#endif // HAS_I444TOARGBROW_SSSE3 +#ifdef HAS_I422TORGB24ROW_AVX2 +ANY31C(I422ToRGB24Row_Any_AVX2, I422ToRGB24Row_AVX2, 1, 0, 3, 15) +#endif +#ifdef HAS_I422TOARGBROW_AVX2 +ANY31C(I422ToARGBRow_Any_AVX2, I422ToARGBRow_AVX2, 1, 0, 4, 15) +#endif +#ifdef HAS_I422TORGBAROW_AVX2 +ANY31C(I422ToRGBARow_Any_AVX2, I422ToRGBARow_AVX2, 1, 0, 4, 15) +#endif +#ifdef HAS_I444TOARGBROW_AVX2 +ANY31C(I444ToARGBRow_Any_AVX2, I444ToARGBRow_AVX2, 0, 0, 4, 15) +#endif +#ifdef HAS_I422TOARGB4444ROW_AVX2 +ANY31C(I422ToARGB4444Row_Any_AVX2, I422ToARGB4444Row_AVX2, 1, 0, 2, 15) +#endif +#ifdef HAS_I422TOARGB1555ROW_AVX2 +ANY31C(I422ToARGB1555Row_Any_AVX2, I422ToARGB1555Row_AVX2, 1, 0, 2, 15) +#endif +#ifdef HAS_I422TORGB565ROW_AVX2 +ANY31C(I422ToRGB565Row_Any_AVX2, I422ToRGB565Row_AVX2, 1, 0, 2, 15) +#endif +#ifdef HAS_I422TOARGBROW_NEON +ANY31C(I444ToARGBRow_Any_NEON, I444ToARGBRow_NEON, 0, 0, 4, 7) +ANY31C(I422ToARGBRow_Any_NEON, I422ToARGBRow_NEON, 1, 0, 4, 7) +ANY31C(I422ToRGBARow_Any_NEON, I422ToRGBARow_NEON, 1, 0, 4, 7) +ANY31C(I422ToRGB24Row_Any_NEON, I422ToRGB24Row_NEON, 1, 0, 3, 7) +ANY31C(I422ToARGB4444Row_Any_NEON, I422ToARGB4444Row_NEON, 1, 0, 2, 7) +ANY31C(I422ToARGB1555Row_Any_NEON, I422ToARGB1555Row_NEON, 1, 0, 2, 7) +ANY31C(I422ToRGB565Row_Any_NEON, I422ToRGB565Row_NEON, 1, 0, 2, 7) +#endif +#ifdef HAS_I422TOARGBROW_MSA +ANY31C(I444ToARGBRow_Any_MSA, I444ToARGBRow_MSA, 0, 0, 4, 7) +ANY31C(I422ToARGBRow_Any_MSA, I422ToARGBRow_MSA, 1, 0, 4, 7) +ANY31C(I422ToRGBARow_Any_MSA, I422ToRGBARow_MSA, 1, 0, 4, 7) +ANY31C(I422ToRGB24Row_Any_MSA, I422ToRGB24Row_MSA, 1, 0, 3, 15) +ANY31C(I422ToARGB4444Row_Any_MSA, I422ToARGB4444Row_MSA, 1, 0, 2, 7) +ANY31C(I422ToARGB1555Row_Any_MSA, I422ToARGB1555Row_MSA, 1, 0, 2, 7) +ANY31C(I422ToRGB565Row_Any_MSA, I422ToRGB565Row_MSA, 1, 0, 2, 7) +#endif 
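For readers unfamiliar with these macros, here is a minimal, self-contained sketch (C++) of the remainder handling that an ANY31-style wrapper expands to. All names are hypothetical, the kernel is a trivial gray stand-in rather than a real YUV-to-ARGB conversion, and the yuvconstants argument and the odd-width chroma replication performed by ANY31C above are omitted for brevity.

#include <stdint.h>
#include <string.h>

// SS() rounds a subsampled width up, e.g. SS(7, 1) == 4 chroma samples.
#define SS(width, shift) (((width) + (1 << (shift)) - 1) >> (shift))

// Trivial stand-in for an 8-pixel SIMD kernel such as I422ToARGBRow_SSSE3:
// it only expands luma to gray ARGB so the sketch stays self-contained.
static void I422ToARGBRow_Kernel8(const uint8_t* y_buf, const uint8_t* u_buf,
                                  const uint8_t* v_buf, uint8_t* dst_argb,
                                  int width) {
  (void)u_buf;
  (void)v_buf;
  for (int x = 0; x < width; ++x) {
    dst_argb[4 * x + 0] = y_buf[x];
    dst_argb[4 * x + 1] = y_buf[x];
    dst_argb[4 * x + 2] = y_buf[x];
    dst_argb[4 * x + 3] = 255;
  }
}

// What an ANY31-style wrapper does for an arbitrary width.
static void I422ToARGBRow_Any_Sketch(const uint8_t* y_buf,
                                     const uint8_t* u_buf,
                                     const uint8_t* v_buf, uint8_t* dst_argb,
                                     int width) {
  const int kMask = 7;         // kernel step is kMask + 1 = 8 pixels.
  uint8_t temp[64 * 4] = {0};  // zeroed so full-vector reads are defined.
  int n = width & ~kMask;      // largest multiple of 8 pixels.
  int r = width & kMask;       // 0..7 leftover pixels.
  if (n > 0) {
    I422ToARGBRow_Kernel8(y_buf, u_buf, v_buf, dst_argb, n);
  }
  // Stage the leftovers so the kernel can still consume a full 8 pixels;
  // Y is full resolution, U and V are half width (4:2:2), hence SS(r, 1).
  memcpy(temp, y_buf + n, r);
  memcpy(temp + 64, u_buf + (n >> 1), SS(r, 1));
  memcpy(temp + 128, v_buf + (n >> 1), SS(r, 1));
  I422ToARGBRow_Kernel8(temp, temp + 64, temp + 128, temp + 192, kMask + 1);
  memcpy(dst_argb + n * 4, temp + 192, r * 4);  // keep only the real pixels.
}

Called with width = 100, the sketch runs the kernel once over the first 96 pixels in place and once more over 8 staged pixels, of which only the 4 real leftovers are copied back to the destination.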
+#undef ANY31C + +// Any 3 planes of 16 bit to 1 with yuvconstants +// TODO(fbarchard): consider sharing this code with ANY31C +#define ANY31CT(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, T, SBPP, BPP, MASK) \ + void NAMEANY(const T* y_buf, const T* u_buf, const T* v_buf, \ + uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, \ + int width) { \ + SIMD_ALIGNED(T temp[16 * 3]); \ + SIMD_ALIGNED(uint8_t out[64]); \ + memset(temp, 0, 16 * 3 * SBPP); /* for YUY2 and msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, yuvconstants, n); \ + } \ + memcpy(temp, y_buf + n, r * SBPP); \ + memcpy(temp + 16, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT) * SBPP); \ + memcpy(temp + 32, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT) * SBPP); \ + ANY_SIMD(temp, temp + 16, temp + 32, out, yuvconstants, MASK + 1); \ + memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, out, SS(r, DUVSHIFT) * BPP); \ + } + +#ifdef HAS_I210TOAR30ROW_SSSE3 +ANY31CT(I210ToAR30Row_Any_SSSE3, I210ToAR30Row_SSSE3, 1, 0, uint16_t, 2, 4, 7) +#endif +#ifdef HAS_I210TOARGBROW_SSSE3 +ANY31CT(I210ToARGBRow_Any_SSSE3, I210ToARGBRow_SSSE3, 1, 0, uint16_t, 2, 4, 7) +#endif +#ifdef HAS_I210TOARGBROW_AVX2 +ANY31CT(I210ToARGBRow_Any_AVX2, I210ToARGBRow_AVX2, 1, 0, uint16_t, 2, 4, 15) +#endif +#ifdef HAS_I210TOAR30ROW_AVX2 +ANY31CT(I210ToAR30Row_Any_AVX2, I210ToAR30Row_AVX2, 1, 0, uint16_t, 2, 4, 15) +#endif +#undef ANY31CT + +// Any 2 planes to 1. +#define ANY21(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK) \ + void NAMEANY(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, \ + int width) { \ + SIMD_ALIGNED(uint8_t temp[64 * 3]); \ + memset(temp, 0, 64 * 2); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(y_buf, uv_buf, dst_ptr, n); \ + } \ + memcpy(temp, y_buf + n * SBPP, r * SBPP); \ + memcpy(temp + 64, uv_buf + (n >> UVSHIFT) * SBPP2, \ + SS(r, UVSHIFT) * SBPP2); \ + ANY_SIMD(temp, temp + 64, temp + 128, MASK + 1); \ + memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \ + } + +// Merge functions. +#ifdef HAS_MERGEUVROW_SSE2 +ANY21(MergeUVRow_Any_SSE2, MergeUVRow_SSE2, 0, 1, 1, 2, 15) +#endif +#ifdef HAS_MERGEUVROW_AVX2 +ANY21(MergeUVRow_Any_AVX2, MergeUVRow_AVX2, 0, 1, 1, 2, 31) +#endif +#ifdef HAS_MERGEUVROW_NEON +ANY21(MergeUVRow_Any_NEON, MergeUVRow_NEON, 0, 1, 1, 2, 15) +#endif +#ifdef HAS_MERGEUVROW_MSA +ANY21(MergeUVRow_Any_MSA, MergeUVRow_MSA, 0, 1, 1, 2, 15) +#endif + +// Math functions. 
+#ifdef HAS_ARGBMULTIPLYROW_SSE2 +ANY21(ARGBMultiplyRow_Any_SSE2, ARGBMultiplyRow_SSE2, 0, 4, 4, 4, 3) +#endif +#ifdef HAS_ARGBADDROW_SSE2 +ANY21(ARGBAddRow_Any_SSE2, ARGBAddRow_SSE2, 0, 4, 4, 4, 3) +#endif +#ifdef HAS_ARGBSUBTRACTROW_SSE2 +ANY21(ARGBSubtractRow_Any_SSE2, ARGBSubtractRow_SSE2, 0, 4, 4, 4, 3) +#endif +#ifdef HAS_ARGBMULTIPLYROW_AVX2 +ANY21(ARGBMultiplyRow_Any_AVX2, ARGBMultiplyRow_AVX2, 0, 4, 4, 4, 7) +#endif +#ifdef HAS_ARGBADDROW_AVX2 +ANY21(ARGBAddRow_Any_AVX2, ARGBAddRow_AVX2, 0, 4, 4, 4, 7) +#endif +#ifdef HAS_ARGBSUBTRACTROW_AVX2 +ANY21(ARGBSubtractRow_Any_AVX2, ARGBSubtractRow_AVX2, 0, 4, 4, 4, 7) +#endif +#ifdef HAS_ARGBMULTIPLYROW_NEON +ANY21(ARGBMultiplyRow_Any_NEON, ARGBMultiplyRow_NEON, 0, 4, 4, 4, 7) +#endif +#ifdef HAS_ARGBADDROW_NEON +ANY21(ARGBAddRow_Any_NEON, ARGBAddRow_NEON, 0, 4, 4, 4, 7) +#endif +#ifdef HAS_ARGBSUBTRACTROW_NEON +ANY21(ARGBSubtractRow_Any_NEON, ARGBSubtractRow_NEON, 0, 4, 4, 4, 7) +#endif +#ifdef HAS_ARGBMULTIPLYROW_MSA +ANY21(ARGBMultiplyRow_Any_MSA, ARGBMultiplyRow_MSA, 0, 4, 4, 4, 3) +#endif +#ifdef HAS_ARGBADDROW_MSA +ANY21(ARGBAddRow_Any_MSA, ARGBAddRow_MSA, 0, 4, 4, 4, 7) +#endif +#ifdef HAS_ARGBSUBTRACTROW_MSA +ANY21(ARGBSubtractRow_Any_MSA, ARGBSubtractRow_MSA, 0, 4, 4, 4, 7) +#endif +#ifdef HAS_SOBELROW_SSE2 +ANY21(SobelRow_Any_SSE2, SobelRow_SSE2, 0, 1, 1, 4, 15) +#endif +#ifdef HAS_SOBELROW_NEON +ANY21(SobelRow_Any_NEON, SobelRow_NEON, 0, 1, 1, 4, 7) +#endif +#ifdef HAS_SOBELROW_MSA +ANY21(SobelRow_Any_MSA, SobelRow_MSA, 0, 1, 1, 4, 15) +#endif +#ifdef HAS_SOBELTOPLANEROW_SSE2 +ANY21(SobelToPlaneRow_Any_SSE2, SobelToPlaneRow_SSE2, 0, 1, 1, 1, 15) +#endif +#ifdef HAS_SOBELTOPLANEROW_NEON +ANY21(SobelToPlaneRow_Any_NEON, SobelToPlaneRow_NEON, 0, 1, 1, 1, 15) +#endif +#ifdef HAS_SOBELTOPLANEROW_MSA +ANY21(SobelToPlaneRow_Any_MSA, SobelToPlaneRow_MSA, 0, 1, 1, 1, 31) +#endif +#ifdef HAS_SOBELXYROW_SSE2 +ANY21(SobelXYRow_Any_SSE2, SobelXYRow_SSE2, 0, 1, 1, 4, 15) +#endif +#ifdef HAS_SOBELXYROW_NEON +ANY21(SobelXYRow_Any_NEON, SobelXYRow_NEON, 0, 1, 1, 4, 7) +#endif +#ifdef HAS_SOBELXYROW_MSA +ANY21(SobelXYRow_Any_MSA, SobelXYRow_MSA, 0, 1, 1, 4, 15) +#endif +#undef ANY21 + +// Any 2 planes to 1 with yuvconstants +#define ANY21C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK) \ + void NAMEANY(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, \ + const struct YuvConstants* yuvconstants, int width) { \ + SIMD_ALIGNED(uint8_t temp[64 * 3]); \ + memset(temp, 0, 64 * 2); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(y_buf, uv_buf, dst_ptr, yuvconstants, n); \ + } \ + memcpy(temp, y_buf + n * SBPP, r * SBPP); \ + memcpy(temp + 64, uv_buf + (n >> UVSHIFT) * SBPP2, \ + SS(r, UVSHIFT) * SBPP2); \ + ANY_SIMD(temp, temp + 64, temp + 128, yuvconstants, MASK + 1); \ + memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \ + } + +// Biplanar to RGB. 
+#ifdef HAS_NV12TOARGBROW_SSSE3 +ANY21C(NV12ToARGBRow_Any_SSSE3, NV12ToARGBRow_SSSE3, 1, 1, 2, 4, 7) +#endif +#ifdef HAS_NV12TOARGBROW_AVX2 +ANY21C(NV12ToARGBRow_Any_AVX2, NV12ToARGBRow_AVX2, 1, 1, 2, 4, 15) +#endif +#ifdef HAS_NV12TOARGBROW_NEON +ANY21C(NV12ToARGBRow_Any_NEON, NV12ToARGBRow_NEON, 1, 1, 2, 4, 7) +#endif +#ifdef HAS_NV12TOARGBROW_MSA +ANY21C(NV12ToARGBRow_Any_MSA, NV12ToARGBRow_MSA, 1, 1, 2, 4, 7) +#endif +#ifdef HAS_NV21TOARGBROW_SSSE3 +ANY21C(NV21ToARGBRow_Any_SSSE3, NV21ToARGBRow_SSSE3, 1, 1, 2, 4, 7) +#endif +#ifdef HAS_NV21TOARGBROW_AVX2 +ANY21C(NV21ToARGBRow_Any_AVX2, NV21ToARGBRow_AVX2, 1, 1, 2, 4, 15) +#endif +#ifdef HAS_NV21TOARGBROW_NEON +ANY21C(NV21ToARGBRow_Any_NEON, NV21ToARGBRow_NEON, 1, 1, 2, 4, 7) +#endif +#ifdef HAS_NV21TOARGBROW_MSA +ANY21C(NV21ToARGBRow_Any_MSA, NV21ToARGBRow_MSA, 1, 1, 2, 4, 7) +#endif +#ifdef HAS_NV12TORGB24ROW_NEON +ANY21C(NV12ToRGB24Row_Any_NEON, NV12ToRGB24Row_NEON, 1, 1, 2, 3, 7) +#endif +#ifdef HAS_NV21TORGB24ROW_NEON +ANY21C(NV21ToRGB24Row_Any_NEON, NV21ToRGB24Row_NEON, 1, 1, 2, 3, 7) +#endif +#ifdef HAS_NV12TORGB565ROW_SSSE3 +ANY21C(NV12ToRGB565Row_Any_SSSE3, NV12ToRGB565Row_SSSE3, 1, 1, 2, 2, 7) +#endif +#ifdef HAS_NV12TORGB565ROW_AVX2 +ANY21C(NV12ToRGB565Row_Any_AVX2, NV12ToRGB565Row_AVX2, 1, 1, 2, 2, 15) +#endif +#ifdef HAS_NV12TORGB565ROW_NEON +ANY21C(NV12ToRGB565Row_Any_NEON, NV12ToRGB565Row_NEON, 1, 1, 2, 2, 7) +#endif +#ifdef HAS_NV12TORGB565ROW_MSA +ANY21C(NV12ToRGB565Row_Any_MSA, NV12ToRGB565Row_MSA, 1, 1, 2, 2, 7) +#endif +#undef ANY21C + +// Any 1 to 1. +#define ANY11(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \ + void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width) { \ + SIMD_ALIGNED(uint8_t temp[128 * 2]); \ + memset(temp, 0, 128); /* for YUY2 and msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, dst_ptr, n); \ + } \ + memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \ + ANY_SIMD(temp, temp + 128, MASK + 1); \ + memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \ + } + +#ifdef HAS_COPYROW_AVX +ANY11(CopyRow_Any_AVX, CopyRow_AVX, 0, 1, 1, 63) +#endif +#ifdef HAS_COPYROW_SSE2 +ANY11(CopyRow_Any_SSE2, CopyRow_SSE2, 0, 1, 1, 31) +#endif +#ifdef HAS_COPYROW_NEON +ANY11(CopyRow_Any_NEON, CopyRow_NEON, 0, 1, 1, 31) +#endif +#if defined(HAS_ARGBTORGB24ROW_SSSE3) +ANY11(ARGBToRGB24Row_Any_SSSE3, ARGBToRGB24Row_SSSE3, 0, 4, 3, 15) +ANY11(ARGBToRAWRow_Any_SSSE3, ARGBToRAWRow_SSSE3, 0, 4, 3, 15) +ANY11(ARGBToRGB565Row_Any_SSE2, ARGBToRGB565Row_SSE2, 0, 4, 2, 3) +ANY11(ARGBToARGB1555Row_Any_SSE2, ARGBToARGB1555Row_SSE2, 0, 4, 2, 3) +ANY11(ARGBToARGB4444Row_Any_SSE2, ARGBToARGB4444Row_SSE2, 0, 4, 2, 3) +#endif +#if defined(HAS_ARGBTORGB565ROW_AVX2) +ANY11(ARGBToRGB565Row_Any_AVX2, ARGBToRGB565Row_AVX2, 0, 4, 2, 7) +#endif +#if defined(HAS_ARGBTOARGB4444ROW_AVX2) +ANY11(ARGBToARGB1555Row_Any_AVX2, ARGBToARGB1555Row_AVX2, 0, 4, 2, 7) +ANY11(ARGBToARGB4444Row_Any_AVX2, ARGBToARGB4444Row_AVX2, 0, 4, 2, 7) +#endif +#if defined(HAS_ABGRTOAR30ROW_SSSE3) +ANY11(ABGRToAR30Row_Any_SSSE3, ABGRToAR30Row_SSSE3, 0, 4, 4, 3) +#endif +#if defined(HAS_ARGBTOAR30ROW_SSSE3) +ANY11(ARGBToAR30Row_Any_SSSE3, ARGBToAR30Row_SSSE3, 0, 4, 4, 3) +#endif +#if defined(HAS_ABGRTOAR30ROW_AVX2) +ANY11(ABGRToAR30Row_Any_AVX2, ABGRToAR30Row_AVX2, 0, 4, 4, 7) +#endif +#if defined(HAS_ARGBTOAR30ROW_AVX2) +ANY11(ARGBToAR30Row_Any_AVX2, ARGBToAR30Row_AVX2, 0, 4, 4, 7) +#endif +#if defined(HAS_J400TOARGBROW_SSE2) +ANY11(J400ToARGBRow_Any_SSE2, J400ToARGBRow_SSE2, 0, 1, 4, 7) +#endif 
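Most ANY11 instantiations read a source with UVSHIFT = 0, but the YUY2ToYRow/UYVYToYRow wrappers that follow use UVSHIFT = 1 with SBPP = 4, i.e. 4 source bytes carry 2 pixels. In that case the macro offsets the source by (n >> UVSHIFT) * SBPP bytes and copies SS(r, UVSHIFT) * SBPP tail bytes, where SS() is libyuv's round-up shift helper defined near the top of this file. A small worked example of that arithmetic (the SS definition below is restated here and assumed to match row_any.cc):

#include <stdio.h>

/* Assumed to match libyuv's SS(): shift right, rounding up. */
#define SS(width, shift) (((width) + (1 << (shift)) - 1) >> (shift))

int main(void) {
  /* YUY2ToYRow_Any_SSE2: UVSHIFT = 1, SBPP = 4, BPP = 1, MASK = 15. */
  int width = 37;
  int r = width & 15;            /* 5 leftover pixels */
  int n = width & ~15;           /* 32 pixels handled by the SIMD kernel */
  int src_offset = (n >> 1) * 4; /* 64 bytes = 16 YUY2 macro pixels */
  int src_copy = SS(r, 1) * 4;   /* 12 bytes: 5 pixels round up to 3 macro pixels */
  int dst_offset = n * 1;        /* 32 Y bytes already written */
  printf("n=%d r=%d src_offset=%d src_copy=%d dst_offset=%d\n",
         n, r, src_offset, src_copy, dst_offset);
  return 0;
}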
+#if defined(HAS_J400TOARGBROW_AVX2) +ANY11(J400ToARGBRow_Any_AVX2, J400ToARGBRow_AVX2, 0, 1, 4, 15) +#endif +#if defined(HAS_I400TOARGBROW_SSE2) +ANY11(I400ToARGBRow_Any_SSE2, I400ToARGBRow_SSE2, 0, 1, 4, 7) +#endif +#if defined(HAS_I400TOARGBROW_AVX2) +ANY11(I400ToARGBRow_Any_AVX2, I400ToARGBRow_AVX2, 0, 1, 4, 15) +#endif +#if defined(HAS_RGB24TOARGBROW_SSSE3) +ANY11(RGB24ToARGBRow_Any_SSSE3, RGB24ToARGBRow_SSSE3, 0, 3, 4, 15) +ANY11(RAWToARGBRow_Any_SSSE3, RAWToARGBRow_SSSE3, 0, 3, 4, 15) +ANY11(RGB565ToARGBRow_Any_SSE2, RGB565ToARGBRow_SSE2, 0, 2, 4, 7) +ANY11(ARGB1555ToARGBRow_Any_SSE2, ARGB1555ToARGBRow_SSE2, 0, 2, 4, 7) +ANY11(ARGB4444ToARGBRow_Any_SSE2, ARGB4444ToARGBRow_SSE2, 0, 2, 4, 7) +#endif +#if defined(HAS_RAWTORGB24ROW_SSSE3) +ANY11(RAWToRGB24Row_Any_SSSE3, RAWToRGB24Row_SSSE3, 0, 3, 3, 7) +#endif +#if defined(HAS_RGB565TOARGBROW_AVX2) +ANY11(RGB565ToARGBRow_Any_AVX2, RGB565ToARGBRow_AVX2, 0, 2, 4, 15) +#endif +#if defined(HAS_ARGB1555TOARGBROW_AVX2) +ANY11(ARGB1555ToARGBRow_Any_AVX2, ARGB1555ToARGBRow_AVX2, 0, 2, 4, 15) +#endif +#if defined(HAS_ARGB4444TOARGBROW_AVX2) +ANY11(ARGB4444ToARGBRow_Any_AVX2, ARGB4444ToARGBRow_AVX2, 0, 2, 4, 15) +#endif +#if defined(HAS_ARGBTORGB24ROW_NEON) +ANY11(ARGBToRGB24Row_Any_NEON, ARGBToRGB24Row_NEON, 0, 4, 3, 7) +ANY11(ARGBToRAWRow_Any_NEON, ARGBToRAWRow_NEON, 0, 4, 3, 7) +ANY11(ARGBToRGB565Row_Any_NEON, ARGBToRGB565Row_NEON, 0, 4, 2, 7) +ANY11(ARGBToARGB1555Row_Any_NEON, ARGBToARGB1555Row_NEON, 0, 4, 2, 7) +ANY11(ARGBToARGB4444Row_Any_NEON, ARGBToARGB4444Row_NEON, 0, 4, 2, 7) +ANY11(J400ToARGBRow_Any_NEON, J400ToARGBRow_NEON, 0, 1, 4, 7) +ANY11(I400ToARGBRow_Any_NEON, I400ToARGBRow_NEON, 0, 1, 4, 7) +#endif +#if defined(HAS_ARGBTORGB24ROW_MSA) +ANY11(ARGBToRGB24Row_Any_MSA, ARGBToRGB24Row_MSA, 0, 4, 3, 15) +ANY11(ARGBToRAWRow_Any_MSA, ARGBToRAWRow_MSA, 0, 4, 3, 15) +ANY11(ARGBToRGB565Row_Any_MSA, ARGBToRGB565Row_MSA, 0, 4, 2, 7) +ANY11(ARGBToARGB1555Row_Any_MSA, ARGBToARGB1555Row_MSA, 0, 4, 2, 7) +ANY11(ARGBToARGB4444Row_Any_MSA, ARGBToARGB4444Row_MSA, 0, 4, 2, 7) +ANY11(J400ToARGBRow_Any_MSA, J400ToARGBRow_MSA, 0, 1, 4, 15) +ANY11(I400ToARGBRow_Any_MSA, I400ToARGBRow_MSA, 0, 1, 4, 15) +#endif +#if defined(HAS_RAWTORGB24ROW_NEON) +ANY11(RAWToRGB24Row_Any_NEON, RAWToRGB24Row_NEON, 0, 3, 3, 7) +#endif +#if defined(HAS_RAWTORGB24ROW_MSA) +ANY11(RAWToRGB24Row_Any_MSA, RAWToRGB24Row_MSA, 0, 3, 3, 15) +#endif +#ifdef HAS_ARGBTOYROW_AVX2 +ANY11(ARGBToYRow_Any_AVX2, ARGBToYRow_AVX2, 0, 4, 1, 31) +#endif +#ifdef HAS_ARGBTOYJROW_AVX2 +ANY11(ARGBToYJRow_Any_AVX2, ARGBToYJRow_AVX2, 0, 4, 1, 31) +#endif +#ifdef HAS_UYVYTOYROW_AVX2 +ANY11(UYVYToYRow_Any_AVX2, UYVYToYRow_AVX2, 0, 2, 1, 31) +#endif +#ifdef HAS_YUY2TOYROW_AVX2 +ANY11(YUY2ToYRow_Any_AVX2, YUY2ToYRow_AVX2, 1, 4, 1, 31) +#endif +#ifdef HAS_ARGBTOYROW_SSSE3 +ANY11(ARGBToYRow_Any_SSSE3, ARGBToYRow_SSSE3, 0, 4, 1, 15) +#endif +#ifdef HAS_BGRATOYROW_SSSE3 +ANY11(BGRAToYRow_Any_SSSE3, BGRAToYRow_SSSE3, 0, 4, 1, 15) +ANY11(ABGRToYRow_Any_SSSE3, ABGRToYRow_SSSE3, 0, 4, 1, 15) +ANY11(RGBAToYRow_Any_SSSE3, RGBAToYRow_SSSE3, 0, 4, 1, 15) +ANY11(YUY2ToYRow_Any_SSE2, YUY2ToYRow_SSE2, 1, 4, 1, 15) +ANY11(UYVYToYRow_Any_SSE2, UYVYToYRow_SSE2, 1, 4, 1, 15) +#endif +#ifdef HAS_ARGBTOYJROW_SSSE3 +ANY11(ARGBToYJRow_Any_SSSE3, ARGBToYJRow_SSSE3, 0, 4, 1, 15) +#endif +#ifdef HAS_ARGBTOYROW_NEON +ANY11(ARGBToYRow_Any_NEON, ARGBToYRow_NEON, 0, 4, 1, 7) +#endif +#ifdef HAS_ARGBTOYROW_MSA +ANY11(ARGBToYRow_Any_MSA, ARGBToYRow_MSA, 0, 4, 1, 15) +#endif +#ifdef HAS_ARGBTOYJROW_NEON +ANY11(ARGBToYJRow_Any_NEON, 
ARGBToYJRow_NEON, 0, 4, 1, 7) +#endif +#ifdef HAS_ARGBTOYJROW_MSA +ANY11(ARGBToYJRow_Any_MSA, ARGBToYJRow_MSA, 0, 4, 1, 15) +#endif +#ifdef HAS_BGRATOYROW_NEON +ANY11(BGRAToYRow_Any_NEON, BGRAToYRow_NEON, 0, 4, 1, 7) +#endif +#ifdef HAS_BGRATOYROW_MSA +ANY11(BGRAToYRow_Any_MSA, BGRAToYRow_MSA, 0, 4, 1, 15) +#endif +#ifdef HAS_ABGRTOYROW_NEON +ANY11(ABGRToYRow_Any_NEON, ABGRToYRow_NEON, 0, 4, 1, 7) +#endif +#ifdef HAS_ABGRTOYROW_MSA +ANY11(ABGRToYRow_Any_MSA, ABGRToYRow_MSA, 0, 4, 1, 7) +#endif +#ifdef HAS_RGBATOYROW_NEON +ANY11(RGBAToYRow_Any_NEON, RGBAToYRow_NEON, 0, 4, 1, 7) +#endif +#ifdef HAS_RGBATOYROW_MSA +ANY11(RGBAToYRow_Any_MSA, RGBAToYRow_MSA, 0, 4, 1, 15) +#endif +#ifdef HAS_RGB24TOYROW_NEON +ANY11(RGB24ToYRow_Any_NEON, RGB24ToYRow_NEON, 0, 3, 1, 7) +#endif +#ifdef HAS_RGB24TOYROW_MSA +ANY11(RGB24ToYRow_Any_MSA, RGB24ToYRow_MSA, 0, 3, 1, 15) +#endif +#ifdef HAS_RAWTOYROW_NEON +ANY11(RAWToYRow_Any_NEON, RAWToYRow_NEON, 0, 3, 1, 7) +#endif +#ifdef HAS_RAWTOYROW_MSA +ANY11(RAWToYRow_Any_MSA, RAWToYRow_MSA, 0, 3, 1, 15) +#endif +#ifdef HAS_RGB565TOYROW_NEON +ANY11(RGB565ToYRow_Any_NEON, RGB565ToYRow_NEON, 0, 2, 1, 7) +#endif +#ifdef HAS_RGB565TOYROW_MSA +ANY11(RGB565ToYRow_Any_MSA, RGB565ToYRow_MSA, 0, 2, 1, 15) +#endif +#ifdef HAS_ARGB1555TOYROW_NEON +ANY11(ARGB1555ToYRow_Any_NEON, ARGB1555ToYRow_NEON, 0, 2, 1, 7) +#endif +#ifdef HAS_ARGB1555TOYROW_MSA +ANY11(ARGB1555ToYRow_Any_MSA, ARGB1555ToYRow_MSA, 0, 2, 1, 15) +#endif +#ifdef HAS_ARGB4444TOYROW_NEON +ANY11(ARGB4444ToYRow_Any_NEON, ARGB4444ToYRow_NEON, 0, 2, 1, 7) +#endif +#ifdef HAS_YUY2TOYROW_NEON +ANY11(YUY2ToYRow_Any_NEON, YUY2ToYRow_NEON, 1, 4, 1, 15) +#endif +#ifdef HAS_UYVYTOYROW_NEON +ANY11(UYVYToYRow_Any_NEON, UYVYToYRow_NEON, 1, 4, 1, 15) +#endif +#ifdef HAS_YUY2TOYROW_MSA +ANY11(YUY2ToYRow_Any_MSA, YUY2ToYRow_MSA, 1, 4, 1, 31) +#endif +#ifdef HAS_UYVYTOYROW_MSA +ANY11(UYVYToYRow_Any_MSA, UYVYToYRow_MSA, 1, 4, 1, 31) +#endif +#ifdef HAS_RGB24TOARGBROW_NEON +ANY11(RGB24ToARGBRow_Any_NEON, RGB24ToARGBRow_NEON, 0, 3, 4, 7) +#endif +#ifdef HAS_RGB24TOARGBROW_MSA +ANY11(RGB24ToARGBRow_Any_MSA, RGB24ToARGBRow_MSA, 0, 3, 4, 15) +#endif +#ifdef HAS_RAWTOARGBROW_NEON +ANY11(RAWToARGBRow_Any_NEON, RAWToARGBRow_NEON, 0, 3, 4, 7) +#endif +#ifdef HAS_RAWTOARGBROW_MSA +ANY11(RAWToARGBRow_Any_MSA, RAWToARGBRow_MSA, 0, 3, 4, 15) +#endif +#ifdef HAS_RGB565TOARGBROW_NEON +ANY11(RGB565ToARGBRow_Any_NEON, RGB565ToARGBRow_NEON, 0, 2, 4, 7) +#endif +#ifdef HAS_RGB565TOARGBROW_MSA +ANY11(RGB565ToARGBRow_Any_MSA, RGB565ToARGBRow_MSA, 0, 2, 4, 15) +#endif +#ifdef HAS_ARGB1555TOARGBROW_NEON +ANY11(ARGB1555ToARGBRow_Any_NEON, ARGB1555ToARGBRow_NEON, 0, 2, 4, 7) +#endif +#ifdef HAS_ARGB1555TOARGBROW_MSA +ANY11(ARGB1555ToARGBRow_Any_MSA, ARGB1555ToARGBRow_MSA, 0, 2, 4, 15) +#endif +#ifdef HAS_ARGB4444TOARGBROW_NEON +ANY11(ARGB4444ToARGBRow_Any_NEON, ARGB4444ToARGBRow_NEON, 0, 2, 4, 7) +#endif +#ifdef HAS_ARGB4444TOARGBROW_MSA +ANY11(ARGB4444ToARGBRow_Any_MSA, ARGB4444ToARGBRow_MSA, 0, 2, 4, 15) +#endif +#ifdef HAS_ARGBATTENUATEROW_SSSE3 +ANY11(ARGBAttenuateRow_Any_SSSE3, ARGBAttenuateRow_SSSE3, 0, 4, 4, 3) +#endif +#ifdef HAS_ARGBUNATTENUATEROW_SSE2 +ANY11(ARGBUnattenuateRow_Any_SSE2, ARGBUnattenuateRow_SSE2, 0, 4, 4, 3) +#endif +#ifdef HAS_ARGBATTENUATEROW_AVX2 +ANY11(ARGBAttenuateRow_Any_AVX2, ARGBAttenuateRow_AVX2, 0, 4, 4, 7) +#endif +#ifdef HAS_ARGBUNATTENUATEROW_AVX2 +ANY11(ARGBUnattenuateRow_Any_AVX2, ARGBUnattenuateRow_AVX2, 0, 4, 4, 7) +#endif +#ifdef HAS_ARGBATTENUATEROW_NEON +ANY11(ARGBAttenuateRow_Any_NEON, ARGBAttenuateRow_NEON, 0, 
4, 4, 7) +#endif +#ifdef HAS_ARGBATTENUATEROW_MSA +ANY11(ARGBAttenuateRow_Any_MSA, ARGBAttenuateRow_MSA, 0, 4, 4, 7) +#endif +#ifdef HAS_ARGBEXTRACTALPHAROW_SSE2 +ANY11(ARGBExtractAlphaRow_Any_SSE2, ARGBExtractAlphaRow_SSE2, 0, 4, 1, 7) +#endif +#ifdef HAS_ARGBEXTRACTALPHAROW_AVX2 +ANY11(ARGBExtractAlphaRow_Any_AVX2, ARGBExtractAlphaRow_AVX2, 0, 4, 1, 32) +#endif +#ifdef HAS_ARGBEXTRACTALPHAROW_NEON +ANY11(ARGBExtractAlphaRow_Any_NEON, ARGBExtractAlphaRow_NEON, 0, 4, 1, 15) +#endif +#ifdef HAS_ARGBEXTRACTALPHAROW_MSA +ANY11(ARGBExtractAlphaRow_Any_MSA, ARGBExtractAlphaRow_MSA, 0, 4, 1, 15) +#endif +#undef ANY11 + +// Any 1 to 1 blended. Destination is read, modify, write. +#define ANY11B(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \ + void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width) { \ + SIMD_ALIGNED(uint8_t temp[64 * 2]); \ + memset(temp, 0, 64 * 2); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, dst_ptr, n); \ + } \ + memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \ + memcpy(temp + 64, dst_ptr + n * BPP, r * BPP); \ + ANY_SIMD(temp, temp + 64, MASK + 1); \ + memcpy(dst_ptr + n * BPP, temp + 64, r * BPP); \ + } + +#ifdef HAS_ARGBCOPYALPHAROW_AVX2 +ANY11B(ARGBCopyAlphaRow_Any_AVX2, ARGBCopyAlphaRow_AVX2, 0, 4, 4, 15) +#endif +#ifdef HAS_ARGBCOPYALPHAROW_SSE2 +ANY11B(ARGBCopyAlphaRow_Any_SSE2, ARGBCopyAlphaRow_SSE2, 0, 4, 4, 7) +#endif +#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2 +ANY11B(ARGBCopyYToAlphaRow_Any_AVX2, ARGBCopyYToAlphaRow_AVX2, 0, 1, 4, 15) +#endif +#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2 +ANY11B(ARGBCopyYToAlphaRow_Any_SSE2, ARGBCopyYToAlphaRow_SSE2, 0, 1, 4, 7) +#endif +#undef ANY11B + +// Any 1 to 1 with parameter. +#define ANY11P(NAMEANY, ANY_SIMD, T, SBPP, BPP, MASK) \ + void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, T param, int width) { \ + SIMD_ALIGNED(uint8_t temp[64 * 2]); \ + memset(temp, 0, 64); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, dst_ptr, param, n); \ + } \ + memcpy(temp, src_ptr + n * SBPP, r * SBPP); \ + ANY_SIMD(temp, temp + 64, param, MASK + 1); \ + memcpy(dst_ptr + n * BPP, temp + 64, r * BPP); \ + } + +#if defined(HAS_ARGBTORGB565DITHERROW_SSE2) +ANY11P(ARGBToRGB565DitherRow_Any_SSE2, + ARGBToRGB565DitherRow_SSE2, + const uint32_t, + 4, + 2, + 3) +#endif +#if defined(HAS_ARGBTORGB565DITHERROW_AVX2) +ANY11P(ARGBToRGB565DitherRow_Any_AVX2, + ARGBToRGB565DitherRow_AVX2, + const uint32_t, + 4, + 2, + 7) +#endif +#if defined(HAS_ARGBTORGB565DITHERROW_NEON) +ANY11P(ARGBToRGB565DitherRow_Any_NEON, + ARGBToRGB565DitherRow_NEON, + const uint32_t, + 4, + 2, + 7) +#endif +#if defined(HAS_ARGBTORGB565DITHERROW_MSA) +ANY11P(ARGBToRGB565DitherRow_Any_MSA, + ARGBToRGB565DitherRow_MSA, + const uint32_t, + 4, + 2, + 7) +#endif +#ifdef HAS_ARGBSHUFFLEROW_SSSE3 +ANY11P(ARGBShuffleRow_Any_SSSE3, ARGBShuffleRow_SSSE3, const uint8_t*, 4, 4, 7) +#endif +#ifdef HAS_ARGBSHUFFLEROW_AVX2 +ANY11P(ARGBShuffleRow_Any_AVX2, ARGBShuffleRow_AVX2, const uint8_t*, 4, 4, 15) +#endif +#ifdef HAS_ARGBSHUFFLEROW_NEON +ANY11P(ARGBShuffleRow_Any_NEON, ARGBShuffleRow_NEON, const uint8_t*, 4, 4, 3) +#endif +#ifdef HAS_ARGBSHUFFLEROW_MSA +ANY11P(ARGBShuffleRow_Any_MSA, ARGBShuffleRow_MSA, const uint8_t*, 4, 4, 7) +#endif +#undef ANY11P + +// Any 1 to 1 with parameter and shorts. BPP measures in shorts. 
+#define ANY11C(NAMEANY, ANY_SIMD, SBPP, BPP, STYPE, DTYPE, MASK) \ + void NAMEANY(const STYPE* src_ptr, DTYPE* dst_ptr, int scale, int width) { \ + SIMD_ALIGNED(STYPE temp[32]); \ + SIMD_ALIGNED(DTYPE out[32]); \ + memset(temp, 0, 32 * SBPP); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, dst_ptr, scale, n); \ + } \ + memcpy(temp, src_ptr + n, r * SBPP); \ + ANY_SIMD(temp, out, scale, MASK + 1); \ + memcpy(dst_ptr + n, out, r * BPP); \ + } + +#ifdef HAS_CONVERT16TO8ROW_SSSE3 +ANY11C(Convert16To8Row_Any_SSSE3, + Convert16To8Row_SSSE3, + 2, + 1, + uint16_t, + uint8_t, + 15) +#endif +#ifdef HAS_CONVERT16TO8ROW_AVX2 +ANY11C(Convert16To8Row_Any_AVX2, + Convert16To8Row_AVX2, + 2, + 1, + uint16_t, + uint8_t, + 31) +#endif +#ifdef HAS_CONVERT8TO16ROW_SSE2 +ANY11C(Convert8To16Row_Any_SSE2, + Convert8To16Row_SSE2, + 1, + 2, + uint8_t, + uint16_t, + 15) +#endif +#ifdef HAS_CONVERT8TO16ROW_AVX2 +ANY11C(Convert8To16Row_Any_AVX2, + Convert8To16Row_AVX2, + 1, + 2, + uint8_t, + uint16_t, + 31) +#endif +#undef ANY11C + +// Any 1 to 1 with parameter and shorts to byte. BPP measures in shorts. +#define ANY11P16(NAMEANY, ANY_SIMD, ST, T, SBPP, BPP, MASK) \ + void NAMEANY(const ST* src_ptr, T* dst_ptr, float param, int width) { \ + SIMD_ALIGNED(ST temp[32]); \ + SIMD_ALIGNED(T out[32]); \ + memset(temp, 0, SBPP * 32); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, dst_ptr, param, n); \ + } \ + memcpy(temp, src_ptr + n, r * SBPP); \ + ANY_SIMD(temp, out, param, MASK + 1); \ + memcpy(dst_ptr + n, out, r * BPP); \ + } + +#ifdef HAS_HALFFLOATROW_SSE2 +ANY11P16(HalfFloatRow_Any_SSE2, HalfFloatRow_SSE2, uint16_t, uint16_t, 2, 2, 7) +#endif +#ifdef HAS_HALFFLOATROW_AVX2 +ANY11P16(HalfFloatRow_Any_AVX2, HalfFloatRow_AVX2, uint16_t, uint16_t, 2, 2, 15) +#endif +#ifdef HAS_HALFFLOATROW_F16C +ANY11P16(HalfFloatRow_Any_F16C, HalfFloatRow_F16C, uint16_t, uint16_t, 2, 2, 15) +ANY11P16(HalfFloat1Row_Any_F16C, + HalfFloat1Row_F16C, + uint16_t, + uint16_t, + 2, + 2, + 15) +#endif +#ifdef HAS_HALFFLOATROW_NEON +ANY11P16(HalfFloatRow_Any_NEON, HalfFloatRow_NEON, uint16_t, uint16_t, 2, 2, 7) +ANY11P16(HalfFloat1Row_Any_NEON, + HalfFloat1Row_NEON, + uint16_t, + uint16_t, + 2, + 2, + 7) +#endif +#ifdef HAS_HALFFLOATROW_MSA +ANY11P16(HalfFloatRow_Any_MSA, HalfFloatRow_MSA, uint16_t, uint16_t, 2, 2, 31) +#endif +#ifdef HAS_BYTETOFLOATROW_NEON +ANY11P16(ByteToFloatRow_Any_NEON, ByteToFloatRow_NEON, uint8_t, float, 1, 3, 7) +#endif +#undef ANY11P16 + +// Any 1 to 1 with yuvconstants +#define ANY11C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \ + void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, \ + const struct YuvConstants* yuvconstants, int width) { \ + SIMD_ALIGNED(uint8_t temp[128 * 2]); \ + memset(temp, 0, 128); /* for YUY2 and msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, dst_ptr, yuvconstants, n); \ + } \ + memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \ + ANY_SIMD(temp, temp + 128, yuvconstants, MASK + 1); \ + memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \ + } +#if defined(HAS_YUY2TOARGBROW_SSSE3) +ANY11C(YUY2ToARGBRow_Any_SSSE3, YUY2ToARGBRow_SSSE3, 1, 4, 4, 15) +ANY11C(UYVYToARGBRow_Any_SSSE3, UYVYToARGBRow_SSSE3, 1, 4, 4, 15) +#endif +#if defined(HAS_YUY2TOARGBROW_AVX2) +ANY11C(YUY2ToARGBRow_Any_AVX2, YUY2ToARGBRow_AVX2, 1, 4, 4, 31) +ANY11C(UYVYToARGBRow_Any_AVX2, UYVYToARGBRow_AVX2, 1, 4, 4, 31) +#endif +#if 
defined(HAS_YUY2TOARGBROW_NEON) +ANY11C(YUY2ToARGBRow_Any_NEON, YUY2ToARGBRow_NEON, 1, 4, 4, 7) +ANY11C(UYVYToARGBRow_Any_NEON, UYVYToARGBRow_NEON, 1, 4, 4, 7) +#endif +#if defined(HAS_YUY2TOARGBROW_MSA) +ANY11C(YUY2ToARGBRow_Any_MSA, YUY2ToARGBRow_MSA, 1, 4, 4, 7) +ANY11C(UYVYToARGBRow_Any_MSA, UYVYToARGBRow_MSA, 1, 4, 4, 7) +#endif +#undef ANY11C + +// Any 1 to 1 interpolate. Takes 2 rows of source via stride. +#define ANY11T(NAMEANY, ANY_SIMD, SBPP, BPP, MASK) \ + void NAMEANY(uint8_t* dst_ptr, const uint8_t* src_ptr, \ + ptrdiff_t src_stride_ptr, int width, int source_y_fraction) { \ + SIMD_ALIGNED(uint8_t temp[64 * 3]); \ + memset(temp, 0, 64 * 2); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(dst_ptr, src_ptr, src_stride_ptr, n, source_y_fraction); \ + } \ + memcpy(temp, src_ptr + n * SBPP, r * SBPP); \ + memcpy(temp + 64, src_ptr + src_stride_ptr + n * SBPP, r * SBPP); \ + ANY_SIMD(temp + 128, temp, 64, MASK + 1, source_y_fraction); \ + memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \ + } + +#ifdef HAS_INTERPOLATEROW_AVX2 +ANY11T(InterpolateRow_Any_AVX2, InterpolateRow_AVX2, 1, 1, 31) +#endif +#ifdef HAS_INTERPOLATEROW_SSSE3 +ANY11T(InterpolateRow_Any_SSSE3, InterpolateRow_SSSE3, 1, 1, 15) +#endif +#ifdef HAS_INTERPOLATEROW_NEON +ANY11T(InterpolateRow_Any_NEON, InterpolateRow_NEON, 1, 1, 15) +#endif +#ifdef HAS_INTERPOLATEROW_MSA +ANY11T(InterpolateRow_Any_MSA, InterpolateRow_MSA, 1, 1, 31) +#endif +#undef ANY11T + +// Any 1 to 1 mirror. +#define ANY11M(NAMEANY, ANY_SIMD, BPP, MASK) \ + void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width) { \ + SIMD_ALIGNED(uint8_t temp[64 * 2]); \ + memset(temp, 0, 64); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr + r * BPP, dst_ptr, n); \ + } \ + memcpy(temp, src_ptr, r* BPP); \ + ANY_SIMD(temp, temp + 64, MASK + 1); \ + memcpy(dst_ptr + n * BPP, temp + 64 + (MASK + 1 - r) * BPP, r * BPP); \ + } + +#ifdef HAS_MIRRORROW_AVX2 +ANY11M(MirrorRow_Any_AVX2, MirrorRow_AVX2, 1, 31) +#endif +#ifdef HAS_MIRRORROW_SSSE3 +ANY11M(MirrorRow_Any_SSSE3, MirrorRow_SSSE3, 1, 15) +#endif +#ifdef HAS_MIRRORROW_NEON +ANY11M(MirrorRow_Any_NEON, MirrorRow_NEON, 1, 15) +#endif +#ifdef HAS_MIRRORROW_MSA +ANY11M(MirrorRow_Any_MSA, MirrorRow_MSA, 1, 63) +#endif +#ifdef HAS_ARGBMIRRORROW_AVX2 +ANY11M(ARGBMirrorRow_Any_AVX2, ARGBMirrorRow_AVX2, 4, 7) +#endif +#ifdef HAS_ARGBMIRRORROW_SSE2 +ANY11M(ARGBMirrorRow_Any_SSE2, ARGBMirrorRow_SSE2, 4, 3) +#endif +#ifdef HAS_ARGBMIRRORROW_NEON +ANY11M(ARGBMirrorRow_Any_NEON, ARGBMirrorRow_NEON, 4, 3) +#endif +#ifdef HAS_ARGBMIRRORROW_MSA +ANY11M(ARGBMirrorRow_Any_MSA, ARGBMirrorRow_MSA, 4, 15) +#endif +#undef ANY11M + +// Any 1 plane. (memset) +#define ANY1(NAMEANY, ANY_SIMD, T, BPP, MASK) \ + void NAMEANY(uint8_t* dst_ptr, T v32, int width) { \ + SIMD_ALIGNED(uint8_t temp[64]); \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(dst_ptr, v32, n); \ + } \ + ANY_SIMD(temp, v32, MASK + 1); \ + memcpy(dst_ptr + n * BPP, temp, r * BPP); \ + } + +#ifdef HAS_SETROW_X86 +ANY1(SetRow_Any_X86, SetRow_X86, uint8_t, 1, 3) +#endif +#ifdef HAS_SETROW_NEON +ANY1(SetRow_Any_NEON, SetRow_NEON, uint8_t, 1, 15) +#endif +#ifdef HAS_ARGBSETROW_NEON +ANY1(ARGBSetRow_Any_NEON, ARGBSetRow_NEON, uint32_t, 4, 3) +#endif +#ifdef HAS_ARGBSETROW_MSA +ANY1(ARGBSetRow_Any_MSA, ARGBSetRow_MSA, uint32_t, 4, 3) +#endif +#undef ANY1 + +// Any 1 to 2. Outputs UV planes. 
+#define ANY12(NAMEANY, ANY_SIMD, UVSHIFT, BPP, DUVSHIFT, MASK) \ + void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, \ + int width) { \ + SIMD_ALIGNED(uint8_t temp[128 * 3]); \ + memset(temp, 0, 128); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, dst_u, dst_v, n); \ + } \ + memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \ + ANY_SIMD(temp, temp + 128, temp + 256, MASK + 1); \ + memcpy(dst_u + (n >> DUVSHIFT), temp + 128, SS(r, DUVSHIFT)); \ + memcpy(dst_v + (n >> DUVSHIFT), temp + 256, SS(r, DUVSHIFT)); \ + } + +#ifdef HAS_SPLITUVROW_SSE2 +ANY12(SplitUVRow_Any_SSE2, SplitUVRow_SSE2, 0, 2, 0, 15) +#endif +#ifdef HAS_SPLITUVROW_AVX2 +ANY12(SplitUVRow_Any_AVX2, SplitUVRow_AVX2, 0, 2, 0, 31) +#endif +#ifdef HAS_SPLITUVROW_NEON +ANY12(SplitUVRow_Any_NEON, SplitUVRow_NEON, 0, 2, 0, 15) +#endif +#ifdef HAS_SPLITUVROW_MSA +ANY12(SplitUVRow_Any_MSA, SplitUVRow_MSA, 0, 2, 0, 31) +#endif +#ifdef HAS_ARGBTOUV444ROW_SSSE3 +ANY12(ARGBToUV444Row_Any_SSSE3, ARGBToUV444Row_SSSE3, 0, 4, 0, 15) +#endif +#ifdef HAS_YUY2TOUV422ROW_AVX2 +ANY12(YUY2ToUV422Row_Any_AVX2, YUY2ToUV422Row_AVX2, 1, 4, 1, 31) +ANY12(UYVYToUV422Row_Any_AVX2, UYVYToUV422Row_AVX2, 1, 4, 1, 31) +#endif +#ifdef HAS_YUY2TOUV422ROW_SSE2 +ANY12(YUY2ToUV422Row_Any_SSE2, YUY2ToUV422Row_SSE2, 1, 4, 1, 15) +ANY12(UYVYToUV422Row_Any_SSE2, UYVYToUV422Row_SSE2, 1, 4, 1, 15) +#endif +#ifdef HAS_YUY2TOUV422ROW_NEON +ANY12(ARGBToUV444Row_Any_NEON, ARGBToUV444Row_NEON, 0, 4, 0, 7) +ANY12(YUY2ToUV422Row_Any_NEON, YUY2ToUV422Row_NEON, 1, 4, 1, 15) +ANY12(UYVYToUV422Row_Any_NEON, UYVYToUV422Row_NEON, 1, 4, 1, 15) +#endif +#ifdef HAS_YUY2TOUV422ROW_MSA +ANY12(ARGBToUV444Row_Any_MSA, ARGBToUV444Row_MSA, 0, 4, 0, 15) +ANY12(YUY2ToUV422Row_Any_MSA, YUY2ToUV422Row_MSA, 1, 4, 1, 31) +ANY12(UYVYToUV422Row_Any_MSA, UYVYToUV422Row_MSA, 1, 4, 1, 31) +#endif +#undef ANY12 + +// Any 1 to 3. Outputs RGB planes. +#define ANY13(NAMEANY, ANY_SIMD, BPP, MASK) \ + void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_r, uint8_t* dst_g, \ + uint8_t* dst_b, int width) { \ + SIMD_ALIGNED(uint8_t temp[16 * 6]); \ + memset(temp, 0, 16 * 3); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, dst_r, dst_g, dst_b, n); \ + } \ + memcpy(temp, src_ptr + n * BPP, r * BPP); \ + ANY_SIMD(temp, temp + 16 * 3, temp + 16 * 4, temp + 16 * 5, MASK + 1); \ + memcpy(dst_r + n, temp + 16 * 3, r); \ + memcpy(dst_g + n, temp + 16 * 4, r); \ + memcpy(dst_b + n, temp + 16 * 5, r); \ + } + +#ifdef HAS_SPLITRGBROW_SSSE3 +ANY13(SplitRGBRow_Any_SSSE3, SplitRGBRow_SSSE3, 3, 15) +#endif +#ifdef HAS_SPLITRGBROW_NEON +ANY13(SplitRGBRow_Any_NEON, SplitRGBRow_NEON, 3, 15) +#endif + +// Any 1 to 2 with source stride (2 rows of source). Outputs UV planes. +// 128 byte row allows for 32 avx ARGB pixels. 
+#define ANY12S(NAMEANY, ANY_SIMD, UVSHIFT, BPP, MASK) \ + void NAMEANY(const uint8_t* src_ptr, int src_stride_ptr, uint8_t* dst_u, \ + uint8_t* dst_v, int width) { \ + SIMD_ALIGNED(uint8_t temp[128 * 4]); \ + memset(temp, 0, 128 * 2); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, src_stride_ptr, dst_u, dst_v, n); \ + } \ + memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \ + memcpy(temp + 128, src_ptr + src_stride_ptr + (n >> UVSHIFT) * BPP, \ + SS(r, UVSHIFT) * BPP); \ + if ((width & 1) && UVSHIFT == 0) { /* repeat last pixel for subsample */ \ + memcpy(temp + SS(r, UVSHIFT) * BPP, temp + SS(r, UVSHIFT) * BPP - BPP, \ + BPP); \ + memcpy(temp + 128 + SS(r, UVSHIFT) * BPP, \ + temp + 128 + SS(r, UVSHIFT) * BPP - BPP, BPP); \ + } \ + ANY_SIMD(temp, 128, temp + 256, temp + 384, MASK + 1); \ + memcpy(dst_u + (n >> 1), temp + 256, SS(r, 1)); \ + memcpy(dst_v + (n >> 1), temp + 384, SS(r, 1)); \ + } + +#ifdef HAS_ARGBTOUVROW_AVX2 +ANY12S(ARGBToUVRow_Any_AVX2, ARGBToUVRow_AVX2, 0, 4, 31) +#endif +#ifdef HAS_ARGBTOUVJROW_AVX2 +ANY12S(ARGBToUVJRow_Any_AVX2, ARGBToUVJRow_AVX2, 0, 4, 31) +#endif +#ifdef HAS_ARGBTOUVROW_SSSE3 +ANY12S(ARGBToUVRow_Any_SSSE3, ARGBToUVRow_SSSE3, 0, 4, 15) +ANY12S(ARGBToUVJRow_Any_SSSE3, ARGBToUVJRow_SSSE3, 0, 4, 15) +ANY12S(BGRAToUVRow_Any_SSSE3, BGRAToUVRow_SSSE3, 0, 4, 15) +ANY12S(ABGRToUVRow_Any_SSSE3, ABGRToUVRow_SSSE3, 0, 4, 15) +ANY12S(RGBAToUVRow_Any_SSSE3, RGBAToUVRow_SSSE3, 0, 4, 15) +#endif +#ifdef HAS_YUY2TOUVROW_AVX2 +ANY12S(YUY2ToUVRow_Any_AVX2, YUY2ToUVRow_AVX2, 1, 4, 31) +ANY12S(UYVYToUVRow_Any_AVX2, UYVYToUVRow_AVX2, 1, 4, 31) +#endif +#ifdef HAS_YUY2TOUVROW_SSE2 +ANY12S(YUY2ToUVRow_Any_SSE2, YUY2ToUVRow_SSE2, 1, 4, 15) +ANY12S(UYVYToUVRow_Any_SSE2, UYVYToUVRow_SSE2, 1, 4, 15) +#endif +#ifdef HAS_ARGBTOUVROW_NEON +ANY12S(ARGBToUVRow_Any_NEON, ARGBToUVRow_NEON, 0, 4, 15) +#endif +#ifdef HAS_ARGBTOUVROW_MSA +ANY12S(ARGBToUVRow_Any_MSA, ARGBToUVRow_MSA, 0, 4, 31) +#endif +#ifdef HAS_ARGBTOUVJROW_NEON +ANY12S(ARGBToUVJRow_Any_NEON, ARGBToUVJRow_NEON, 0, 4, 15) +#endif +#ifdef HAS_ARGBTOUVJROW_MSA +ANY12S(ARGBToUVJRow_Any_MSA, ARGBToUVJRow_MSA, 0, 4, 31) +#endif +#ifdef HAS_BGRATOUVROW_NEON +ANY12S(BGRAToUVRow_Any_NEON, BGRAToUVRow_NEON, 0, 4, 15) +#endif +#ifdef HAS_BGRATOUVROW_MSA +ANY12S(BGRAToUVRow_Any_MSA, BGRAToUVRow_MSA, 0, 4, 31) +#endif +#ifdef HAS_ABGRTOUVROW_NEON +ANY12S(ABGRToUVRow_Any_NEON, ABGRToUVRow_NEON, 0, 4, 15) +#endif +#ifdef HAS_ABGRTOUVROW_MSA +ANY12S(ABGRToUVRow_Any_MSA, ABGRToUVRow_MSA, 0, 4, 31) +#endif +#ifdef HAS_RGBATOUVROW_NEON +ANY12S(RGBAToUVRow_Any_NEON, RGBAToUVRow_NEON, 0, 4, 15) +#endif +#ifdef HAS_RGBATOUVROW_MSA +ANY12S(RGBAToUVRow_Any_MSA, RGBAToUVRow_MSA, 0, 4, 31) +#endif +#ifdef HAS_RGB24TOUVROW_NEON +ANY12S(RGB24ToUVRow_Any_NEON, RGB24ToUVRow_NEON, 0, 3, 15) +#endif +#ifdef HAS_RGB24TOUVROW_MSA +ANY12S(RGB24ToUVRow_Any_MSA, RGB24ToUVRow_MSA, 0, 3, 15) +#endif +#ifdef HAS_RAWTOUVROW_NEON +ANY12S(RAWToUVRow_Any_NEON, RAWToUVRow_NEON, 0, 3, 15) +#endif +#ifdef HAS_RAWTOUVROW_MSA +ANY12S(RAWToUVRow_Any_MSA, RAWToUVRow_MSA, 0, 3, 15) +#endif +#ifdef HAS_RGB565TOUVROW_NEON +ANY12S(RGB565ToUVRow_Any_NEON, RGB565ToUVRow_NEON, 0, 2, 15) +#endif +#ifdef HAS_RGB565TOUVROW_MSA +ANY12S(RGB565ToUVRow_Any_MSA, RGB565ToUVRow_MSA, 0, 2, 15) +#endif +#ifdef HAS_ARGB1555TOUVROW_NEON +ANY12S(ARGB1555ToUVRow_Any_NEON, ARGB1555ToUVRow_NEON, 0, 2, 15) +#endif +#ifdef HAS_ARGB1555TOUVROW_MSA +ANY12S(ARGB1555ToUVRow_Any_MSA, ARGB1555ToUVRow_MSA, 0, 2, 15) +#endif 
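ANY12S differs from the single-output wrappers in two ways: the tail is staged from both source rows (src_ptr and src_ptr + src_stride_ptr), and for odd widths the last pixel is duplicated in both scratch rows so the 2x2 averaging kernel sees a valid right-hand neighbour. Below is a standalone sketch of that tail handling for the ARGBToUVRow parameters (UVSHIFT = 0, BPP = 4, MASK = 15), using a trivial stand-in kernel that only averages the blue channel of each 2x2 block rather than computing real U and V:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Stand-in for ARGBToUVRow_XXX: writes one byte per 2x2 ARGB block, the
 * average of its four blue samples. Requires an even width. */
static void AvgB2x2(const uint8_t* row0, const uint8_t* row1, uint8_t* dst,
                    int width) {
  int x;
  for (x = 0; x < width; x += 2) {
    dst[x / 2] = (uint8_t)((row0[x * 4] + row0[x * 4 + 4] + row1[x * 4] +
                            row1[x * 4 + 4]) / 4);
  }
}

int main(void) {
  uint8_t src0[17 * 4];
  uint8_t src1[17 * 4];
  uint8_t dst[16];
  uint8_t temp[128 * 4];
  int width = 17;
  int r = width & 15;  /* 1 ragged pixel */
  int n = width & ~15; /* 16 pixels for the kernel */
  int i;
  for (i = 0; i < 17 * 4; ++i) {
    src0[i] = src1[i] = (uint8_t)i;
  }
  memset(temp, 0, 128 * 2);
  AvgB2x2(src0, src1, dst, n);             /* pixels 0..15 -> dst[0..7] */
  memcpy(temp, src0 + n * 4, r * 4);       /* tail of row 0 */
  memcpy(temp + 128, src1 + n * 4, r * 4); /* tail of row 1 */
  if (width & 1) { /* repeat last pixel for subsample */
    memcpy(temp + r * 4, temp + r * 4 - 4, 4);
    memcpy(temp + 128 + r * 4, temp + 128 + r * 4 - 4, 4);
  }
  AvgB2x2(temp, temp + 128, temp + 256, 16);
  memcpy(dst + (n >> 1), temp + 256, (r + 1) / 2); /* SS(r, 1) output bytes */
  printf("dst[8] = %u\n", dst[8]); /* blue of pixel 16, duplicated -> 64 */
  return 0;
}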
+#ifdef HAS_ARGB4444TOUVROW_NEON +ANY12S(ARGB4444ToUVRow_Any_NEON, ARGB4444ToUVRow_NEON, 0, 2, 15) +#endif +#ifdef HAS_YUY2TOUVROW_NEON +ANY12S(YUY2ToUVRow_Any_NEON, YUY2ToUVRow_NEON, 1, 4, 15) +#endif +#ifdef HAS_UYVYTOUVROW_NEON +ANY12S(UYVYToUVRow_Any_NEON, UYVYToUVRow_NEON, 1, 4, 15) +#endif +#ifdef HAS_YUY2TOUVROW_MSA +ANY12S(YUY2ToUVRow_Any_MSA, YUY2ToUVRow_MSA, 1, 4, 31) +#endif +#ifdef HAS_UYVYTOUVROW_MSA +ANY12S(UYVYToUVRow_Any_MSA, UYVYToUVRow_MSA, 1, 4, 31) +#endif +#undef ANY12S + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/media/libyuv/libyuv/source/row_common.cc b/media/libyuv/libyuv/source/row_common.cc new file mode 100644 index 0000000000..04b5caa275 --- /dev/null +++ b/media/libyuv/libyuv/source/row_common.cc @@ -0,0 +1,3219 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/row.h" + +#include <stdio.h> +#include <string.h> // For memcpy and memset. + +#include "libyuv/basic_types.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// llvm x86 is poor at ternary operator, so use branchless min/max. + +#define USE_BRANCHLESS 1 +#if USE_BRANCHLESS +static __inline int32_t clamp0(int32_t v) { + return ((-(v) >> 31) & (v)); +} + +static __inline int32_t clamp255(int32_t v) { + return (((255 - (v)) >> 31) | (v)) & 255; +} + +static __inline int32_t clamp1023(int32_t v) { + return (((1023 - (v)) >> 31) | (v)) & 1023; +} + +static __inline uint32_t Abs(int32_t v) { + int m = v >> 31; + return (v + m) ^ m; +} +#else // USE_BRANCHLESS +static __inline int32_t clamp0(int32_t v) { + return (v < 0) ? 0 : v; +} + +static __inline int32_t clamp255(int32_t v) { + return (v > 255) ? 255 : v; +} + +static __inline int32_t clamp1023(int32_t v) { + return (v > 1023) ? 1023 : v; +} + +static __inline uint32_t Abs(int32_t v) { + return (v < 0) ? 
-v : v; +} +#endif // USE_BRANCHLESS +static __inline uint32_t Clamp(int32_t val) { + int v = clamp0(val); + return (uint32_t)(clamp255(v)); +} + +static __inline uint32_t Clamp10(int32_t val) { + int v = clamp0(val); + return (uint32_t)(clamp1023(v)); +} + +// Little Endian +#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || \ + defined(_M_IX86) || defined(__arm__) || defined(_M_ARM) || \ + (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) +#define WRITEWORD(p, v) *(uint32_t*)(p) = v +#else +static inline void WRITEWORD(uint8_t* p, uint32_t v) { + p[0] = (uint8_t)(v & 255); + p[1] = (uint8_t)((v >> 8) & 255); + p[2] = (uint8_t)((v >> 16) & 255); + p[3] = (uint8_t)((v >> 24) & 255); +} +#endif + +void RGB24ToARGBRow_C(const uint8_t* src_rgb24, uint8_t* dst_argb, int width) { + int x; + for (x = 0; x < width; ++x) { + uint8_t b = src_rgb24[0]; + uint8_t g = src_rgb24[1]; + uint8_t r = src_rgb24[2]; + dst_argb[0] = b; + dst_argb[1] = g; + dst_argb[2] = r; + dst_argb[3] = 255u; + dst_argb += 4; + src_rgb24 += 3; + } +} + +void RAWToARGBRow_C(const uint8_t* src_raw, uint8_t* dst_argb, int width) { + int x; + for (x = 0; x < width; ++x) { + uint8_t r = src_raw[0]; + uint8_t g = src_raw[1]; + uint8_t b = src_raw[2]; + dst_argb[0] = b; + dst_argb[1] = g; + dst_argb[2] = r; + dst_argb[3] = 255u; + dst_argb += 4; + src_raw += 3; + } +} + +void RAWToRGB24Row_C(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { + int x; + for (x = 0; x < width; ++x) { + uint8_t r = src_raw[0]; + uint8_t g = src_raw[1]; + uint8_t b = src_raw[2]; + dst_rgb24[0] = b; + dst_rgb24[1] = g; + dst_rgb24[2] = r; + dst_rgb24 += 3; + src_raw += 3; + } +} + +void RGB565ToARGBRow_C(const uint8_t* src_rgb565, + uint8_t* dst_argb, + int width) { + int x; + for (x = 0; x < width; ++x) { + uint8_t b = src_rgb565[0] & 0x1f; + uint8_t g = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3); + uint8_t r = src_rgb565[1] >> 3; + dst_argb[0] = (b << 3) | (b >> 2); + dst_argb[1] = (g << 2) | (g >> 4); + dst_argb[2] = (r << 3) | (r >> 2); + dst_argb[3] = 255u; + dst_argb += 4; + src_rgb565 += 2; + } +} + +void ARGB1555ToARGBRow_C(const uint8_t* src_argb1555, + uint8_t* dst_argb, + int width) { + int x; + for (x = 0; x < width; ++x) { + uint8_t b = src_argb1555[0] & 0x1f; + uint8_t g = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3); + uint8_t r = (src_argb1555[1] & 0x7c) >> 2; + uint8_t a = src_argb1555[1] >> 7; + dst_argb[0] = (b << 3) | (b >> 2); + dst_argb[1] = (g << 3) | (g >> 2); + dst_argb[2] = (r << 3) | (r >> 2); + dst_argb[3] = -a; + dst_argb += 4; + src_argb1555 += 2; + } +} + +void ARGB4444ToARGBRow_C(const uint8_t* src_argb4444, + uint8_t* dst_argb, + int width) { + int x; + for (x = 0; x < width; ++x) { + uint8_t b = src_argb4444[0] & 0x0f; + uint8_t g = src_argb4444[0] >> 4; + uint8_t r = src_argb4444[1] & 0x0f; + uint8_t a = src_argb4444[1] >> 4; + dst_argb[0] = (b << 4) | b; + dst_argb[1] = (g << 4) | g; + dst_argb[2] = (r << 4) | r; + dst_argb[3] = (a << 4) | a; + dst_argb += 4; + src_argb4444 += 2; + } +} + +void AR30ToARGBRow_C(const uint8_t* src_ar30, uint8_t* dst_argb, int width) { + int x; + for (x = 0; x < width; ++x) { + uint32_t ar30 = *(uint32_t*)src_ar30; + uint32_t b = (ar30 >> 2) & 0xff; + uint32_t g = (ar30 >> 12) & 0xff; + uint32_t r = (ar30 >> 22) & 0xff; + uint32_t a = (ar30 >> 30) * 0x55; // Replicate 2 bits to 8 bits. 
+ *(uint32_t*)(dst_argb) = b | (g << 8) | (r << 16) | (a << 24); + dst_argb += 4; + src_ar30 += 4; + } +} + +void AR30ToABGRRow_C(const uint8_t* src_ar30, uint8_t* dst_abgr, int width) { + int x; + for (x = 0; x < width; ++x) { + uint32_t ar30 = *(uint32_t*)src_ar30; + uint32_t b = (ar30 >> 2) & 0xff; + uint32_t g = (ar30 >> 12) & 0xff; + uint32_t r = (ar30 >> 22) & 0xff; + uint32_t a = (ar30 >> 30) * 0x55; // Replicate 2 bits to 8 bits. + *(uint32_t*)(dst_abgr) = r | (g << 8) | (b << 16) | (a << 24); + dst_abgr += 4; + src_ar30 += 4; + } +} + +void AR30ToAB30Row_C(const uint8_t* src_ar30, uint8_t* dst_ab30, int width) { + int x; + for (x = 0; x < width; ++x) { + uint32_t ar30 = *(uint32_t*)src_ar30; + uint32_t b = ar30 & 0x3ff; + uint32_t ga = ar30 & 0xc00ffc00; + uint32_t r = (ar30 >> 20) & 0x3ff; + *(uint32_t*)(dst_ab30) = r | ga | (b << 20); + dst_ab30 += 4; + src_ar30 += 4; + } +} + +void ARGBToRGB24Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { + int x; + for (x = 0; x < width; ++x) { + uint8_t b = src_argb[0]; + uint8_t g = src_argb[1]; + uint8_t r = src_argb[2]; + dst_rgb[0] = b; + dst_rgb[1] = g; + dst_rgb[2] = r; + dst_rgb += 3; + src_argb += 4; + } +} + +void ARGBToRAWRow_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { + int x; + for (x = 0; x < width; ++x) { + uint8_t b = src_argb[0]; + uint8_t g = src_argb[1]; + uint8_t r = src_argb[2]; + dst_rgb[0] = r; + dst_rgb[1] = g; + dst_rgb[2] = b; + dst_rgb += 3; + src_argb += 4; + } +} + +void ARGBToRGB565Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + uint8_t b0 = src_argb[0] >> 3; + uint8_t g0 = src_argb[1] >> 2; + uint8_t r0 = src_argb[2] >> 3; + uint8_t b1 = src_argb[4] >> 3; + uint8_t g1 = src_argb[5] >> 2; + uint8_t r1 = src_argb[6] >> 3; + WRITEWORD(dst_rgb, b0 | (g0 << 5) | (r0 << 11) | (b1 << 16) | (g1 << 21) | + (r1 << 27)); + dst_rgb += 4; + src_argb += 8; + } + if (width & 1) { + uint8_t b0 = src_argb[0] >> 3; + uint8_t g0 = src_argb[1] >> 2; + uint8_t r0 = src_argb[2] >> 3; + *(uint16_t*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 11); + } +} + +// dither4 is a row of 4 values from 4x4 dither matrix. +// The 4x4 matrix contains values to increase RGB. When converting to +// fewer bits (565) this provides an ordered dither. +// The order in the 4x4 matrix in first byte is upper left. +// The 4 values are passed as an int, then referenced as an array, so +// endian will not affect order of the original matrix. But the dither4 +// will containing the first pixel in the lower byte for little endian +// or the upper byte for big endian. 
+void ARGBToRGB565DitherRow_C(const uint8_t* src_argb, + uint8_t* dst_rgb, + const uint32_t dither4, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + int dither0 = ((const unsigned char*)(&dither4))[x & 3]; + int dither1 = ((const unsigned char*)(&dither4))[(x + 1) & 3]; + uint8_t b0 = clamp255(src_argb[0] + dither0) >> 3; + uint8_t g0 = clamp255(src_argb[1] + dither0) >> 2; + uint8_t r0 = clamp255(src_argb[2] + dither0) >> 3; + uint8_t b1 = clamp255(src_argb[4] + dither1) >> 3; + uint8_t g1 = clamp255(src_argb[5] + dither1) >> 2; + uint8_t r1 = clamp255(src_argb[6] + dither1) >> 3; + WRITEWORD(dst_rgb, b0 | (g0 << 5) | (r0 << 11) | (b1 << 16) | (g1 << 21) | + (r1 << 27)); + dst_rgb += 4; + src_argb += 8; + } + if (width & 1) { + int dither0 = ((const unsigned char*)(&dither4))[(width - 1) & 3]; + uint8_t b0 = clamp255(src_argb[0] + dither0) >> 3; + uint8_t g0 = clamp255(src_argb[1] + dither0) >> 2; + uint8_t r0 = clamp255(src_argb[2] + dither0) >> 3; + *(uint16_t*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 11); + } +} + +void ARGBToARGB1555Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + uint8_t b0 = src_argb[0] >> 3; + uint8_t g0 = src_argb[1] >> 3; + uint8_t r0 = src_argb[2] >> 3; + uint8_t a0 = src_argb[3] >> 7; + uint8_t b1 = src_argb[4] >> 3; + uint8_t g1 = src_argb[5] >> 3; + uint8_t r1 = src_argb[6] >> 3; + uint8_t a1 = src_argb[7] >> 7; + *(uint32_t*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 10) | (a0 << 15) | + (b1 << 16) | (g1 << 21) | (r1 << 26) | (a1 << 31); + dst_rgb += 4; + src_argb += 8; + } + if (width & 1) { + uint8_t b0 = src_argb[0] >> 3; + uint8_t g0 = src_argb[1] >> 3; + uint8_t r0 = src_argb[2] >> 3; + uint8_t a0 = src_argb[3] >> 7; + *(uint16_t*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 10) | (a0 << 15); + } +} + +void ARGBToARGB4444Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + uint8_t b0 = src_argb[0] >> 4; + uint8_t g0 = src_argb[1] >> 4; + uint8_t r0 = src_argb[2] >> 4; + uint8_t a0 = src_argb[3] >> 4; + uint8_t b1 = src_argb[4] >> 4; + uint8_t g1 = src_argb[5] >> 4; + uint8_t r1 = src_argb[6] >> 4; + uint8_t a1 = src_argb[7] >> 4; + *(uint32_t*)(dst_rgb) = b0 | (g0 << 4) | (r0 << 8) | (a0 << 12) | + (b1 << 16) | (g1 << 20) | (r1 << 24) | (a1 << 28); + dst_rgb += 4; + src_argb += 8; + } + if (width & 1) { + uint8_t b0 = src_argb[0] >> 4; + uint8_t g0 = src_argb[1] >> 4; + uint8_t r0 = src_argb[2] >> 4; + uint8_t a0 = src_argb[3] >> 4; + *(uint16_t*)(dst_rgb) = b0 | (g0 << 4) | (r0 << 8) | (a0 << 12); + } +} + +void ABGRToAR30Row_C(const uint8_t* src_abgr, uint8_t* dst_ar30, int width) { + int x; + for (x = 0; x < width; ++x) { + uint32_t b0 = (src_abgr[0] >> 6) | ((uint32_t)(src_abgr[0]) << 2); + uint32_t g0 = (src_abgr[1] >> 6) | ((uint32_t)(src_abgr[1]) << 2); + uint32_t r0 = (src_abgr[2] >> 6) | ((uint32_t)(src_abgr[2]) << 2); + uint32_t a0 = (src_abgr[3] >> 6); + *(uint32_t*)(dst_ar30) = r0 | (g0 << 10) | (b0 << 20) | (a0 << 30); + dst_ar30 += 4; + src_abgr += 4; + } +} + +void ARGBToAR30Row_C(const uint8_t* src_argb, uint8_t* dst_ar30, int width) { + int x; + for (x = 0; x < width; ++x) { + uint32_t b0 = (src_argb[0] >> 6) | ((uint32_t)(src_argb[0]) << 2); + uint32_t g0 = (src_argb[1] >> 6) | ((uint32_t)(src_argb[1]) << 2); + uint32_t r0 = (src_argb[2] >> 6) | ((uint32_t)(src_argb[2]) << 2); + uint32_t a0 = (src_argb[3] >> 6); + *(uint32_t*)(dst_ar30) = b0 | (g0 << 10) | (r0 << 20) | (a0 << 30); + dst_ar30 += 4; + src_argb += 4; + } 
+} + +static __inline int RGBToY(uint8_t r, uint8_t g, uint8_t b) { + return (66 * r + 129 * g + 25 * b + 0x1080) >> 8; +} + +static __inline int RGBToU(uint8_t r, uint8_t g, uint8_t b) { + return (112 * b - 74 * g - 38 * r + 0x8080) >> 8; +} +static __inline int RGBToV(uint8_t r, uint8_t g, uint8_t b) { + return (112 * r - 94 * g - 18 * b + 0x8080) >> 8; +} + +// ARGBToY_C and ARGBToUV_C +#define MAKEROWY(NAME, R, G, B, BPP) \ + void NAME##ToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width) { \ + int x; \ + for (x = 0; x < width; ++x) { \ + dst_y[0] = RGBToY(src_argb0[R], src_argb0[G], src_argb0[B]); \ + src_argb0 += BPP; \ + dst_y += 1; \ + } \ + } \ + void NAME##ToUVRow_C(const uint8_t* src_rgb0, int src_stride_rgb, \ + uint8_t* dst_u, uint8_t* dst_v, int width) { \ + const uint8_t* src_rgb1 = src_rgb0 + src_stride_rgb; \ + int x; \ + for (x = 0; x < width - 1; x += 2) { \ + uint8_t ab = (src_rgb0[B] + src_rgb0[B + BPP] + src_rgb1[B] + \ + src_rgb1[B + BPP]) >> \ + 2; \ + uint8_t ag = (src_rgb0[G] + src_rgb0[G + BPP] + src_rgb1[G] + \ + src_rgb1[G + BPP]) >> \ + 2; \ + uint8_t ar = (src_rgb0[R] + src_rgb0[R + BPP] + src_rgb1[R] + \ + src_rgb1[R + BPP]) >> \ + 2; \ + dst_u[0] = RGBToU(ar, ag, ab); \ + dst_v[0] = RGBToV(ar, ag, ab); \ + src_rgb0 += BPP * 2; \ + src_rgb1 += BPP * 2; \ + dst_u += 1; \ + dst_v += 1; \ + } \ + if (width & 1) { \ + uint8_t ab = (src_rgb0[B] + src_rgb1[B]) >> 1; \ + uint8_t ag = (src_rgb0[G] + src_rgb1[G]) >> 1; \ + uint8_t ar = (src_rgb0[R] + src_rgb1[R]) >> 1; \ + dst_u[0] = RGBToU(ar, ag, ab); \ + dst_v[0] = RGBToV(ar, ag, ab); \ + } \ + } + +MAKEROWY(ARGB, 2, 1, 0, 4) +MAKEROWY(BGRA, 1, 2, 3, 4) +MAKEROWY(ABGR, 0, 1, 2, 4) +MAKEROWY(RGBA, 3, 2, 1, 4) +MAKEROWY(RGB24, 2, 1, 0, 3) +MAKEROWY(RAW, 0, 1, 2, 3) +#undef MAKEROWY + +// JPeg uses a variation on BT.601-1 full range +// y = 0.29900 * r + 0.58700 * g + 0.11400 * b +// u = -0.16874 * r - 0.33126 * g + 0.50000 * b + center +// v = 0.50000 * r - 0.41869 * g - 0.08131 * b + center +// BT.601 Mpeg range uses: +// b 0.1016 * 255 = 25.908 = 25 +// g 0.5078 * 255 = 129.489 = 129 +// r 0.2578 * 255 = 65.739 = 66 +// JPeg 8 bit Y (not used): +// b 0.11400 * 256 = 29.184 = 29 +// g 0.58700 * 256 = 150.272 = 150 +// r 0.29900 * 256 = 76.544 = 77 +// JPeg 7 bit Y: +// b 0.11400 * 128 = 14.592 = 15 +// g 0.58700 * 128 = 75.136 = 75 +// r 0.29900 * 128 = 38.272 = 38 +// JPeg 8 bit U: +// b 0.50000 * 255 = 127.5 = 127 +// g -0.33126 * 255 = -84.4713 = -84 +// r -0.16874 * 255 = -43.0287 = -43 +// JPeg 8 bit V: +// b -0.08131 * 255 = -20.73405 = -20 +// g -0.41869 * 255 = -106.76595 = -107 +// r 0.50000 * 255 = 127.5 = 127 + +static __inline int RGBToYJ(uint8_t r, uint8_t g, uint8_t b) { + return (38 * r + 75 * g + 15 * b + 64) >> 7; +} + +static __inline int RGBToUJ(uint8_t r, uint8_t g, uint8_t b) { + return (127 * b - 84 * g - 43 * r + 0x8080) >> 8; +} +static __inline int RGBToVJ(uint8_t r, uint8_t g, uint8_t b) { + return (127 * r - 107 * g - 20 * b + 0x8080) >> 8; +} + +#define AVGB(a, b) (((a) + (b) + 1) >> 1) + +// ARGBToYJ_C and ARGBToUVJ_C +#define MAKEROWYJ(NAME, R, G, B, BPP) \ + void NAME##ToYJRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width) { \ + int x; \ + for (x = 0; x < width; ++x) { \ + dst_y[0] = RGBToYJ(src_argb0[R], src_argb0[G], src_argb0[B]); \ + src_argb0 += BPP; \ + dst_y += 1; \ + } \ + } \ + void NAME##ToUVJRow_C(const uint8_t* src_rgb0, int src_stride_rgb, \ + uint8_t* dst_u, uint8_t* dst_v, int width) { \ + const uint8_t* src_rgb1 = src_rgb0 + src_stride_rgb; \ + int x; 
\ + for (x = 0; x < width - 1; x += 2) { \ + uint8_t ab = AVGB(AVGB(src_rgb0[B], src_rgb1[B]), \ + AVGB(src_rgb0[B + BPP], src_rgb1[B + BPP])); \ + uint8_t ag = AVGB(AVGB(src_rgb0[G], src_rgb1[G]), \ + AVGB(src_rgb0[G + BPP], src_rgb1[G + BPP])); \ + uint8_t ar = AVGB(AVGB(src_rgb0[R], src_rgb1[R]), \ + AVGB(src_rgb0[R + BPP], src_rgb1[R + BPP])); \ + dst_u[0] = RGBToUJ(ar, ag, ab); \ + dst_v[0] = RGBToVJ(ar, ag, ab); \ + src_rgb0 += BPP * 2; \ + src_rgb1 += BPP * 2; \ + dst_u += 1; \ + dst_v += 1; \ + } \ + if (width & 1) { \ + uint8_t ab = AVGB(src_rgb0[B], src_rgb1[B]); \ + uint8_t ag = AVGB(src_rgb0[G], src_rgb1[G]); \ + uint8_t ar = AVGB(src_rgb0[R], src_rgb1[R]); \ + dst_u[0] = RGBToUJ(ar, ag, ab); \ + dst_v[0] = RGBToVJ(ar, ag, ab); \ + } \ + } + +MAKEROWYJ(ARGB, 2, 1, 0, 4) +#undef MAKEROWYJ + +void RGB565ToYRow_C(const uint8_t* src_rgb565, uint8_t* dst_y, int width) { + int x; + for (x = 0; x < width; ++x) { + uint8_t b = src_rgb565[0] & 0x1f; + uint8_t g = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3); + uint8_t r = src_rgb565[1] >> 3; + b = (b << 3) | (b >> 2); + g = (g << 2) | (g >> 4); + r = (r << 3) | (r >> 2); + dst_y[0] = RGBToY(r, g, b); + src_rgb565 += 2; + dst_y += 1; + } +} + +void ARGB1555ToYRow_C(const uint8_t* src_argb1555, uint8_t* dst_y, int width) { + int x; + for (x = 0; x < width; ++x) { + uint8_t b = src_argb1555[0] & 0x1f; + uint8_t g = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3); + uint8_t r = (src_argb1555[1] & 0x7c) >> 2; + b = (b << 3) | (b >> 2); + g = (g << 3) | (g >> 2); + r = (r << 3) | (r >> 2); + dst_y[0] = RGBToY(r, g, b); + src_argb1555 += 2; + dst_y += 1; + } +} + +void ARGB4444ToYRow_C(const uint8_t* src_argb4444, uint8_t* dst_y, int width) { + int x; + for (x = 0; x < width; ++x) { + uint8_t b = src_argb4444[0] & 0x0f; + uint8_t g = src_argb4444[0] >> 4; + uint8_t r = src_argb4444[1] & 0x0f; + b = (b << 4) | b; + g = (g << 4) | g; + r = (r << 4) | r; + dst_y[0] = RGBToY(r, g, b); + src_argb4444 += 2; + dst_y += 1; + } +} + +void RGB565ToUVRow_C(const uint8_t* src_rgb565, + int src_stride_rgb565, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* next_rgb565 = src_rgb565 + src_stride_rgb565; + int x; + for (x = 0; x < width - 1; x += 2) { + uint8_t b0 = src_rgb565[0] & 0x1f; + uint8_t g0 = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3); + uint8_t r0 = src_rgb565[1] >> 3; + uint8_t b1 = src_rgb565[2] & 0x1f; + uint8_t g1 = (src_rgb565[2] >> 5) | ((src_rgb565[3] & 0x07) << 3); + uint8_t r1 = src_rgb565[3] >> 3; + uint8_t b2 = next_rgb565[0] & 0x1f; + uint8_t g2 = (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3); + uint8_t r2 = next_rgb565[1] >> 3; + uint8_t b3 = next_rgb565[2] & 0x1f; + uint8_t g3 = (next_rgb565[2] >> 5) | ((next_rgb565[3] & 0x07) << 3); + uint8_t r3 = next_rgb565[3] >> 3; + uint8_t b = (b0 + b1 + b2 + b3); // 565 * 4 = 787. + uint8_t g = (g0 + g1 + g2 + g3); + uint8_t r = (r0 + r1 + r2 + r3); + b = (b << 1) | (b >> 6); // 787 -> 888. + r = (r << 1) | (r >> 6); + dst_u[0] = RGBToU(r, g, b); + dst_v[0] = RGBToV(r, g, b); + src_rgb565 += 4; + next_rgb565 += 4; + dst_u += 1; + dst_v += 1; + } + if (width & 1) { + uint8_t b0 = src_rgb565[0] & 0x1f; + uint8_t g0 = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3); + uint8_t r0 = src_rgb565[1] >> 3; + uint8_t b2 = next_rgb565[0] & 0x1f; + uint8_t g2 = (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3); + uint8_t r2 = next_rgb565[1] >> 3; + uint8_t b = (b0 + b2); // 565 * 2 = 676. 
+ uint8_t g = (g0 + g2); + uint8_t r = (r0 + r2); + b = (b << 2) | (b >> 4); // 676 -> 888 + g = (g << 1) | (g >> 6); + r = (r << 2) | (r >> 4); + dst_u[0] = RGBToU(r, g, b); + dst_v[0] = RGBToV(r, g, b); + } +} + +void ARGB1555ToUVRow_C(const uint8_t* src_argb1555, + int src_stride_argb1555, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* next_argb1555 = src_argb1555 + src_stride_argb1555; + int x; + for (x = 0; x < width - 1; x += 2) { + uint8_t b0 = src_argb1555[0] & 0x1f; + uint8_t g0 = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3); + uint8_t r0 = (src_argb1555[1] & 0x7c) >> 2; + uint8_t b1 = src_argb1555[2] & 0x1f; + uint8_t g1 = (src_argb1555[2] >> 5) | ((src_argb1555[3] & 0x03) << 3); + uint8_t r1 = (src_argb1555[3] & 0x7c) >> 2; + uint8_t b2 = next_argb1555[0] & 0x1f; + uint8_t g2 = (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3); + uint8_t r2 = (next_argb1555[1] & 0x7c) >> 2; + uint8_t b3 = next_argb1555[2] & 0x1f; + uint8_t g3 = (next_argb1555[2] >> 5) | ((next_argb1555[3] & 0x03) << 3); + uint8_t r3 = (next_argb1555[3] & 0x7c) >> 2; + uint8_t b = (b0 + b1 + b2 + b3); // 555 * 4 = 777. + uint8_t g = (g0 + g1 + g2 + g3); + uint8_t r = (r0 + r1 + r2 + r3); + b = (b << 1) | (b >> 6); // 777 -> 888. + g = (g << 1) | (g >> 6); + r = (r << 1) | (r >> 6); + dst_u[0] = RGBToU(r, g, b); + dst_v[0] = RGBToV(r, g, b); + src_argb1555 += 4; + next_argb1555 += 4; + dst_u += 1; + dst_v += 1; + } + if (width & 1) { + uint8_t b0 = src_argb1555[0] & 0x1f; + uint8_t g0 = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3); + uint8_t r0 = (src_argb1555[1] & 0x7c) >> 2; + uint8_t b2 = next_argb1555[0] & 0x1f; + uint8_t g2 = (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3); + uint8_t r2 = next_argb1555[1] >> 3; + uint8_t b = (b0 + b2); // 555 * 2 = 666. + uint8_t g = (g0 + g2); + uint8_t r = (r0 + r2); + b = (b << 2) | (b >> 4); // 666 -> 888. + g = (g << 2) | (g >> 4); + r = (r << 2) | (r >> 4); + dst_u[0] = RGBToU(r, g, b); + dst_v[0] = RGBToV(r, g, b); + } +} + +void ARGB4444ToUVRow_C(const uint8_t* src_argb4444, + int src_stride_argb4444, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* next_argb4444 = src_argb4444 + src_stride_argb4444; + int x; + for (x = 0; x < width - 1; x += 2) { + uint8_t b0 = src_argb4444[0] & 0x0f; + uint8_t g0 = src_argb4444[0] >> 4; + uint8_t r0 = src_argb4444[1] & 0x0f; + uint8_t b1 = src_argb4444[2] & 0x0f; + uint8_t g1 = src_argb4444[2] >> 4; + uint8_t r1 = src_argb4444[3] & 0x0f; + uint8_t b2 = next_argb4444[0] & 0x0f; + uint8_t g2 = next_argb4444[0] >> 4; + uint8_t r2 = next_argb4444[1] & 0x0f; + uint8_t b3 = next_argb4444[2] & 0x0f; + uint8_t g3 = next_argb4444[2] >> 4; + uint8_t r3 = next_argb4444[3] & 0x0f; + uint8_t b = (b0 + b1 + b2 + b3); // 444 * 4 = 666. + uint8_t g = (g0 + g1 + g2 + g3); + uint8_t r = (r0 + r1 + r2 + r3); + b = (b << 2) | (b >> 4); // 666 -> 888. + g = (g << 2) | (g >> 4); + r = (r << 2) | (r >> 4); + dst_u[0] = RGBToU(r, g, b); + dst_v[0] = RGBToV(r, g, b); + src_argb4444 += 4; + next_argb4444 += 4; + dst_u += 1; + dst_v += 1; + } + if (width & 1) { + uint8_t b0 = src_argb4444[0] & 0x0f; + uint8_t g0 = src_argb4444[0] >> 4; + uint8_t r0 = src_argb4444[1] & 0x0f; + uint8_t b2 = next_argb4444[0] & 0x0f; + uint8_t g2 = next_argb4444[0] >> 4; + uint8_t r2 = next_argb4444[1] & 0x0f; + uint8_t b = (b0 + b2); // 444 * 2 = 555. + uint8_t g = (g0 + g2); + uint8_t r = (r0 + r2); + b = (b << 3) | (b >> 2); // 555 -> 888. 
+ g = (g << 3) | (g >> 2); + r = (r << 3) | (r >> 2); + dst_u[0] = RGBToU(r, g, b); + dst_v[0] = RGBToV(r, g, b); + } +} + +void ARGBToUV444Row_C(const uint8_t* src_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + for (x = 0; x < width; ++x) { + uint8_t ab = src_argb[0]; + uint8_t ag = src_argb[1]; + uint8_t ar = src_argb[2]; + dst_u[0] = RGBToU(ar, ag, ab); + dst_v[0] = RGBToV(ar, ag, ab); + src_argb += 4; + dst_u += 1; + dst_v += 1; + } +} + +void ARGBGrayRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width) { + int x; + for (x = 0; x < width; ++x) { + uint8_t y = RGBToYJ(src_argb[2], src_argb[1], src_argb[0]); + dst_argb[2] = dst_argb[1] = dst_argb[0] = y; + dst_argb[3] = src_argb[3]; + dst_argb += 4; + src_argb += 4; + } +} + +// Convert a row of image to Sepia tone. +void ARGBSepiaRow_C(uint8_t* dst_argb, int width) { + int x; + for (x = 0; x < width; ++x) { + int b = dst_argb[0]; + int g = dst_argb[1]; + int r = dst_argb[2]; + int sb = (b * 17 + g * 68 + r * 35) >> 7; + int sg = (b * 22 + g * 88 + r * 45) >> 7; + int sr = (b * 24 + g * 98 + r * 50) >> 7; + // b does not over flow. a is preserved from original. + dst_argb[0] = sb; + dst_argb[1] = clamp255(sg); + dst_argb[2] = clamp255(sr); + dst_argb += 4; + } +} + +// Apply color matrix to a row of image. Matrix is signed. +// TODO(fbarchard): Consider adding rounding (+32). +void ARGBColorMatrixRow_C(const uint8_t* src_argb, + uint8_t* dst_argb, + const int8_t* matrix_argb, + int width) { + int x; + for (x = 0; x < width; ++x) { + int b = src_argb[0]; + int g = src_argb[1]; + int r = src_argb[2]; + int a = src_argb[3]; + int sb = (b * matrix_argb[0] + g * matrix_argb[1] + r * matrix_argb[2] + + a * matrix_argb[3]) >> + 6; + int sg = (b * matrix_argb[4] + g * matrix_argb[5] + r * matrix_argb[6] + + a * matrix_argb[7]) >> + 6; + int sr = (b * matrix_argb[8] + g * matrix_argb[9] + r * matrix_argb[10] + + a * matrix_argb[11]) >> + 6; + int sa = (b * matrix_argb[12] + g * matrix_argb[13] + r * matrix_argb[14] + + a * matrix_argb[15]) >> + 6; + dst_argb[0] = Clamp(sb); + dst_argb[1] = Clamp(sg); + dst_argb[2] = Clamp(sr); + dst_argb[3] = Clamp(sa); + src_argb += 4; + dst_argb += 4; + } +} + +// Apply color table to a row of image. +void ARGBColorTableRow_C(uint8_t* dst_argb, + const uint8_t* table_argb, + int width) { + int x; + for (x = 0; x < width; ++x) { + int b = dst_argb[0]; + int g = dst_argb[1]; + int r = dst_argb[2]; + int a = dst_argb[3]; + dst_argb[0] = table_argb[b * 4 + 0]; + dst_argb[1] = table_argb[g * 4 + 1]; + dst_argb[2] = table_argb[r * 4 + 2]; + dst_argb[3] = table_argb[a * 4 + 3]; + dst_argb += 4; + } +} + +// Apply color table to a row of image. 
+void RGBColorTableRow_C(uint8_t* dst_argb, + const uint8_t* table_argb, + int width) { + int x; + for (x = 0; x < width; ++x) { + int b = dst_argb[0]; + int g = dst_argb[1]; + int r = dst_argb[2]; + dst_argb[0] = table_argb[b * 4 + 0]; + dst_argb[1] = table_argb[g * 4 + 1]; + dst_argb[2] = table_argb[r * 4 + 2]; + dst_argb += 4; + } +} + +void ARGBQuantizeRow_C(uint8_t* dst_argb, + int scale, + int interval_size, + int interval_offset, + int width) { + int x; + for (x = 0; x < width; ++x) { + int b = dst_argb[0]; + int g = dst_argb[1]; + int r = dst_argb[2]; + dst_argb[0] = (b * scale >> 16) * interval_size + interval_offset; + dst_argb[1] = (g * scale >> 16) * interval_size + interval_offset; + dst_argb[2] = (r * scale >> 16) * interval_size + interval_offset; + dst_argb += 4; + } +} + +#define REPEAT8(v) (v) | ((v) << 8) +#define SHADE(f, v) v* f >> 24 + +void ARGBShadeRow_C(const uint8_t* src_argb, + uint8_t* dst_argb, + int width, + uint32_t value) { + const uint32_t b_scale = REPEAT8(value & 0xff); + const uint32_t g_scale = REPEAT8((value >> 8) & 0xff); + const uint32_t r_scale = REPEAT8((value >> 16) & 0xff); + const uint32_t a_scale = REPEAT8(value >> 24); + + int i; + for (i = 0; i < width; ++i) { + const uint32_t b = REPEAT8(src_argb[0]); + const uint32_t g = REPEAT8(src_argb[1]); + const uint32_t r = REPEAT8(src_argb[2]); + const uint32_t a = REPEAT8(src_argb[3]); + dst_argb[0] = SHADE(b, b_scale); + dst_argb[1] = SHADE(g, g_scale); + dst_argb[2] = SHADE(r, r_scale); + dst_argb[3] = SHADE(a, a_scale); + src_argb += 4; + dst_argb += 4; + } +} +#undef REPEAT8 +#undef SHADE + +#define REPEAT8(v) (v) | ((v) << 8) +#define SHADE(f, v) v* f >> 16 + +void ARGBMultiplyRow_C(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + int i; + for (i = 0; i < width; ++i) { + const uint32_t b = REPEAT8(src_argb0[0]); + const uint32_t g = REPEAT8(src_argb0[1]); + const uint32_t r = REPEAT8(src_argb0[2]); + const uint32_t a = REPEAT8(src_argb0[3]); + const uint32_t b_scale = src_argb1[0]; + const uint32_t g_scale = src_argb1[1]; + const uint32_t r_scale = src_argb1[2]; + const uint32_t a_scale = src_argb1[3]; + dst_argb[0] = SHADE(b, b_scale); + dst_argb[1] = SHADE(g, g_scale); + dst_argb[2] = SHADE(r, r_scale); + dst_argb[3] = SHADE(a, a_scale); + src_argb0 += 4; + src_argb1 += 4; + dst_argb += 4; + } +} +#undef REPEAT8 +#undef SHADE + +#define SHADE(f, v) clamp255(v + f) + +void ARGBAddRow_C(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + int i; + for (i = 0; i < width; ++i) { + const int b = src_argb0[0]; + const int g = src_argb0[1]; + const int r = src_argb0[2]; + const int a = src_argb0[3]; + const int b_add = src_argb1[0]; + const int g_add = src_argb1[1]; + const int r_add = src_argb1[2]; + const int a_add = src_argb1[3]; + dst_argb[0] = SHADE(b, b_add); + dst_argb[1] = SHADE(g, g_add); + dst_argb[2] = SHADE(r, r_add); + dst_argb[3] = SHADE(a, a_add); + src_argb0 += 4; + src_argb1 += 4; + dst_argb += 4; + } +} +#undef SHADE + +#define SHADE(f, v) clamp0(f - v) + +void ARGBSubtractRow_C(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + int i; + for (i = 0; i < width; ++i) { + const int b = src_argb0[0]; + const int g = src_argb0[1]; + const int r = src_argb0[2]; + const int a = src_argb0[3]; + const int b_sub = src_argb1[0]; + const int g_sub = src_argb1[1]; + const int r_sub = src_argb1[2]; + const int a_sub = src_argb1[3]; + dst_argb[0] = SHADE(b, b_sub); + 
dst_argb[1] = SHADE(g, g_sub); + dst_argb[2] = SHADE(r, r_sub); + dst_argb[3] = SHADE(a, a_sub); + src_argb0 += 4; + src_argb1 += 4; + dst_argb += 4; + } +} +#undef SHADE + +// Sobel functions which mimics SSSE3. +void SobelXRow_C(const uint8_t* src_y0, + const uint8_t* src_y1, + const uint8_t* src_y2, + uint8_t* dst_sobelx, + int width) { + int i; + for (i = 0; i < width; ++i) { + int a = src_y0[i]; + int b = src_y1[i]; + int c = src_y2[i]; + int a_sub = src_y0[i + 2]; + int b_sub = src_y1[i + 2]; + int c_sub = src_y2[i + 2]; + int a_diff = a - a_sub; + int b_diff = b - b_sub; + int c_diff = c - c_sub; + int sobel = Abs(a_diff + b_diff * 2 + c_diff); + dst_sobelx[i] = (uint8_t)(clamp255(sobel)); + } +} + +void SobelYRow_C(const uint8_t* src_y0, + const uint8_t* src_y1, + uint8_t* dst_sobely, + int width) { + int i; + for (i = 0; i < width; ++i) { + int a = src_y0[i + 0]; + int b = src_y0[i + 1]; + int c = src_y0[i + 2]; + int a_sub = src_y1[i + 0]; + int b_sub = src_y1[i + 1]; + int c_sub = src_y1[i + 2]; + int a_diff = a - a_sub; + int b_diff = b - b_sub; + int c_diff = c - c_sub; + int sobel = Abs(a_diff + b_diff * 2 + c_diff); + dst_sobely[i] = (uint8_t)(clamp255(sobel)); + } +} + +void SobelRow_C(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, + int width) { + int i; + for (i = 0; i < width; ++i) { + int r = src_sobelx[i]; + int b = src_sobely[i]; + int s = clamp255(r + b); + dst_argb[0] = (uint8_t)(s); + dst_argb[1] = (uint8_t)(s); + dst_argb[2] = (uint8_t)(s); + dst_argb[3] = (uint8_t)(255u); + dst_argb += 4; + } +} + +void SobelToPlaneRow_C(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_y, + int width) { + int i; + for (i = 0; i < width; ++i) { + int r = src_sobelx[i]; + int b = src_sobely[i]; + int s = clamp255(r + b); + dst_y[i] = (uint8_t)(s); + } +} + +void SobelXYRow_C(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, + int width) { + int i; + for (i = 0; i < width; ++i) { + int r = src_sobelx[i]; + int b = src_sobely[i]; + int g = clamp255(r + b); + dst_argb[0] = (uint8_t)(b); + dst_argb[1] = (uint8_t)(g); + dst_argb[2] = (uint8_t)(r); + dst_argb[3] = (uint8_t)(255u); + dst_argb += 4; + } +} + +void J400ToARGBRow_C(const uint8_t* src_y, uint8_t* dst_argb, int width) { + // Copy a Y to RGB. + int x; + for (x = 0; x < width; ++x) { + uint8_t y = src_y[0]; + dst_argb[2] = dst_argb[1] = dst_argb[0] = y; + dst_argb[3] = 255u; + dst_argb += 4; + ++src_y; + } +} + +// TODO(fbarchard): Unify these structures to be platform independent. +// TODO(fbarchard): Generate SIMD structures from float matrix. + +// BT.601 YUV to RGB reference +// R = (Y - 16) * 1.164 - V * -1.596 +// G = (Y - 16) * 1.164 - U * 0.391 - V * 0.813 +// B = (Y - 16) * 1.164 - U * -2.018 + +// Y contribution to R,G,B. Scale and bias. +#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */ +#define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */ + +// U and V contributions to R,G,B. +#define UB -128 /* max(-128, round(-2.018 * 64)) */ +#define UG 25 /* round(0.391 * 64) */ +#define VG 52 /* round(0.813 * 64) */ +#define VR -102 /* round(-1.596 * 64) */ + +// Bias values to subtract 16 from Y and 128 from U and V. 
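+// For example, with the BT.601 constants above these expand to
+// BB = -128 * 128 + -1160 = -17544, BG = (25 + 52) * 128 + -1160 = 8696 and
+// BR = -102 * 128 + -1160 = -14216, in the same 6 bit fixed point that the
+// final >> 6 in YuvPixel removes.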
+#define BB (UB * 128 + YGB) +#define BG (UG * 128 + VG * 128 + YGB) +#define BR (VR * 128 + YGB) + +#if defined(__aarch64__) // 64 bit arm +const struct YuvConstants SIMD_ALIGNED(kYuvI601Constants) = { + {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR}, + {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR}, + {UG, VG, UG, VG, UG, VG, UG, VG}, + {UG, VG, UG, VG, UG, VG, UG, VG}, + {BB, BG, BR, 0, 0, 0, 0, 0}, + {0x0101 * YG, 0, 0, 0}}; +const struct YuvConstants SIMD_ALIGNED(kYvuI601Constants) = { + {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB}, + {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB}, + {VG, UG, VG, UG, VG, UG, VG, UG}, + {VG, UG, VG, UG, VG, UG, VG, UG}, + {BR, BG, BB, 0, 0, 0, 0, 0}, + {0x0101 * YG, 0, 0, 0}}; +#elif defined(__arm__) // 32 bit arm +const struct YuvConstants SIMD_ALIGNED(kYuvI601Constants) = { + {-UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0}, + {UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0}, + {BB, BG, BR, 0, 0, 0, 0, 0}, + {0x0101 * YG, 0, 0, 0}}; +const struct YuvConstants SIMD_ALIGNED(kYvuI601Constants) = { + {-VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0}, + {VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0}, + {BR, BG, BB, 0, 0, 0, 0, 0}, + {0x0101 * YG, 0, 0, 0}}; +#else +const struct YuvConstants SIMD_ALIGNED(kYuvI601Constants) = { + {UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, + UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0}, + {UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, + UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG}, + {0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, + 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR}, + {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB}, + {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG}, + {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR}, + {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}}; +const struct YuvConstants SIMD_ALIGNED(kYvuI601Constants) = { + {VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, + VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0}, + {VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, + VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG}, + {0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, + 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB}, + {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR}, + {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG}, + {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB}, + {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}}; +#endif + +#undef BB +#undef BG +#undef BR +#undef YGB +#undef UB +#undef UG +#undef VG +#undef VR +#undef YG + +// JPEG YUV to RGB reference +// * R = Y - V * -1.40200 +// * G = Y - U * 0.34414 - V * 0.71414 +// * B = Y - U * -1.77200 + +// Y contribution to R,G,B. Scale and bias. +#define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */ +#define YGB 32 /* 64 / 2 */ + +// U and V contributions to R,G,B. +#define UB -113 /* round(-1.77200 * 64) */ +#define UG 22 /* round(0.34414 * 64) */ +#define VG 46 /* round(0.71414 * 64) */ +#define VR -90 /* round(-1.40200 * 64) */ + +// Bias values to round, and subtract 128 from U and V. 
+#define BB (UB * 128 + YGB) +#define BG (UG * 128 + VG * 128 + YGB) +#define BR (VR * 128 + YGB) + +#if defined(__aarch64__) +const struct YuvConstants SIMD_ALIGNED(kYuvJPEGConstants) = { + {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR}, + {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR}, + {UG, VG, UG, VG, UG, VG, UG, VG}, + {UG, VG, UG, VG, UG, VG, UG, VG}, + {BB, BG, BR, 0, 0, 0, 0, 0}, + {0x0101 * YG, 0, 0, 0}}; +const struct YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = { + {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB}, + {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB}, + {VG, UG, VG, UG, VG, UG, VG, UG}, + {VG, UG, VG, UG, VG, UG, VG, UG}, + {BR, BG, BB, 0, 0, 0, 0, 0}, + {0x0101 * YG, 0, 0, 0}}; +#elif defined(__arm__) +const struct YuvConstants SIMD_ALIGNED(kYuvJPEGConstants) = { + {-UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0}, + {UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0}, + {BB, BG, BR, 0, 0, 0, 0, 0}, + {0x0101 * YG, 0, 0, 0}}; +const struct YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = { + {-VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0}, + {VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0}, + {BR, BG, BB, 0, 0, 0, 0, 0}, + {0x0101 * YG, 0, 0, 0}}; +#else +const struct YuvConstants SIMD_ALIGNED(kYuvJPEGConstants) = { + {UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, + UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0}, + {UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, + UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG}, + {0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, + 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR}, + {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB}, + {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG}, + {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR}, + {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}}; +const struct YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = { + {VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, + VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0}, + {VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, + VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG}, + {0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, + 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB}, + {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR}, + {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG}, + {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB}, + {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}}; +#endif + +#undef BB +#undef BG +#undef BR +#undef YGB +#undef UB +#undef UG +#undef VG +#undef VR +#undef YG + +// BT.709 YUV to RGB reference +// R = (Y - 16) * 1.164 - V * -1.793 +// G = (Y - 16) * 1.164 - U * 0.213 - V * 0.533 +// B = (Y - 16) * 1.164 - U * -2.112 +// See also http://www.equasys.de/colorconversion.html + +// Y contribution to R,G,B. Scale and bias. +#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */ +#define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */ + +// TODO(fbarchard): Find way to express 2.112 instead of 2.0. +// U and V contributions to R,G,B. +#define UB -128 /* max(-128, round(-2.112 * 64)) */ +#define UG 14 /* round(0.213 * 64) */ +#define VG 34 /* round(0.533 * 64) */ +#define VR -115 /* round(-1.793 * 64) */ + +// Bias values to round, and subtract 128 from U and V. 
+#define BB (UB * 128 + YGB) +#define BG (UG * 128 + VG * 128 + YGB) +#define BR (VR * 128 + YGB) + +#if defined(__aarch64__) +const struct YuvConstants SIMD_ALIGNED(kYuvH709Constants) = { + {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR}, + {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR}, + {UG, VG, UG, VG, UG, VG, UG, VG}, + {UG, VG, UG, VG, UG, VG, UG, VG}, + {BB, BG, BR, 0, 0, 0, 0, 0}, + {0x0101 * YG, 0, 0, 0}}; +const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants) = { + {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB}, + {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB}, + {VG, UG, VG, UG, VG, UG, VG, UG}, + {VG, UG, VG, UG, VG, UG, VG, UG}, + {BR, BG, BB, 0, 0, 0, 0, 0}, + {0x0101 * YG, 0, 0, 0}}; +#elif defined(__arm__) +const struct YuvConstants SIMD_ALIGNED(kYuvH709Constants) = { + {-UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0}, + {UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0}, + {BB, BG, BR, 0, 0, 0, 0, 0}, + {0x0101 * YG, 0, 0, 0}}; +const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants) = { + {-VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0}, + {VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0}, + {BR, BG, BB, 0, 0, 0, 0, 0}, + {0x0101 * YG, 0, 0, 0}}; +#else +const struct YuvConstants SIMD_ALIGNED(kYuvH709Constants) = { + {UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, + UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0}, + {UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, + UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG}, + {0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, + 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR}, + {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB}, + {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG}, + {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR}, + {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}}; +const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants) = { + {VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, + VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0}, + {VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, + VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG}, + {0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, + 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB}, + {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR}, + {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG}, + {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB}, + {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}}; +#endif + +#undef BB +#undef BG +#undef BR +#undef YGB +#undef UB +#undef UG +#undef VG +#undef VR +#undef YG + +// BT.2020 YUV to RGB reference +// R = (Y - 16) * 1.164384 - V * -1.67867 +// G = (Y - 16) * 1.164384 - U * 0.187326 - V * -0.65042 +// B = (Y - 16) * 1.164384 - U * -2.14177 + +// Y contribution to R,G,B. Scale and bias. +#define YG 19003 /* round(1.164384 * 64 * 256 * 256 / 257) */ +#define YGB -1160 /* 1.164384 * 64 * -16 + 64 / 2 */ + +#define UB -128 /* max(-128, round(-2.142 * 64)) */ +#define UG 12 /* round(0.187326 * 64) */ +#define VG 42 /* round(0.65042 * 64) */ +#define VR -107 /* round(-1.67867 * 64) */ + +// Bias values to round, and subtract 128 from U and V. 
+#define BB (UB * 128 + YGB) +#define BG (UG * 128 + VG * 128 + YGB) +#define BR (VR * 128 + YGB) + +#if defined(__aarch64__) +const struct YuvConstants SIMD_ALIGNED(kYuv2020Constants) = { + {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR}, + {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR}, + {UG, VG, UG, VG, UG, VG, UG, VG}, + {UG, VG, UG, VG, UG, VG, UG, VG}, + {BB, BG, BR, 0, 0, 0, 0, 0}, + {0x0101 * YG, 0, 0, 0}}; +const struct YuvConstants SIMD_ALIGNED(kYvu2020Constants) = { + {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB}, + {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB}, + {VG, UG, VG, UG, VG, UG, VG, UG}, + {VG, UG, VG, UG, VG, UG, VG, UG}, + {BR, BG, BB, 0, 0, 0, 0, 0}, + {0x0101 * YG, 0, 0, 0}}; +#elif defined(__arm__) +const struct YuvConstants SIMD_ALIGNED(kYuv2020Constants) = { + {-UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0}, + {UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0}, + {BB, BG, BR, 0, 0, 0, 0, 0}, + {0x0101 * YG, 0, 0, 0}}; +const struct YuvConstants SIMD_ALIGNED(kYvu2020Constants) = { + {-VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0}, + {VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0}, + {BR, BG, BB, 0, 0, 0, 0, 0}, + {0x0101 * YG, 0, 0, 0}}; +#else +const struct YuvConstants SIMD_ALIGNED(kYuv2020Constants) = { + {UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, + UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0}, + {UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, + UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG}, + {0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, + 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR}, + {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB}, + {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG}, + {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR}, + {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}}; +const struct YuvConstants SIMD_ALIGNED(kYvu2020Constants) = { + {VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, + VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0}, + {VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, + VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG}, + {0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, + 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB}, + {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR}, + {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG}, + {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB}, + {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}}; +#endif + +#undef BB +#undef BG +#undef BR +#undef YGB +#undef UB +#undef UG +#undef VG +#undef VR +#undef YG + +// C reference code that mimics the YUV assembly. +// Reads 8 bit YUV and leaves result as 16 bit. 
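+// Y is widened by replicating the byte (y * 0x0101) before the multiply by
+// YG, which was pre-divided by 257 to compensate; the >> 16 leaves the luma
+// contribution in 6 bit fixed point. The U and V coefficients are already
+// round(coefficient * 64), so the sums are shifted right by 6 and clamped.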
+ +static __inline void YuvPixel(uint8_t y, + uint8_t u, + uint8_t v, + uint8_t* b, + uint8_t* g, + uint8_t* r, + const struct YuvConstants* yuvconstants) { +#if defined(__aarch64__) + int ub = -yuvconstants->kUVToRB[0]; + int ug = yuvconstants->kUVToG[0]; + int vg = yuvconstants->kUVToG[1]; + int vr = -yuvconstants->kUVToRB[1]; + int bb = yuvconstants->kUVBiasBGR[0]; + int bg = yuvconstants->kUVBiasBGR[1]; + int br = yuvconstants->kUVBiasBGR[2]; + int yg = yuvconstants->kYToRgb[0] / 0x0101; +#elif defined(__arm__) + int ub = -yuvconstants->kUVToRB[0]; + int ug = yuvconstants->kUVToG[0]; + int vg = yuvconstants->kUVToG[4]; + int vr = -yuvconstants->kUVToRB[4]; + int bb = yuvconstants->kUVBiasBGR[0]; + int bg = yuvconstants->kUVBiasBGR[1]; + int br = yuvconstants->kUVBiasBGR[2]; + int yg = yuvconstants->kYToRgb[0] / 0x0101; +#else + int ub = yuvconstants->kUVToB[0]; + int ug = yuvconstants->kUVToG[0]; + int vg = yuvconstants->kUVToG[1]; + int vr = yuvconstants->kUVToR[1]; + int bb = yuvconstants->kUVBiasB[0]; + int bg = yuvconstants->kUVBiasG[0]; + int br = yuvconstants->kUVBiasR[0]; + int yg = yuvconstants->kYToRgb[0]; +#endif + + uint32_t y1 = (uint32_t)(y * 0x0101 * yg) >> 16; + *b = Clamp((int32_t)(-(u * ub) + y1 + bb) >> 6); + *g = Clamp((int32_t)(-(u * ug + v * vg) + y1 + bg) >> 6); + *r = Clamp((int32_t)(-(v * vr) + y1 + br) >> 6); +} + +// Reads 8 bit YUV and leaves result as 16 bit. +static __inline void YuvPixel8_16(uint8_t y, + uint8_t u, + uint8_t v, + int* b, + int* g, + int* r, + const struct YuvConstants* yuvconstants) { +#if defined(__aarch64__) + int ub = -yuvconstants->kUVToRB[0]; + int ug = yuvconstants->kUVToG[0]; + int vg = yuvconstants->kUVToG[1]; + int vr = -yuvconstants->kUVToRB[1]; + int bb = yuvconstants->kUVBiasBGR[0]; + int bg = yuvconstants->kUVBiasBGR[1]; + int br = yuvconstants->kUVBiasBGR[2]; + int yg = yuvconstants->kYToRgb[0] / 0x0101; +#elif defined(__arm__) + int ub = -yuvconstants->kUVToRB[0]; + int ug = yuvconstants->kUVToG[0]; + int vg = yuvconstants->kUVToG[4]; + int vr = -yuvconstants->kUVToRB[4]; + int bb = yuvconstants->kUVBiasBGR[0]; + int bg = yuvconstants->kUVBiasBGR[1]; + int br = yuvconstants->kUVBiasBGR[2]; + int yg = yuvconstants->kYToRgb[0] / 0x0101; +#else + int ub = yuvconstants->kUVToB[0]; + int ug = yuvconstants->kUVToG[0]; + int vg = yuvconstants->kUVToG[1]; + int vr = yuvconstants->kUVToR[1]; + int bb = yuvconstants->kUVBiasB[0]; + int bg = yuvconstants->kUVBiasG[0]; + int br = yuvconstants->kUVBiasR[0]; + int yg = yuvconstants->kYToRgb[0]; +#endif + + uint32_t y1 = (uint32_t)(y * 0x0101 * yg) >> 16; + *b = (int)(-(u * ub) + y1 + bb); + *g = (int)(-(u * ug + v * vg) + y1 + bg); + *r = (int)(-(v * vr) + y1 + br); +} + +// C reference code that mimics the YUV 16 bit assembly. +// Reads 10 bit YUV and leaves result as 16 bit. 
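+// The 10 bit Y is shifted up by 6 ((y << 6) * yg) to land on roughly the
+// same scale the 8 bit path reaches via y * 0x0101, and U and V are reduced
+// to 8 bits with a >> 2 before the usual 6 bit fixed point math.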
+static __inline void YuvPixel16(int16_t y, + int16_t u, + int16_t v, + int* b, + int* g, + int* r, + const struct YuvConstants* yuvconstants) { +#if defined(__aarch64__) + int ub = -yuvconstants->kUVToRB[0]; + int ug = yuvconstants->kUVToG[0]; + int vg = yuvconstants->kUVToG[1]; + int vr = -yuvconstants->kUVToRB[1]; + int bb = yuvconstants->kUVBiasBGR[0]; + int bg = yuvconstants->kUVBiasBGR[1]; + int br = yuvconstants->kUVBiasBGR[2]; + int yg = yuvconstants->kYToRgb[0] / 0x0101; +#elif defined(__arm__) + int ub = -yuvconstants->kUVToRB[0]; + int ug = yuvconstants->kUVToG[0]; + int vg = yuvconstants->kUVToG[4]; + int vr = -yuvconstants->kUVToRB[4]; + int bb = yuvconstants->kUVBiasBGR[0]; + int bg = yuvconstants->kUVBiasBGR[1]; + int br = yuvconstants->kUVBiasBGR[2]; + int yg = yuvconstants->kYToRgb[0] / 0x0101; +#else + int ub = yuvconstants->kUVToB[0]; + int ug = yuvconstants->kUVToG[0]; + int vg = yuvconstants->kUVToG[1]; + int vr = yuvconstants->kUVToR[1]; + int bb = yuvconstants->kUVBiasB[0]; + int bg = yuvconstants->kUVBiasG[0]; + int br = yuvconstants->kUVBiasR[0]; + int yg = yuvconstants->kYToRgb[0]; +#endif + + uint32_t y1 = (uint32_t)((y << 6) * yg) >> 16; + u = clamp255(u >> 2); + v = clamp255(v >> 2); + *b = (int)(-(u * ub) + y1 + bb); + *g = (int)(-(u * ug + v * vg) + y1 + bg); + *r = (int)(-(v * vr) + y1 + br); +} + +// C reference code that mimics the YUV 10 bit assembly. +// Reads 10 bit YUV and clamps down to 8 bit RGB. +static __inline void YuvPixel10(uint16_t y, + uint16_t u, + uint16_t v, + uint8_t* b, + uint8_t* g, + uint8_t* r, + const struct YuvConstants* yuvconstants) { + int b16; + int g16; + int r16; + YuvPixel16(y, u, v, &b16, &g16, &r16, yuvconstants); + *b = Clamp(b16 >> 6); + *g = Clamp(g16 >> 6); + *r = Clamp(r16 >> 6); +} + +// Y contribution to R,G,B. Scale and bias. +#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */ +#define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */ + +// C reference code that mimics the YUV assembly. +static __inline void YPixel(uint8_t y, uint8_t* b, uint8_t* g, uint8_t* r) { + uint32_t y1 = (uint32_t)(y * 0x0101 * YG) >> 16; + *b = Clamp((int32_t)(y1 + YGB) >> 6); + *g = Clamp((int32_t)(y1 + YGB) >> 6); + *r = Clamp((int32_t)(y1 + YGB) >> 6); +} + +#undef YG +#undef YGB + +#if !defined(LIBYUV_DISABLE_NEON) && \ + (defined(__ARM_NEON__) || defined(__aarch64__) || defined(LIBYUV_NEON)) +// C mimic assembly. +// TODO(fbarchard): Remove subsampling from Neon. +void I444ToARGBRow_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + uint8_t u = (src_u[0] + src_u[1] + 1) >> 1; + uint8_t v = (src_v[0] + src_v[1] + 1) >> 1; + YuvPixel(src_y[0], u, v, rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, + yuvconstants); + rgb_buf[3] = 255; + YuvPixel(src_y[1], u, v, rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, + yuvconstants); + rgb_buf[7] = 255; + src_y += 2; + src_u += 2; + src_v += 2; + rgb_buf += 8; // Advance 2 pixels. 
+ } + if (width & 1) { + YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + rgb_buf[3] = 255; + } +} +#else +void I444ToARGBRow_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + for (x = 0; x < width; ++x) { + YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + rgb_buf[3] = 255; + src_y += 1; + src_u += 1; + src_v += 1; + rgb_buf += 4; // Advance 1 pixel. + } +} +#endif + +// Also used for 420 +void I422ToARGBRow_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + rgb_buf[3] = 255; + YuvPixel(src_y[1], src_u[0], src_v[0], rgb_buf + 4, rgb_buf + 5, + rgb_buf + 6, yuvconstants); + rgb_buf[7] = 255; + src_y += 2; + src_u += 1; + src_v += 1; + rgb_buf += 8; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + rgb_buf[3] = 255; + } +} + +// 10 bit YUV to ARGB +void I210ToARGBRow_C(const uint16_t* src_y, + const uint16_t* src_u, + const uint16_t* src_v, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel10(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + rgb_buf[3] = 255; + YuvPixel10(src_y[1], src_u[0], src_v[0], rgb_buf + 4, rgb_buf + 5, + rgb_buf + 6, yuvconstants); + rgb_buf[7] = 255; + src_y += 2; + src_u += 1; + src_v += 1; + rgb_buf += 8; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel10(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + rgb_buf[3] = 255; + } +} + +static void StoreAR30(uint8_t* rgb_buf, int b, int g, int r) { + uint32_t ar30; + b = b >> 4; // convert 10.6 to 10 bit. + g = g >> 4; + r = r >> 4; + b = Clamp10(b); + g = Clamp10(g); + r = Clamp10(r); + ar30 = b | ((uint32_t)g << 10) | ((uint32_t)r << 20) | 0xc0000000; + (*(uint32_t*)rgb_buf) = ar30; +} + +// 10 bit YUV to 10 bit AR30 +void I210ToAR30Row_C(const uint16_t* src_y, + const uint16_t* src_u, + const uint16_t* src_v, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + int b; + int g; + int r; + for (x = 0; x < width - 1; x += 2) { + YuvPixel16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants); + StoreAR30(rgb_buf, b, g, r); + YuvPixel16(src_y[1], src_u[0], src_v[0], &b, &g, &r, yuvconstants); + StoreAR30(rgb_buf + 4, b, g, r); + src_y += 2; + src_u += 1; + src_v += 1; + rgb_buf += 8; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants); + StoreAR30(rgb_buf, b, g, r); + } +} + +// 8 bit YUV to 10 bit AR30 +// Uses same code as 10 bit YUV bit shifts the 8 bit values up to 10 bits. 
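+// StoreAR30 above drops the low 4 bits of each 16 bit intermediate, clamps
+// to the 10 bit range and packs B, G and R at bit offsets 0, 10 and 20, with
+// the 2 bit alpha field forced to 3 (0xc0000000).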
+void I422ToAR30Row_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + int b; + int g; + int r; + for (x = 0; x < width - 1; x += 2) { + YuvPixel8_16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants); + StoreAR30(rgb_buf, b, g, r); + YuvPixel8_16(src_y[1], src_u[0], src_v[0], &b, &g, &r, yuvconstants); + StoreAR30(rgb_buf + 4, b, g, r); + src_y += 2; + src_u += 1; + src_v += 1; + rgb_buf += 8; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel8_16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants); + StoreAR30(rgb_buf, b, g, r); + } +} + +void I422AlphaToARGBRow_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + const uint8_t* src_a, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + rgb_buf[3] = src_a[0]; + YuvPixel(src_y[1], src_u[0], src_v[0], rgb_buf + 4, rgb_buf + 5, + rgb_buf + 6, yuvconstants); + rgb_buf[7] = src_a[1]; + src_y += 2; + src_u += 1; + src_v += 1; + src_a += 2; + rgb_buf += 8; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + rgb_buf[3] = src_a[0]; + } +} + +void I422ToRGB24Row_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + YuvPixel(src_y[1], src_u[0], src_v[0], rgb_buf + 3, rgb_buf + 4, + rgb_buf + 5, yuvconstants); + src_y += 2; + src_u += 1; + src_v += 1; + rgb_buf += 6; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + } +} + +void I422ToARGB4444Row_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb4444, + const struct YuvConstants* yuvconstants, + int width) { + uint8_t b0; + uint8_t g0; + uint8_t r0; + uint8_t b1; + uint8_t g1; + uint8_t r1; + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants); + YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1, yuvconstants); + b0 = b0 >> 4; + g0 = g0 >> 4; + r0 = r0 >> 4; + b1 = b1 >> 4; + g1 = g1 >> 4; + r1 = r1 >> 4; + *(uint32_t*)(dst_argb4444) = b0 | (g0 << 4) | (r0 << 8) | (b1 << 16) | + (g1 << 20) | (r1 << 24) | 0xf000f000; + src_y += 2; + src_u += 1; + src_v += 1; + dst_argb4444 += 4; // Advance 2 pixels. 
+ } + if (width & 1) { + YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants); + b0 = b0 >> 4; + g0 = g0 >> 4; + r0 = r0 >> 4; + *(uint16_t*)(dst_argb4444) = b0 | (g0 << 4) | (r0 << 8) | 0xf000; + } +} + +void I422ToARGB1555Row_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb1555, + const struct YuvConstants* yuvconstants, + int width) { + uint8_t b0; + uint8_t g0; + uint8_t r0; + uint8_t b1; + uint8_t g1; + uint8_t r1; + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants); + YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1, yuvconstants); + b0 = b0 >> 3; + g0 = g0 >> 3; + r0 = r0 >> 3; + b1 = b1 >> 3; + g1 = g1 >> 3; + r1 = r1 >> 3; + *(uint32_t*)(dst_argb1555) = b0 | (g0 << 5) | (r0 << 10) | (b1 << 16) | + (g1 << 21) | (r1 << 26) | 0x80008000; + src_y += 2; + src_u += 1; + src_v += 1; + dst_argb1555 += 4; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants); + b0 = b0 >> 3; + g0 = g0 >> 3; + r0 = r0 >> 3; + *(uint16_t*)(dst_argb1555) = b0 | (g0 << 5) | (r0 << 10) | 0x8000; + } +} + +void I422ToRGB565Row_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb565, + const struct YuvConstants* yuvconstants, + int width) { + uint8_t b0; + uint8_t g0; + uint8_t r0; + uint8_t b1; + uint8_t g1; + uint8_t r1; + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants); + YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1, yuvconstants); + b0 = b0 >> 3; + g0 = g0 >> 2; + r0 = r0 >> 3; + b1 = b1 >> 3; + g1 = g1 >> 2; + r1 = r1 >> 3; + *(uint32_t*)(dst_rgb565) = + b0 | (g0 << 5) | (r0 << 11) | (b1 << 16) | (g1 << 21) | (r1 << 27); + src_y += 2; + src_u += 1; + src_v += 1; + dst_rgb565 += 4; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants); + b0 = b0 >> 3; + g0 = g0 >> 2; + r0 = r0 >> 3; + *(uint16_t*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11); + } +} + +void NV12ToARGBRow_C(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_y[0], src_uv[0], src_uv[1], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + rgb_buf[3] = 255; + YuvPixel(src_y[1], src_uv[0], src_uv[1], rgb_buf + 4, rgb_buf + 5, + rgb_buf + 6, yuvconstants); + rgb_buf[7] = 255; + src_y += 2; + src_uv += 2; + rgb_buf += 8; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(src_y[0], src_uv[0], src_uv[1], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + rgb_buf[3] = 255; + } +} + +void NV21ToARGBRow_C(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_y[0], src_vu[1], src_vu[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + rgb_buf[3] = 255; + YuvPixel(src_y[1], src_vu[1], src_vu[0], rgb_buf + 4, rgb_buf + 5, + rgb_buf + 6, yuvconstants); + rgb_buf[7] = 255; + src_y += 2; + src_vu += 2; + rgb_buf += 8; // Advance 2 pixels. 
+ } + if (width & 1) { + YuvPixel(src_y[0], src_vu[1], src_vu[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + rgb_buf[3] = 255; + } +} + +void NV12ToRGB24Row_C(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_y[0], src_uv[0], src_uv[1], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + YuvPixel(src_y[1], src_uv[0], src_uv[1], rgb_buf + 3, rgb_buf + 4, + rgb_buf + 5, yuvconstants); + src_y += 2; + src_uv += 2; + rgb_buf += 6; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(src_y[0], src_uv[0], src_uv[1], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + } +} + +void NV21ToRGB24Row_C(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_y[0], src_vu[1], src_vu[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + YuvPixel(src_y[1], src_vu[1], src_vu[0], rgb_buf + 3, rgb_buf + 4, + rgb_buf + 5, yuvconstants); + src_y += 2; + src_vu += 2; + rgb_buf += 6; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(src_y[0], src_vu[1], src_vu[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + } +} + +void NV12ToRGB565Row_C(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_rgb565, + const struct YuvConstants* yuvconstants, + int width) { + uint8_t b0; + uint8_t g0; + uint8_t r0; + uint8_t b1; + uint8_t g1; + uint8_t r1; + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_y[0], src_uv[0], src_uv[1], &b0, &g0, &r0, yuvconstants); + YuvPixel(src_y[1], src_uv[0], src_uv[1], &b1, &g1, &r1, yuvconstants); + b0 = b0 >> 3; + g0 = g0 >> 2; + r0 = r0 >> 3; + b1 = b1 >> 3; + g1 = g1 >> 2; + r1 = r1 >> 3; + *(uint32_t*)(dst_rgb565) = + b0 | (g0 << 5) | (r0 << 11) | (b1 << 16) | (g1 << 21) | (r1 << 27); + src_y += 2; + src_uv += 2; + dst_rgb565 += 4; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(src_y[0], src_uv[0], src_uv[1], &b0, &g0, &r0, yuvconstants); + b0 = b0 >> 3; + g0 = g0 >> 2; + r0 = r0 >> 3; + *(uint16_t*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11); + } +} + +void YUY2ToARGBRow_C(const uint8_t* src_yuy2, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + rgb_buf[3] = 255; + YuvPixel(src_yuy2[2], src_yuy2[1], src_yuy2[3], rgb_buf + 4, rgb_buf + 5, + rgb_buf + 6, yuvconstants); + rgb_buf[7] = 255; + src_yuy2 += 4; + rgb_buf += 8; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + rgb_buf[3] = 255; + } +} + +void UYVYToARGBRow_C(const uint8_t* src_uyvy, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + rgb_buf[3] = 255; + YuvPixel(src_uyvy[3], src_uyvy[0], src_uyvy[2], rgb_buf + 4, rgb_buf + 5, + rgb_buf + 6, yuvconstants); + rgb_buf[7] = 255; + src_uyvy += 4; + rgb_buf += 8; // Advance 2 pixels. 
+ } + if (width & 1) { + YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + rgb_buf[3] = 255; + } +} + +void I422ToRGBARow_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 1, rgb_buf + 2, + rgb_buf + 3, yuvconstants); + rgb_buf[0] = 255; + YuvPixel(src_y[1], src_u[0], src_v[0], rgb_buf + 5, rgb_buf + 6, + rgb_buf + 7, yuvconstants); + rgb_buf[4] = 255; + src_y += 2; + src_u += 1; + src_v += 1; + rgb_buf += 8; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 1, rgb_buf + 2, + rgb_buf + 3, yuvconstants); + rgb_buf[0] = 255; + } +} + +void I400ToARGBRow_C(const uint8_t* src_y, uint8_t* rgb_buf, int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + YPixel(src_y[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + rgb_buf[3] = 255; + YPixel(src_y[1], rgb_buf + 4, rgb_buf + 5, rgb_buf + 6); + rgb_buf[7] = 255; + src_y += 2; + rgb_buf += 8; // Advance 2 pixels. + } + if (width & 1) { + YPixel(src_y[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + rgb_buf[3] = 255; + } +} + +void MirrorRow_C(const uint8_t* src, uint8_t* dst, int width) { + int x; + src += width - 1; + for (x = 0; x < width - 1; x += 2) { + dst[x] = src[0]; + dst[x + 1] = src[-1]; + src -= 2; + } + if (width & 1) { + dst[width - 1] = src[0]; + } +} + +void MirrorUVRow_C(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + src_uv += (width - 1) << 1; + for (x = 0; x < width - 1; x += 2) { + dst_u[x] = src_uv[0]; + dst_u[x + 1] = src_uv[-2]; + dst_v[x] = src_uv[1]; + dst_v[x + 1] = src_uv[-2 + 1]; + src_uv -= 4; + } + if (width & 1) { + dst_u[width - 1] = src_uv[0]; + dst_v[width - 1] = src_uv[1]; + } +} + +void ARGBMirrorRow_C(const uint8_t* src, uint8_t* dst, int width) { + int x; + const uint32_t* src32 = (const uint32_t*)(src); + uint32_t* dst32 = (uint32_t*)(dst); + src32 += width - 1; + for (x = 0; x < width - 1; x += 2) { + dst32[x] = src32[0]; + dst32[x + 1] = src32[-1]; + src32 -= 2; + } + if (width & 1) { + dst32[width - 1] = src32[0]; + } +} + +void SplitUVRow_C(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + dst_u[x] = src_uv[0]; + dst_u[x + 1] = src_uv[2]; + dst_v[x] = src_uv[1]; + dst_v[x + 1] = src_uv[3]; + src_uv += 4; + } + if (width & 1) { + dst_u[width - 1] = src_uv[0]; + dst_v[width - 1] = src_uv[1]; + } +} + +void MergeUVRow_C(const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uv, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + dst_uv[0] = src_u[x]; + dst_uv[1] = src_v[x]; + dst_uv[2] = src_u[x + 1]; + dst_uv[3] = src_v[x + 1]; + dst_uv += 4; + } + if (width & 1) { + dst_uv[0] = src_u[width - 1]; + dst_uv[1] = src_v[width - 1]; + } +} + +void SplitRGBRow_C(const uint8_t* src_rgb, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + int width) { + int x; + for (x = 0; x < width; ++x) { + dst_r[x] = src_rgb[0]; + dst_g[x] = src_rgb[1]; + dst_b[x] = src_rgb[2]; + src_rgb += 3; + } +} + +void MergeRGBRow_C(const uint8_t* src_r, + const uint8_t* src_g, + const uint8_t* src_b, + uint8_t* dst_rgb, + int width) { + int x; + for (x = 0; x < width; ++x) { + dst_rgb[0] = src_r[x]; + dst_rgb[1] = src_g[x]; + dst_rgb[2] = src_b[x]; + dst_rgb += 3; + } +} + +// Use scale to convert lsb formats to 
msb, depending how many bits there are: +// 128 = 9 bits +// 64 = 10 bits +// 16 = 12 bits +// 1 = 16 bits +void MergeUVRow_16_C(const uint16_t* src_u, + const uint16_t* src_v, + uint16_t* dst_uv, + int scale, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + dst_uv[0] = src_u[x] * scale; + dst_uv[1] = src_v[x] * scale; + dst_uv[2] = src_u[x + 1] * scale; + dst_uv[3] = src_v[x + 1] * scale; + dst_uv += 4; + } + if (width & 1) { + dst_uv[0] = src_u[width - 1] * scale; + dst_uv[1] = src_v[width - 1] * scale; + } +} + +void MultiplyRow_16_C(const uint16_t* src_y, + uint16_t* dst_y, + int scale, + int width) { + int x; + for (x = 0; x < width; ++x) { + dst_y[x] = src_y[x] * scale; + } +} + +// Use scale to convert lsb formats to msb, depending how many bits there are: +// 32768 = 9 bits +// 16384 = 10 bits +// 4096 = 12 bits +// 256 = 16 bits +void Convert16To8Row_C(const uint16_t* src_y, + uint8_t* dst_y, + int scale, + int width) { + int x; + for (x = 0; x < width; ++x) { + dst_y[x] = clamp255((src_y[x] * scale) >> 16); + } +} + +// Use scale to convert lsb formats to msb, depending how many bits there are: +// 1024 = 10 bits +void Convert8To16Row_C(const uint8_t* src_y, + uint16_t* dst_y, + int scale, + int width) { + int x; + scale *= 0x0101; // replicates the byte. + for (x = 0; x < width; ++x) { + dst_y[x] = (src_y[x] * scale) >> 16; + } +} + +void CopyRow_C(const uint8_t* src, uint8_t* dst, int count) { + memcpy(dst, src, count); +} + +void CopyRow_16_C(const uint16_t* src, uint16_t* dst, int count) { + memcpy(dst, src, count * 2); +} + +void SetRow_C(uint8_t* dst, uint8_t v8, int width) { + memset(dst, v8, width); +} + +void ARGBSetRow_C(uint8_t* dst_argb, uint32_t v32, int width) { + uint32_t* d = (uint32_t*)(dst_argb); + int x; + for (x = 0; x < width; ++x) { + d[x] = v32; + } +} + +// Filter 2 rows of YUY2 UV's (422) into U and V (420). +void YUY2ToUVRow_C(const uint8_t* src_yuy2, + int src_stride_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + // Output a row of UV values, filtering 2 rows of YUY2. + int x; + for (x = 0; x < width; x += 2) { + dst_u[0] = (src_yuy2[1] + src_yuy2[src_stride_yuy2 + 1] + 1) >> 1; + dst_v[0] = (src_yuy2[3] + src_yuy2[src_stride_yuy2 + 3] + 1) >> 1; + src_yuy2 += 4; + dst_u += 1; + dst_v += 1; + } +} + +// Copy row of YUY2 UV's (422) into U and V (422). +void YUY2ToUV422Row_C(const uint8_t* src_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + // Output a row of UV values. + int x; + for (x = 0; x < width; x += 2) { + dst_u[0] = src_yuy2[1]; + dst_v[0] = src_yuy2[3]; + src_yuy2 += 4; + dst_u += 1; + dst_v += 1; + } +} + +// Copy row of YUY2 Y's (422) into Y (420/422). +void YUY2ToYRow_C(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { + // Output a row of Y values. + int x; + for (x = 0; x < width - 1; x += 2) { + dst_y[x] = src_yuy2[0]; + dst_y[x + 1] = src_yuy2[2]; + src_yuy2 += 4; + } + if (width & 1) { + dst_y[width - 1] = src_yuy2[0]; + } +} + +// Filter 2 rows of UYVY UV's (422) into U and V (420). +void UYVYToUVRow_C(const uint8_t* src_uyvy, + int src_stride_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + // Output a row of UV values. + int x; + for (x = 0; x < width; x += 2) { + dst_u[0] = (src_uyvy[0] + src_uyvy[src_stride_uyvy + 0] + 1) >> 1; + dst_v[0] = (src_uyvy[2] + src_uyvy[src_stride_uyvy + 2] + 1) >> 1; + src_uyvy += 4; + dst_u += 1; + dst_v += 1; + } +} + +// Copy row of UYVY UV's (422) into U and V (422). 
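+// UYVY stores each pair of pixels as U0 Y0 V0 Y1, so U and V sit at byte
+// offsets 0 and 2; YUY2 above is Y0 U0 Y1 V0, with U and V at offsets 1 and 3.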
+void UYVYToUV422Row_C(const uint8_t* src_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + // Output a row of UV values. + int x; + for (x = 0; x < width; x += 2) { + dst_u[0] = src_uyvy[0]; + dst_v[0] = src_uyvy[2]; + src_uyvy += 4; + dst_u += 1; + dst_v += 1; + } +} + +// Copy row of UYVY Y's (422) into Y (420/422). +void UYVYToYRow_C(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { + // Output a row of Y values. + int x; + for (x = 0; x < width - 1; x += 2) { + dst_y[x] = src_uyvy[1]; + dst_y[x + 1] = src_uyvy[3]; + src_uyvy += 4; + } + if (width & 1) { + dst_y[width - 1] = src_uyvy[1]; + } +} + +#define BLEND(f, b, a) (((256 - a) * b) >> 8) + f + +// Blend src_argb0 over src_argb1 and store to dst_argb. +// dst_argb may be src_argb0 or src_argb1. +// This code mimics the SSSE3 version for better testability. +void ARGBBlendRow_C(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + uint32_t fb = src_argb0[0]; + uint32_t fg = src_argb0[1]; + uint32_t fr = src_argb0[2]; + uint32_t a = src_argb0[3]; + uint32_t bb = src_argb1[0]; + uint32_t bg = src_argb1[1]; + uint32_t br = src_argb1[2]; + dst_argb[0] = BLEND(fb, bb, a); + dst_argb[1] = BLEND(fg, bg, a); + dst_argb[2] = BLEND(fr, br, a); + dst_argb[3] = 255u; + + fb = src_argb0[4 + 0]; + fg = src_argb0[4 + 1]; + fr = src_argb0[4 + 2]; + a = src_argb0[4 + 3]; + bb = src_argb1[4 + 0]; + bg = src_argb1[4 + 1]; + br = src_argb1[4 + 2]; + dst_argb[4 + 0] = BLEND(fb, bb, a); + dst_argb[4 + 1] = BLEND(fg, bg, a); + dst_argb[4 + 2] = BLEND(fr, br, a); + dst_argb[4 + 3] = 255u; + src_argb0 += 8; + src_argb1 += 8; + dst_argb += 8; + } + + if (width & 1) { + uint32_t fb = src_argb0[0]; + uint32_t fg = src_argb0[1]; + uint32_t fr = src_argb0[2]; + uint32_t a = src_argb0[3]; + uint32_t bb = src_argb1[0]; + uint32_t bg = src_argb1[1]; + uint32_t br = src_argb1[2]; + dst_argb[0] = BLEND(fb, bb, a); + dst_argb[1] = BLEND(fg, bg, a); + dst_argb[2] = BLEND(fr, br, a); + dst_argb[3] = 255u; + } +} +#undef BLEND + +#define UBLEND(f, b, a) (((a)*f) + ((255 - a) * b) + 255) >> 8 +void BlendPlaneRow_C(const uint8_t* src0, + const uint8_t* src1, + const uint8_t* alpha, + uint8_t* dst, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + dst[0] = UBLEND(src0[0], src1[0], alpha[0]); + dst[1] = UBLEND(src0[1], src1[1], alpha[1]); + src0 += 2; + src1 += 2; + alpha += 2; + dst += 2; + } + if (width & 1) { + dst[0] = UBLEND(src0[0], src1[0], alpha[0]); + } +} +#undef UBLEND + +#define ATTENUATE(f, a) (a | (a << 8)) * (f | (f << 8)) >> 24 + +// Multiply source RGB by alpha and store to destination. +// This code mimics the SSSE3 version for better testability. 
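+// ATTENUATE(f, a) multiplies the byte-replicated values f * 0x0101 and
+// a * 0x0101 and keeps the top byte of the 32 bit product, a close
+// approximation of f * a / 255 using only shifts, ORs and one multiply.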
+void ARGBAttenuateRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width) { + int i; + for (i = 0; i < width - 1; i += 2) { + uint32_t b = src_argb[0]; + uint32_t g = src_argb[1]; + uint32_t r = src_argb[2]; + uint32_t a = src_argb[3]; + dst_argb[0] = ATTENUATE(b, a); + dst_argb[1] = ATTENUATE(g, a); + dst_argb[2] = ATTENUATE(r, a); + dst_argb[3] = a; + b = src_argb[4]; + g = src_argb[5]; + r = src_argb[6]; + a = src_argb[7]; + dst_argb[4] = ATTENUATE(b, a); + dst_argb[5] = ATTENUATE(g, a); + dst_argb[6] = ATTENUATE(r, a); + dst_argb[7] = a; + src_argb += 8; + dst_argb += 8; + } + + if (width & 1) { + const uint32_t b = src_argb[0]; + const uint32_t g = src_argb[1]; + const uint32_t r = src_argb[2]; + const uint32_t a = src_argb[3]; + dst_argb[0] = ATTENUATE(b, a); + dst_argb[1] = ATTENUATE(g, a); + dst_argb[2] = ATTENUATE(r, a); + dst_argb[3] = a; + } +} +#undef ATTENUATE + +// Divide source RGB by alpha and store to destination. +// b = (b * 255 + (a / 2)) / a; +// g = (g * 255 + (a / 2)) / a; +// r = (r * 255 + (a / 2)) / a; +// Reciprocal method is off by 1 on some values. ie 125 +// 8.8 fixed point inverse table with 1.0 in upper short and 1 / a in lower. +#define T(a) 0x01000000 + (0x10000 / a) +const uint32_t fixed_invtbl8[256] = { + 0x01000000, 0x0100ffff, T(0x02), T(0x03), T(0x04), T(0x05), T(0x06), + T(0x07), T(0x08), T(0x09), T(0x0a), T(0x0b), T(0x0c), T(0x0d), + T(0x0e), T(0x0f), T(0x10), T(0x11), T(0x12), T(0x13), T(0x14), + T(0x15), T(0x16), T(0x17), T(0x18), T(0x19), T(0x1a), T(0x1b), + T(0x1c), T(0x1d), T(0x1e), T(0x1f), T(0x20), T(0x21), T(0x22), + T(0x23), T(0x24), T(0x25), T(0x26), T(0x27), T(0x28), T(0x29), + T(0x2a), T(0x2b), T(0x2c), T(0x2d), T(0x2e), T(0x2f), T(0x30), + T(0x31), T(0x32), T(0x33), T(0x34), T(0x35), T(0x36), T(0x37), + T(0x38), T(0x39), T(0x3a), T(0x3b), T(0x3c), T(0x3d), T(0x3e), + T(0x3f), T(0x40), T(0x41), T(0x42), T(0x43), T(0x44), T(0x45), + T(0x46), T(0x47), T(0x48), T(0x49), T(0x4a), T(0x4b), T(0x4c), + T(0x4d), T(0x4e), T(0x4f), T(0x50), T(0x51), T(0x52), T(0x53), + T(0x54), T(0x55), T(0x56), T(0x57), T(0x58), T(0x59), T(0x5a), + T(0x5b), T(0x5c), T(0x5d), T(0x5e), T(0x5f), T(0x60), T(0x61), + T(0x62), T(0x63), T(0x64), T(0x65), T(0x66), T(0x67), T(0x68), + T(0x69), T(0x6a), T(0x6b), T(0x6c), T(0x6d), T(0x6e), T(0x6f), + T(0x70), T(0x71), T(0x72), T(0x73), T(0x74), T(0x75), T(0x76), + T(0x77), T(0x78), T(0x79), T(0x7a), T(0x7b), T(0x7c), T(0x7d), + T(0x7e), T(0x7f), T(0x80), T(0x81), T(0x82), T(0x83), T(0x84), + T(0x85), T(0x86), T(0x87), T(0x88), T(0x89), T(0x8a), T(0x8b), + T(0x8c), T(0x8d), T(0x8e), T(0x8f), T(0x90), T(0x91), T(0x92), + T(0x93), T(0x94), T(0x95), T(0x96), T(0x97), T(0x98), T(0x99), + T(0x9a), T(0x9b), T(0x9c), T(0x9d), T(0x9e), T(0x9f), T(0xa0), + T(0xa1), T(0xa2), T(0xa3), T(0xa4), T(0xa5), T(0xa6), T(0xa7), + T(0xa8), T(0xa9), T(0xaa), T(0xab), T(0xac), T(0xad), T(0xae), + T(0xaf), T(0xb0), T(0xb1), T(0xb2), T(0xb3), T(0xb4), T(0xb5), + T(0xb6), T(0xb7), T(0xb8), T(0xb9), T(0xba), T(0xbb), T(0xbc), + T(0xbd), T(0xbe), T(0xbf), T(0xc0), T(0xc1), T(0xc2), T(0xc3), + T(0xc4), T(0xc5), T(0xc6), T(0xc7), T(0xc8), T(0xc9), T(0xca), + T(0xcb), T(0xcc), T(0xcd), T(0xce), T(0xcf), T(0xd0), T(0xd1), + T(0xd2), T(0xd3), T(0xd4), T(0xd5), T(0xd6), T(0xd7), T(0xd8), + T(0xd9), T(0xda), T(0xdb), T(0xdc), T(0xdd), T(0xde), T(0xdf), + T(0xe0), T(0xe1), T(0xe2), T(0xe3), T(0xe4), T(0xe5), T(0xe6), + T(0xe7), T(0xe8), T(0xe9), T(0xea), T(0xeb), T(0xec), T(0xed), + T(0xee), T(0xef), T(0xf0), T(0xf1), T(0xf2), T(0xf3), T(0xf4), + 
T(0xf5), T(0xf6), T(0xf7), T(0xf8), T(0xf9), T(0xfa), T(0xfb), + T(0xfc), T(0xfd), T(0xfe), 0x01000100}; +#undef T + +void ARGBUnattenuateRow_C(const uint8_t* src_argb, + uint8_t* dst_argb, + int width) { + int i; + for (i = 0; i < width; ++i) { + uint32_t b = src_argb[0]; + uint32_t g = src_argb[1]; + uint32_t r = src_argb[2]; + const uint32_t a = src_argb[3]; + const uint32_t ia = fixed_invtbl8[a] & 0xffff; // 8.8 fixed point + b = (b * ia) >> 8; + g = (g * ia) >> 8; + r = (r * ia) >> 8; + // Clamping should not be necessary but is free in assembly. + dst_argb[0] = clamp255(b); + dst_argb[1] = clamp255(g); + dst_argb[2] = clamp255(r); + dst_argb[3] = a; + src_argb += 4; + dst_argb += 4; + } +} + +void ComputeCumulativeSumRow_C(const uint8_t* row, + int32_t* cumsum, + const int32_t* previous_cumsum, + int width) { + int32_t row_sum[4] = {0, 0, 0, 0}; + int x; + for (x = 0; x < width; ++x) { + row_sum[0] += row[x * 4 + 0]; + row_sum[1] += row[x * 4 + 1]; + row_sum[2] += row[x * 4 + 2]; + row_sum[3] += row[x * 4 + 3]; + cumsum[x * 4 + 0] = row_sum[0] + previous_cumsum[x * 4 + 0]; + cumsum[x * 4 + 1] = row_sum[1] + previous_cumsum[x * 4 + 1]; + cumsum[x * 4 + 2] = row_sum[2] + previous_cumsum[x * 4 + 2]; + cumsum[x * 4 + 3] = row_sum[3] + previous_cumsum[x * 4 + 3]; + } +} + +void CumulativeSumToAverageRow_C(const int32_t* tl, + const int32_t* bl, + int w, + int area, + uint8_t* dst, + int count) { + float ooa = 1.0f / area; + int i; + for (i = 0; i < count; ++i) { + dst[0] = (uint8_t)((bl[w + 0] + tl[0] - bl[0] - tl[w + 0]) * ooa); + dst[1] = (uint8_t)((bl[w + 1] + tl[1] - bl[1] - tl[w + 1]) * ooa); + dst[2] = (uint8_t)((bl[w + 2] + tl[2] - bl[2] - tl[w + 2]) * ooa); + dst[3] = (uint8_t)((bl[w + 3] + tl[3] - bl[3] - tl[w + 3]) * ooa); + dst += 4; + tl += 4; + bl += 4; + } +} + +// Copy pixels from rotated source to destination row with a slope. +LIBYUV_API +void ARGBAffineRow_C(const uint8_t* src_argb, + int src_argb_stride, + uint8_t* dst_argb, + const float* uv_dudv, + int width) { + int i; + // Render a row of pixels from source into a buffer. + float uv[2]; + uv[0] = uv_dudv[0]; + uv[1] = uv_dudv[1]; + for (i = 0; i < width; ++i) { + int x = (int)(uv[0]); + int y = (int)(uv[1]); + *(uint32_t*)(dst_argb) = + *(const uint32_t*)(src_argb + y * src_argb_stride + x * 4); + dst_argb += 4; + uv[0] += uv_dudv[2]; + uv[1] += uv_dudv[3]; + } +} + +// Blend 2 rows into 1. +static void HalfRow_C(const uint8_t* src_uv, + ptrdiff_t src_uv_stride, + uint8_t* dst_uv, + int width) { + int x; + for (x = 0; x < width; ++x) { + dst_uv[x] = (src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1; + } +} + +static void HalfRow_16_C(const uint16_t* src_uv, + ptrdiff_t src_uv_stride, + uint16_t* dst_uv, + int width) { + int x; + for (x = 0; x < width; ++x) { + dst_uv[x] = (src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1; + } +} + +// C version 2x2 -> 2x1. 
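+// Blends two rows with an 8 bit fraction:
+//   dst = (src * (256 - fraction) + src1 * fraction + 128) >> 8,
+// with fraction == 0 reduced to a memcpy and fraction == 128 to HalfRow_C.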
+void InterpolateRow_C(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride, + int width, + int source_y_fraction) { + int y1_fraction = source_y_fraction; + int y0_fraction = 256 - y1_fraction; + const uint8_t* src_ptr1 = src_ptr + src_stride; + int x; + if (y1_fraction == 0) { + memcpy(dst_ptr, src_ptr, width); + return; + } + if (y1_fraction == 128) { + HalfRow_C(src_ptr, src_stride, dst_ptr, width); + return; + } + for (x = 0; x < width - 1; x += 2) { + dst_ptr[0] = + (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction + 128) >> 8; + dst_ptr[1] = + (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction + 128) >> 8; + src_ptr += 2; + src_ptr1 += 2; + dst_ptr += 2; + } + if (width & 1) { + dst_ptr[0] = + (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction + 128) >> 8; + } +} + +void InterpolateRow_16_C(uint16_t* dst_ptr, + const uint16_t* src_ptr, + ptrdiff_t src_stride, + int width, + int source_y_fraction) { + int y1_fraction = source_y_fraction; + int y0_fraction = 256 - y1_fraction; + const uint16_t* src_ptr1 = src_ptr + src_stride; + int x; + if (source_y_fraction == 0) { + memcpy(dst_ptr, src_ptr, width * 2); + return; + } + if (source_y_fraction == 128) { + HalfRow_16_C(src_ptr, src_stride, dst_ptr, width); + return; + } + for (x = 0; x < width - 1; x += 2) { + dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8; + dst_ptr[1] = (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8; + src_ptr += 2; + src_ptr1 += 2; + dst_ptr += 2; + } + if (width & 1) { + dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8; + } +} + +// Use first 4 shuffler values to reorder ARGB channels. +void ARGBShuffleRow_C(const uint8_t* src_argb, + uint8_t* dst_argb, + const uint8_t* shuffler, + int width) { + int index0 = shuffler[0]; + int index1 = shuffler[1]; + int index2 = shuffler[2]; + int index3 = shuffler[3]; + // Shuffle a row of ARGB. + int x; + for (x = 0; x < width; ++x) { + // To support in-place conversion. 
+ uint8_t b = src_argb[index0]; + uint8_t g = src_argb[index1]; + uint8_t r = src_argb[index2]; + uint8_t a = src_argb[index3]; + dst_argb[0] = b; + dst_argb[1] = g; + dst_argb[2] = r; + dst_argb[3] = a; + src_argb += 4; + dst_argb += 4; + } +} + +void I422ToYUY2Row_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_frame, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + dst_frame[0] = src_y[0]; + dst_frame[1] = src_u[0]; + dst_frame[2] = src_y[1]; + dst_frame[3] = src_v[0]; + dst_frame += 4; + src_y += 2; + src_u += 1; + src_v += 1; + } + if (width & 1) { + dst_frame[0] = src_y[0]; + dst_frame[1] = src_u[0]; + dst_frame[2] = 0; + dst_frame[3] = src_v[0]; + } +} + +void I422ToUYVYRow_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_frame, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + dst_frame[0] = src_u[0]; + dst_frame[1] = src_y[0]; + dst_frame[2] = src_v[0]; + dst_frame[3] = src_y[1]; + dst_frame += 4; + src_y += 2; + src_u += 1; + src_v += 1; + } + if (width & 1) { + dst_frame[0] = src_u[0]; + dst_frame[1] = src_y[0]; + dst_frame[2] = src_v[0]; + dst_frame[3] = 0; + } +} + +void ARGBPolynomialRow_C(const uint8_t* src_argb, + uint8_t* dst_argb, + const float* poly, + int width) { + int i; + for (i = 0; i < width; ++i) { + float b = (float)(src_argb[0]); + float g = (float)(src_argb[1]); + float r = (float)(src_argb[2]); + float a = (float)(src_argb[3]); + float b2 = b * b; + float g2 = g * g; + float r2 = r * r; + float a2 = a * a; + float db = poly[0] + poly[4] * b; + float dg = poly[1] + poly[5] * g; + float dr = poly[2] + poly[6] * r; + float da = poly[3] + poly[7] * a; + float b3 = b2 * b; + float g3 = g2 * g; + float r3 = r2 * r; + float a3 = a2 * a; + db += poly[8] * b2; + dg += poly[9] * g2; + dr += poly[10] * r2; + da += poly[11] * a2; + db += poly[12] * b3; + dg += poly[13] * g3; + dr += poly[14] * r3; + da += poly[15] * a3; + + dst_argb[0] = Clamp((int32_t)(db)); + dst_argb[1] = Clamp((int32_t)(dg)); + dst_argb[2] = Clamp((int32_t)(dr)); + dst_argb[3] = Clamp((int32_t)(da)); + src_argb += 4; + dst_argb += 4; + } +} + +// Samples assumed to be unsigned in low 9, 10 or 12 bits. Scale factor +// adjust the source integer range to the half float range desired. + +// This magic constant is 2^-112. Multiplying by this +// is the same as subtracting 112 from the exponent, which +// is the difference in exponent bias between 32-bit and +// 16-bit floats. Once we've done this subtraction, we can +// simply extract the low bits of the exponent and the high +// bits of the mantissa from our float and we're done. + +void HalfFloatRow_C(const uint16_t* src, + uint16_t* dst, + float scale, + int width) { + int i; + float mult = 1.9259299444e-34f * scale; + for (i = 0; i < width; ++i) { + float value = src[i] * mult; + dst[i] = (uint16_t)((*(uint32_t*)&value) >> 13); + } +} + +void ByteToFloatRow_C(const uint8_t* src, float* dst, float scale, int width) { + int i; + for (i = 0; i < width; ++i) { + float value = src[i] * scale; + dst[i] = value; + } +} + +void ARGBLumaColorTableRow_C(const uint8_t* src_argb, + uint8_t* dst_argb, + int width, + const uint8_t* luma, + uint32_t lumacoeff) { + uint32_t bc = lumacoeff & 0xff; + uint32_t gc = (lumacoeff >> 8) & 0xff; + uint32_t rc = (lumacoeff >> 16) & 0xff; + + int i; + for (i = 0; i < width - 1; i += 2) { + // Luminance in rows, color values in columns. 
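+    // The weighted sum keeps bits 8..14 (& 0x7F00), selecting one of 128
+    // 256-byte rows of the luma table; each channel byte then indexes into
+    // that row.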
+ const uint8_t* luma0 = + ((src_argb[0] * bc + src_argb[1] * gc + src_argb[2] * rc) & 0x7F00u) + + luma; + const uint8_t* luma1; + dst_argb[0] = luma0[src_argb[0]]; + dst_argb[1] = luma0[src_argb[1]]; + dst_argb[2] = luma0[src_argb[2]]; + dst_argb[3] = src_argb[3]; + luma1 = + ((src_argb[4] * bc + src_argb[5] * gc + src_argb[6] * rc) & 0x7F00u) + + luma; + dst_argb[4] = luma1[src_argb[4]]; + dst_argb[5] = luma1[src_argb[5]]; + dst_argb[6] = luma1[src_argb[6]]; + dst_argb[7] = src_argb[7]; + src_argb += 8; + dst_argb += 8; + } + if (width & 1) { + // Luminance in rows, color values in columns. + const uint8_t* luma0 = + ((src_argb[0] * bc + src_argb[1] * gc + src_argb[2] * rc) & 0x7F00u) + + luma; + dst_argb[0] = luma0[src_argb[0]]; + dst_argb[1] = luma0[src_argb[1]]; + dst_argb[2] = luma0[src_argb[2]]; + dst_argb[3] = src_argb[3]; + } +} + +void ARGBCopyAlphaRow_C(const uint8_t* src, uint8_t* dst, int width) { + int i; + for (i = 0; i < width - 1; i += 2) { + dst[3] = src[3]; + dst[7] = src[7]; + dst += 8; + src += 8; + } + if (width & 1) { + dst[3] = src[3]; + } +} + +void ARGBExtractAlphaRow_C(const uint8_t* src_argb, uint8_t* dst_a, int width) { + int i; + for (i = 0; i < width - 1; i += 2) { + dst_a[0] = src_argb[3]; + dst_a[1] = src_argb[7]; + dst_a += 2; + src_argb += 8; + } + if (width & 1) { + dst_a[0] = src_argb[3]; + } +} + +void ARGBCopyYToAlphaRow_C(const uint8_t* src, uint8_t* dst, int width) { + int i; + for (i = 0; i < width - 1; i += 2) { + dst[3] = src[0]; + dst[7] = src[1]; + dst += 8; + src += 2; + } + if (width & 1) { + dst[3] = src[0]; + } +} + +// Maximum temporary width for wrappers to process at a time, in pixels. +#define MAXTWIDTH 2048 + +#if !(defined(_MSC_VER) && defined(_M_IX86)) && \ + defined(HAS_I422TORGB565ROW_SSSE3) +// row_win.cc has asm version, but GCC uses 2 step wrapper. +void I422ToRGB565Row_SSSE3(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb565, + const struct YuvConstants* yuvconstants, + int width) { + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, yuvconstants, twidth); + ARGBToRGB565Row_SSE2(row, dst_rgb565, twidth); + src_y += twidth; + src_u += twidth / 2; + src_v += twidth / 2; + dst_rgb565 += twidth * 2; + width -= twidth; + } +} +#endif + +#if defined(HAS_I422TOARGB1555ROW_SSSE3) +void I422ToARGB1555Row_SSSE3(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb1555, + const struct YuvConstants* yuvconstants, + int width) { + // Row buffer for intermediate ARGB pixels. + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, yuvconstants, twidth); + ARGBToARGB1555Row_SSE2(row, dst_argb1555, twidth); + src_y += twidth; + src_u += twidth / 2; + src_v += twidth / 2; + dst_argb1555 += twidth * 2; + width -= twidth; + } +} +#endif + +#if defined(HAS_I422TOARGB4444ROW_SSSE3) +void I422ToARGB4444Row_SSSE3(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb4444, + const struct YuvConstants* yuvconstants, + int width) { + // Row buffer for intermediate ARGB pixels. + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? 
MAXTWIDTH : width; + I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, yuvconstants, twidth); + ARGBToARGB4444Row_SSE2(row, dst_argb4444, twidth); + src_y += twidth; + src_u += twidth / 2; + src_v += twidth / 2; + dst_argb4444 += twidth * 2; + width -= twidth; + } +} +#endif + +#if defined(HAS_NV12TORGB565ROW_SSSE3) +void NV12ToRGB565Row_SSSE3(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_rgb565, + const struct YuvConstants* yuvconstants, + int width) { + // Row buffer for intermediate ARGB pixels. + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + NV12ToARGBRow_SSSE3(src_y, src_uv, row, yuvconstants, twidth); + ARGBToRGB565Row_SSE2(row, dst_rgb565, twidth); + src_y += twidth; + src_uv += twidth; + dst_rgb565 += twidth * 2; + width -= twidth; + } +} +#endif + +#if defined(HAS_I422TORGB565ROW_AVX2) +void I422ToRGB565Row_AVX2(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb565, + const struct YuvConstants* yuvconstants, + int width) { + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth); +#if defined(HAS_ARGBTORGB565ROW_AVX2) + ARGBToRGB565Row_AVX2(row, dst_rgb565, twidth); +#else + ARGBToRGB565Row_SSE2(row, dst_rgb565, twidth); +#endif + src_y += twidth; + src_u += twidth / 2; + src_v += twidth / 2; + dst_rgb565 += twidth * 2; + width -= twidth; + } +} +#endif + +#if defined(HAS_I422TOARGB1555ROW_AVX2) +void I422ToARGB1555Row_AVX2(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb1555, + const struct YuvConstants* yuvconstants, + int width) { + // Row buffer for intermediate ARGB pixels. + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth); +#if defined(HAS_ARGBTOARGB1555ROW_AVX2) + ARGBToARGB1555Row_AVX2(row, dst_argb1555, twidth); +#else + ARGBToARGB1555Row_SSE2(row, dst_argb1555, twidth); +#endif + src_y += twidth; + src_u += twidth / 2; + src_v += twidth / 2; + dst_argb1555 += twidth * 2; + width -= twidth; + } +} +#endif + +#if defined(HAS_I422TOARGB4444ROW_AVX2) +void I422ToARGB4444Row_AVX2(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb4444, + const struct YuvConstants* yuvconstants, + int width) { + // Row buffer for intermediate ARGB pixels. + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth); +#if defined(HAS_ARGBTOARGB4444ROW_AVX2) + ARGBToARGB4444Row_AVX2(row, dst_argb4444, twidth); +#else + ARGBToARGB4444Row_SSE2(row, dst_argb4444, twidth); +#endif + src_y += twidth; + src_u += twidth / 2; + src_v += twidth / 2; + dst_argb4444 += twidth * 2; + width -= twidth; + } +} +#endif + +#if defined(HAS_I422TORGB24ROW_AVX2) +void I422ToRGB24Row_AVX2(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + // Row buffer for intermediate ARGB pixels. + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? 
MAXTWIDTH : width; + I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth); + // TODO(fbarchard): ARGBToRGB24Row_AVX2 + ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth); + src_y += twidth; + src_u += twidth / 2; + src_v += twidth / 2; + dst_rgb24 += twidth * 3; + width -= twidth; + } +} +#endif + +#if defined(HAS_NV12TORGB565ROW_AVX2) +void NV12ToRGB565Row_AVX2(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_rgb565, + const struct YuvConstants* yuvconstants, + int width) { + // Row buffer for intermediate ARGB pixels. + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + NV12ToARGBRow_AVX2(src_y, src_uv, row, yuvconstants, twidth); +#if defined(HAS_ARGBTORGB565ROW_AVX2) + ARGBToRGB565Row_AVX2(row, dst_rgb565, twidth); +#else + ARGBToRGB565Row_SSE2(row, dst_rgb565, twidth); +#endif + src_y += twidth; + src_uv += twidth; + dst_rgb565 += twidth * 2; + width -= twidth; + } +} +#endif + +float ScaleSumSamples_C(const float* src, float* dst, float scale, int width) { + float fsum = 0.f; + int i; +#if defined(__clang__) +#pragma clang loop vectorize_width(4) +#endif + for (i = 0; i < width; ++i) { + float v = *src++; + fsum += v * v; + *dst++ = v * scale; + } + return fsum; +} + +float ScaleMaxSamples_C(const float* src, float* dst, float scale, int width) { + float fmax = 0.f; + int i; + for (i = 0; i < width; ++i) { + float v = *src++; + float vs = v * scale; + fmax = (v > fmax) ? v : fmax; + *dst++ = vs; + } + return fmax; +} + +void ScaleSamples_C(const float* src, float* dst, float scale, int width) { + int i; + for (i = 0; i < width; ++i) { + *dst++ = *src++ * scale; + } +} + +void GaussRow_C(const uint32_t* src, uint16_t* dst, int width) { + int i; + for (i = 0; i < width; ++i) { + *dst++ = + (src[0] + src[1] * 4 + src[2] * 6 + src[3] * 4 + src[4] + 128) >> 8; + ++src; + } +} + +// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row. +void GaussCol_C(const uint16_t* src0, + const uint16_t* src1, + const uint16_t* src2, + const uint16_t* src3, + const uint16_t* src4, + uint32_t* dst, + int width) { + int i; + for (i = 0; i < width; ++i) { + *dst++ = *src0++ + *src1++ * 4 + *src2++ * 6 + *src3++ * 4 + *src4++; + } +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/media/libyuv/libyuv/source/row_gcc.cc b/media/libyuv/libyuv/source/row_gcc.cc new file mode 100644 index 0000000000..95845c2592 --- /dev/null +++ b/media/libyuv/libyuv/source/row_gcc.cc @@ -0,0 +1,6534 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// This module is for GCC x86 and x64. +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) + +#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3) + +// Constants for ARGB +static const vec8 kARGBToY = {13, 65, 33, 0, 13, 65, 33, 0, + 13, 65, 33, 0, 13, 65, 33, 0}; + +// JPeg full range. 
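// Added note: the kARGBToYJ weights below are BT.601 luma coefficients in
// B,G,R order scaled by 128 for pmaddubsw (15 ~ 0.114*128, 75 ~ 0.587*128,
// 38 ~ 0.299*128), rounded with +64 and shifted right 7 with no offset.  The
// studio-range kARGBToY above uses the same weights further scaled by
// roughly 219/255 and adds 16 after the shift instead of rounding.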
+static const vec8 kARGBToYJ = {15, 75, 38, 0, 15, 75, 38, 0, + 15, 75, 38, 0, 15, 75, 38, 0}; +#endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3) + +#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3) + +static const vec8 kARGBToU = {112, -74, -38, 0, 112, -74, -38, 0, + 112, -74, -38, 0, 112, -74, -38, 0}; + +static const vec8 kARGBToUJ = {127, -84, -43, 0, 127, -84, -43, 0, + 127, -84, -43, 0, 127, -84, -43, 0}; + +static const vec8 kARGBToV = {-18, -94, 112, 0, -18, -94, 112, 0, + -18, -94, 112, 0, -18, -94, 112, 0}; + +static const vec8 kARGBToVJ = {-20, -107, 127, 0, -20, -107, 127, 0, + -20, -107, 127, 0, -20, -107, 127, 0}; + +// Constants for BGRA +static const vec8 kBGRAToY = {0, 33, 65, 13, 0, 33, 65, 13, + 0, 33, 65, 13, 0, 33, 65, 13}; + +static const vec8 kBGRAToU = {0, -38, -74, 112, 0, -38, -74, 112, + 0, -38, -74, 112, 0, -38, -74, 112}; + +static const vec8 kBGRAToV = {0, 112, -94, -18, 0, 112, -94, -18, + 0, 112, -94, -18, 0, 112, -94, -18}; + +// Constants for ABGR +static const vec8 kABGRToY = {33, 65, 13, 0, 33, 65, 13, 0, + 33, 65, 13, 0, 33, 65, 13, 0}; + +static const vec8 kABGRToU = {-38, -74, 112, 0, -38, -74, 112, 0, + -38, -74, 112, 0, -38, -74, 112, 0}; + +static const vec8 kABGRToV = {112, -94, -18, 0, 112, -94, -18, 0, + 112, -94, -18, 0, 112, -94, -18, 0}; + +// Constants for RGBA. +static const vec8 kRGBAToY = {0, 13, 65, 33, 0, 13, 65, 33, + 0, 13, 65, 33, 0, 13, 65, 33}; + +static const vec8 kRGBAToU = {0, 112, -74, -38, 0, 112, -74, -38, + 0, 112, -74, -38, 0, 112, -74, -38}; + +static const vec8 kRGBAToV = {0, -18, -94, 112, 0, -18, -94, 112, + 0, -18, -94, 112, 0, -18, -94, 112}; + +static const uvec8 kAddY16 = {16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, + 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u}; + +// 7 bit fixed point 0.5. +static const vec16 kAddYJ64 = {64, 64, 64, 64, 64, 64, 64, 64}; + +static const uvec8 kAddUV128 = {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, + 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u}; + +static const uvec16 kAddUVJ128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u, + 0x8080u, 0x8080u, 0x8080u, 0x8080u}; +#endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3) + +#ifdef HAS_RGB24TOARGBROW_SSSE3 + +// Shuffle table for converting RGB24 to ARGB. +static const uvec8 kShuffleMaskRGB24ToARGB = { + 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u}; + +// Shuffle table for converting RAW to ARGB. +static const uvec8 kShuffleMaskRAWToARGB = {2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, + 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u}; + +// Shuffle table for converting RAW to RGB24. First 8. +static const uvec8 kShuffleMaskRAWToRGB24_0 = { + 2u, 1u, 0u, 5u, 4u, 3u, 8u, 7u, + 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u}; + +// Shuffle table for converting RAW to RGB24. Middle 8. +static const uvec8 kShuffleMaskRAWToRGB24_1 = { + 2u, 7u, 6u, 5u, 10u, 9u, 8u, 13u, + 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u}; + +// Shuffle table for converting RAW to RGB24. Last 8. +static const uvec8 kShuffleMaskRAWToRGB24_2 = { + 8u, 7u, 12u, 11u, 10u, 15u, 14u, 13u, + 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u}; + +// Shuffle table for converting ARGB to RGB24. +static const uvec8 kShuffleMaskARGBToRGB24 = { + 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u}; + +// Shuffle table for converting ARGB to RAW. 
+static const uvec8 kShuffleMaskARGBToRAW = { + 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u}; + +// Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4 +static const uvec8 kShuffleMaskARGBToRGB24_0 = { + 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u}; + +// YUY2 shuf 16 Y to 32 Y. +static const lvec8 kShuffleYUY2Y = {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, + 10, 12, 12, 14, 14, 0, 0, 2, 2, 4, 4, + 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}; + +// YUY2 shuf 8 UV to 16 UV. +static const lvec8 kShuffleYUY2UV = {1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, + 11, 13, 15, 13, 15, 1, 3, 1, 3, 5, 7, + 5, 7, 9, 11, 9, 11, 13, 15, 13, 15}; + +// UYVY shuf 16 Y to 32 Y. +static const lvec8 kShuffleUYVYY = {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, + 11, 13, 13, 15, 15, 1, 1, 3, 3, 5, 5, + 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}; + +// UYVY shuf 8 UV to 16 UV. +static const lvec8 kShuffleUYVYUV = {0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, + 10, 12, 14, 12, 14, 0, 2, 0, 2, 4, 6, + 4, 6, 8, 10, 8, 10, 12, 14, 12, 14}; + +// NV21 shuf 8 VU to 16 UV. +static const lvec8 kShuffleNV21 = { + 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6, + 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6, +}; +#endif // HAS_RGB24TOARGBROW_SSSE3 + +#ifdef HAS_J400TOARGBROW_SSE2 +void J400ToARGBRow_SSE2(const uint8_t* src_y, uint8_t* dst_argb, int width) { + asm volatile( + "pcmpeqb %%xmm5,%%xmm5 \n" + "pslld $0x18,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movq (%0),%%xmm0 \n" + "lea 0x8(%0),%0 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm0,%%xmm0 \n" + "punpckhwd %%xmm1,%%xmm1 \n" + "por %%xmm5,%%xmm0 \n" + "por %%xmm5,%%xmm1 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm1,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_y), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + ::"memory", + "cc", "xmm0", "xmm1", "xmm5"); +} +#endif // HAS_J400TOARGBROW_SSE2 + +#ifdef HAS_RGB24TOARGBROW_SSSE3 +void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24, + uint8_t* dst_argb, + int width) { + asm volatile( + "pcmpeqb %%xmm5,%%xmm5 \n" // 0xff000000 + "pslld $0x18,%%xmm5 \n" + "movdqa %3,%%xmm4 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm3 \n" + "lea 0x30(%0),%0 \n" + "movdqa %%xmm3,%%xmm2 \n" + "palignr $0x8,%%xmm1,%%xmm2 \n" + "pshufb %%xmm4,%%xmm2 \n" + "por %%xmm5,%%xmm2 \n" + "palignr $0xc,%%xmm0,%%xmm1 \n" + "pshufb %%xmm4,%%xmm0 \n" + "movdqu %%xmm2,0x20(%1) \n" + "por %%xmm5,%%xmm0 \n" + "pshufb %%xmm4,%%xmm1 \n" + "movdqu %%xmm0,(%1) \n" + "por %%xmm5,%%xmm1 \n" + "palignr $0x4,%%xmm3,%%xmm3 \n" + "pshufb %%xmm4,%%xmm3 \n" + "movdqu %%xmm1,0x10(%1) \n" + "por %%xmm5,%%xmm3 \n" + "movdqu %%xmm3,0x30(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_rgb24), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "m"(kShuffleMaskRGB24ToARGB) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} + +void RAWToARGBRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_argb, int width) { + asm volatile( + "pcmpeqb %%xmm5,%%xmm5 \n" // 0xff000000 + "pslld $0x18,%%xmm5 \n" + "movdqa %3,%%xmm4 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm3 \n" + "lea 0x30(%0),%0 \n" + "movdqa %%xmm3,%%xmm2 \n" + "palignr $0x8,%%xmm1,%%xmm2 \n" + "pshufb %%xmm4,%%xmm2 \n" + "por %%xmm5,%%xmm2 \n" + "palignr $0xc,%%xmm0,%%xmm1 \n" + "pshufb %%xmm4,%%xmm0 \n" + "movdqu %%xmm2,0x20(%1) \n" + 
"por %%xmm5,%%xmm0 \n" + "pshufb %%xmm4,%%xmm1 \n" + "movdqu %%xmm0,(%1) \n" + "por %%xmm5,%%xmm1 \n" + "palignr $0x4,%%xmm3,%%xmm3 \n" + "pshufb %%xmm4,%%xmm3 \n" + "movdqu %%xmm1,0x10(%1) \n" + "por %%xmm5,%%xmm3 \n" + "movdqu %%xmm3,0x30(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_raw), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "m"(kShuffleMaskRAWToARGB) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} + +void RAWToRGB24Row_SSSE3(const uint8_t* src_raw, + uint8_t* dst_rgb24, + int width) { + asm volatile( + "movdqa %3,%%xmm3 \n" + "movdqa %4,%%xmm4 \n" + "movdqa %5,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x4(%0),%%xmm1 \n" + "movdqu 0x8(%0),%%xmm2 \n" + "lea 0x18(%0),%0 \n" + "pshufb %%xmm3,%%xmm0 \n" + "pshufb %%xmm4,%%xmm1 \n" + "pshufb %%xmm5,%%xmm2 \n" + "movq %%xmm0,(%1) \n" + "movq %%xmm1,0x8(%1) \n" + "movq %%xmm2,0x10(%1) \n" + "lea 0x18(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_raw), // %0 + "+r"(dst_rgb24), // %1 + "+r"(width) // %2 + : "m"(kShuffleMaskRAWToRGB24_0), // %3 + "m"(kShuffleMaskRAWToRGB24_1), // %4 + "m"(kShuffleMaskRAWToRGB24_2) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} + +void RGB565ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "mov $0x1080108,%%eax \n" + "movd %%eax,%%xmm5 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" + "mov $0x20802080,%%eax \n" + "movd %%eax,%%xmm6 \n" + "pshufd $0x0,%%xmm6,%%xmm6 \n" + "pcmpeqb %%xmm3,%%xmm3 \n" + "psllw $0xb,%%xmm3 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "psllw $0xa,%%xmm4 \n" + "psrlw $0x5,%%xmm4 \n" + "pcmpeqb %%xmm7,%%xmm7 \n" + "psllw $0x8,%%xmm7 \n" + "sub %0,%1 \n" + "sub %0,%1 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "pand %%xmm3,%%xmm1 \n" + "psllw $0xb,%%xmm2 \n" + "pmulhuw %%xmm5,%%xmm1 \n" + "pmulhuw %%xmm5,%%xmm2 \n" + "psllw $0x8,%%xmm1 \n" + "por %%xmm2,%%xmm1 \n" + "pand %%xmm4,%%xmm0 \n" + "pmulhuw %%xmm6,%%xmm0 \n" + "por %%xmm7,%%xmm0 \n" + "movdqa %%xmm1,%%xmm2 \n" + "punpcklbw %%xmm0,%%xmm1 \n" + "punpckhbw %%xmm0,%%xmm2 \n" + "movdqu %%xmm1,0x00(%1,%0,2) \n" + "movdqu %%xmm2,0x10(%1,%0,2) \n" + "lea 0x10(%0),%0 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", + "xmm6", "xmm7"); +} + +void ARGB1555ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "mov $0x1080108,%%eax \n" + "movd %%eax,%%xmm5 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" + "mov $0x42004200,%%eax \n" + "movd %%eax,%%xmm6 \n" + "pshufd $0x0,%%xmm6,%%xmm6 \n" + "pcmpeqb %%xmm3,%%xmm3 \n" + "psllw $0xb,%%xmm3 \n" + "movdqa %%xmm3,%%xmm4 \n" + "psrlw $0x6,%%xmm4 \n" + "pcmpeqb %%xmm7,%%xmm7 \n" + "psllw $0x8,%%xmm7 \n" + "sub %0,%1 \n" + "sub %0,%1 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "psllw $0x1,%%xmm1 \n" + "psllw $0xb,%%xmm2 \n" + "pand %%xmm3,%%xmm1 \n" + "pmulhuw %%xmm5,%%xmm2 \n" + "pmulhuw %%xmm5,%%xmm1 \n" + "psllw $0x8,%%xmm1 \n" + "por %%xmm2,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "pand %%xmm4,%%xmm0 \n" + "psraw $0x8,%%xmm2 \n" + "pmulhuw %%xmm6,%%xmm0 \n" + "pand %%xmm7,%%xmm2 \n" + "por %%xmm2,%%xmm0 \n" + "movdqa %%xmm1,%%xmm2 \n" + "punpcklbw %%xmm0,%%xmm1 \n" + "punpckhbw %%xmm0,%%xmm2 \n" + "movdqu %%xmm1,0x00(%1,%0,2) \n" + "movdqu %%xmm2,0x10(%1,%0,2) \n" + 
"lea 0x10(%0),%0 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", + "xmm6", "xmm7"); +} + +void ARGB4444ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "mov $0xf0f0f0f,%%eax \n" + "movd %%eax,%%xmm4 \n" + "pshufd $0x0,%%xmm4,%%xmm4 \n" + "movdqa %%xmm4,%%xmm5 \n" + "pslld $0x4,%%xmm5 \n" + "sub %0,%1 \n" + "sub %0,%1 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "pand %%xmm4,%%xmm0 \n" + "pand %%xmm5,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm3 \n" + "psllw $0x4,%%xmm1 \n" + "psrlw $0x4,%%xmm3 \n" + "por %%xmm1,%%xmm0 \n" + "por %%xmm3,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm2,%%xmm0 \n" + "punpckhbw %%xmm2,%%xmm1 \n" + "movdqu %%xmm0,0x00(%1,%0,2) \n" + "movdqu %%xmm1,0x10(%1,%0,2) \n" + "lea 0x10(%0),%0 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} + +void ARGBToRGB24Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + + "movdqa %3,%%xmm6 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm3 \n" + "lea 0x40(%0),%0 \n" + "pshufb %%xmm6,%%xmm0 \n" + "pshufb %%xmm6,%%xmm1 \n" + "pshufb %%xmm6,%%xmm2 \n" + "pshufb %%xmm6,%%xmm3 \n" + "movdqa %%xmm1,%%xmm4 \n" + "psrldq $0x4,%%xmm1 \n" + "pslldq $0xc,%%xmm4 \n" + "movdqa %%xmm2,%%xmm5 \n" + "por %%xmm4,%%xmm0 \n" + "pslldq $0x8,%%xmm5 \n" + "movdqu %%xmm0,(%1) \n" + "por %%xmm5,%%xmm1 \n" + "psrldq $0x8,%%xmm2 \n" + "pslldq $0x4,%%xmm3 \n" + "por %%xmm3,%%xmm2 \n" + "movdqu %%xmm1,0x10(%1) \n" + "movdqu %%xmm2,0x20(%1) \n" + "lea 0x30(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "m"(kShuffleMaskARGBToRGB24) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); +} + +void ARGBToRAWRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + + "movdqa %3,%%xmm6 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm3 \n" + "lea 0x40(%0),%0 \n" + "pshufb %%xmm6,%%xmm0 \n" + "pshufb %%xmm6,%%xmm1 \n" + "pshufb %%xmm6,%%xmm2 \n" + "pshufb %%xmm6,%%xmm3 \n" + "movdqa %%xmm1,%%xmm4 \n" + "psrldq $0x4,%%xmm1 \n" + "pslldq $0xc,%%xmm4 \n" + "movdqa %%xmm2,%%xmm5 \n" + "por %%xmm4,%%xmm0 \n" + "pslldq $0x8,%%xmm5 \n" + "movdqu %%xmm0,(%1) \n" + "por %%xmm5,%%xmm1 \n" + "psrldq $0x8,%%xmm2 \n" + "pslldq $0x4,%%xmm3 \n" + "por %%xmm3,%%xmm2 \n" + "movdqu %%xmm1,0x10(%1) \n" + "movdqu %%xmm2,0x20(%1) \n" + "lea 0x30(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "m"(kShuffleMaskARGBToRAW) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); +} + +void ARGBToRGB565Row_SSE2(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "pcmpeqb %%xmm3,%%xmm3 \n" + "psrld $0x1b,%%xmm3 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "psrld $0x1a,%%xmm4 \n" + "pslld $0x5,%%xmm4 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "pslld $0xb,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "pslld $0x8,%%xmm0 \n" + "psrld $0x3,%%xmm1 \n" + "psrld $0x5,%%xmm2 \n" + "psrad 
$0x10,%%xmm0 \n" + "pand %%xmm3,%%xmm1 \n" + "pand %%xmm4,%%xmm2 \n" + "pand %%xmm5,%%xmm0 \n" + "por %%xmm2,%%xmm1 \n" + "por %%xmm1,%%xmm0 \n" + "packssdw %%xmm0,%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + ::"memory", + "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} + +void ARGBToRGB565DitherRow_SSE2(const uint8_t* src, + uint8_t* dst, + const uint32_t dither4, + int width) { + asm volatile( + "movd %3,%%xmm6 \n" + "punpcklbw %%xmm6,%%xmm6 \n" + "movdqa %%xmm6,%%xmm7 \n" + "punpcklwd %%xmm6,%%xmm6 \n" + "punpckhwd %%xmm7,%%xmm7 \n" + "pcmpeqb %%xmm3,%%xmm3 \n" + "psrld $0x1b,%%xmm3 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "psrld $0x1a,%%xmm4 \n" + "pslld $0x5,%%xmm4 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "pslld $0xb,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "paddusb %%xmm6,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "pslld $0x8,%%xmm0 \n" + "psrld $0x3,%%xmm1 \n" + "psrld $0x5,%%xmm2 \n" + "psrad $0x10,%%xmm0 \n" + "pand %%xmm3,%%xmm1 \n" + "pand %%xmm4,%%xmm2 \n" + "pand %%xmm5,%%xmm0 \n" + "por %%xmm2,%%xmm1 \n" + "por %%xmm1,%%xmm0 \n" + "packssdw %%xmm0,%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "m"(dither4) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} + +#ifdef HAS_ARGBTORGB565DITHERROW_AVX2 +void ARGBToRGB565DitherRow_AVX2(const uint8_t* src, + uint8_t* dst, + const uint32_t dither4, + int width) { + asm volatile( + "vbroadcastss %3,%%xmm6 \n" + "vpunpcklbw %%xmm6,%%xmm6,%%xmm6 \n" + "vpermq $0xd8,%%ymm6,%%ymm6 \n" + "vpunpcklwd %%ymm6,%%ymm6,%%ymm6 \n" + "vpcmpeqb %%ymm3,%%ymm3,%%ymm3 \n" + "vpsrld $0x1b,%%ymm3,%%ymm3 \n" + "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" + "vpsrld $0x1a,%%ymm4,%%ymm4 \n" + "vpslld $0x5,%%ymm4,%%ymm4 \n" + "vpslld $0xb,%%ymm3,%%ymm5 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vpaddusb %%ymm6,%%ymm0,%%ymm0 \n" + "vpsrld $0x5,%%ymm0,%%ymm2 \n" + "vpsrld $0x3,%%ymm0,%%ymm1 \n" + "vpsrld $0x8,%%ymm0,%%ymm0 \n" + "vpand %%ymm4,%%ymm2,%%ymm2 \n" + "vpand %%ymm3,%%ymm1,%%ymm1 \n" + "vpand %%ymm5,%%ymm0,%%ymm0 \n" + "vpor %%ymm2,%%ymm1,%%ymm1 \n" + "vpor %%ymm1,%%ymm0,%%ymm0 \n" + "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "lea 0x20(%0),%0 \n" + "vmovdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "m"(dither4) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} +#endif // HAS_ARGBTORGB565DITHERROW_AVX2 + +void ARGBToARGB1555Row_SSE2(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "pcmpeqb %%xmm4,%%xmm4 \n" + "psrld $0x1b,%%xmm4 \n" + "movdqa %%xmm4,%%xmm5 \n" + "pslld $0x5,%%xmm5 \n" + "movdqa %%xmm4,%%xmm6 \n" + "pslld $0xa,%%xmm6 \n" + "pcmpeqb %%xmm7,%%xmm7 \n" + "pslld $0xf,%%xmm7 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "movdqa %%xmm0,%%xmm3 \n" + "psrad $0x10,%%xmm0 \n" + "psrld $0x3,%%xmm1 \n" + "psrld $0x6,%%xmm2 \n" + "psrld $0x9,%%xmm3 \n" + "pand %%xmm7,%%xmm0 \n" + "pand %%xmm4,%%xmm1 \n" + "pand %%xmm5,%%xmm2 \n" + "pand %%xmm6,%%xmm3 \n" + "por %%xmm1,%%xmm0 \n" + "por %%xmm3,%%xmm2 \n" + "por %%xmm2,%%xmm0 \n" + 
"packssdw %%xmm0,%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + ::"memory", + "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"); +} + +void ARGBToARGB4444Row_SSE2(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "pcmpeqb %%xmm4,%%xmm4 \n" + "psllw $0xc,%%xmm4 \n" + "movdqa %%xmm4,%%xmm3 \n" + "psrlw $0x8,%%xmm3 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pand %%xmm3,%%xmm0 \n" + "pand %%xmm4,%%xmm1 \n" + "psrlq $0x4,%%xmm0 \n" + "psrlq $0x8,%%xmm1 \n" + "por %%xmm1,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + ::"memory", + "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); +} +#endif // HAS_RGB24TOARGBROW_SSSE3 + +/* + +ARGBToAR30Row: + +Red Blue +With the 8 bit value in the upper bits of a short, vpmulhuw by (1024+4) will +produce a 10 bit value in the low 10 bits of each 16 bit value. This is whats +wanted for the blue channel. The red needs to be shifted 4 left, so multiply by +(1024+4)*16 for red. + +Alpha Green +Alpha and Green are already in the high bits so vpand can zero out the other +bits, keeping just 2 upper bits of alpha and 8 bit green. The same multiplier +could be used for Green - (1024+4) putting the 10 bit green in the lsb. Alpha +would be a simple multiplier to shift it into position. It wants a gap of 10 +above the green. Green is 10 bits, so there are 6 bits in the low short. 4 +more are needed, so a multiplier of 4 gets the 2 bits into the upper 16 bits, +and then a shift of 4 is a multiply of 16, so (4*16) = 64. Then shift the +result left 10 to position the A and G channels. +*/ + +// Shuffle table for converting RAW to RGB24. Last 8. 
+static const uvec8 kShuffleRB30 = {128u, 0u, 128u, 2u, 128u, 4u, 128u, 6u, + 128u, 8u, 128u, 10u, 128u, 12u, 128u, 14u}; + +static const uvec8 kShuffleBR30 = {128u, 2u, 128u, 0u, 128u, 6u, 128u, 4u, + 128u, 10u, 128u, 8u, 128u, 14u, 128u, 12u}; + +static const uint32_t kMulRB10 = 1028 * 16 * 65536 + 1028; +static const uint32_t kMaskRB10 = 0x3ff003ff; +static const uint32_t kMaskAG10 = 0xc000ff00; +static const uint32_t kMulAG10 = 64 * 65536 + 1028; + +void ARGBToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "movdqa %3,%%xmm2 \n" // shuffler for RB + "movd %4,%%xmm3 \n" // multipler for RB + "movd %5,%%xmm4 \n" // mask for R10 B10 + "movd %6,%%xmm5 \n" // mask for AG + "movd %7,%%xmm6 \n" // multipler for AG + "pshufd $0x0,%%xmm3,%%xmm3 \n" + "pshufd $0x0,%%xmm4,%%xmm4 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" + "pshufd $0x0,%%xmm6,%%xmm6 \n" + "sub %0,%1 \n" + + "1: \n" + "movdqu (%0),%%xmm0 \n" // fetch 4 ARGB pixels + "movdqa %%xmm0,%%xmm1 \n" + "pshufb %%xmm2,%%xmm1 \n" // R0B0 + "pand %%xmm5,%%xmm0 \n" // A0G0 + "pmulhuw %%xmm3,%%xmm1 \n" // X2 R16 X4 B10 + "pmulhuw %%xmm6,%%xmm0 \n" // X10 A2 X10 G10 + "pand %%xmm4,%%xmm1 \n" // X2 R10 X10 B10 + "pslld $10,%%xmm0 \n" // A2 x10 G10 x10 + "por %%xmm1,%%xmm0 \n" // A2 R10 G10 B10 + "movdqu %%xmm0,(%1,%0) \n" // store 4 AR30 pixels + "add $0x10,%0 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "m"(kShuffleRB30), // %3 + "m"(kMulRB10), // %4 + "m"(kMaskRB10), // %5 + "m"(kMaskAG10), // %6 + "m"(kMulAG10) // %7 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); +} + +void ABGRToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "movdqa %3,%%xmm2 \n" // shuffler for RB + "movd %4,%%xmm3 \n" // multipler for RB + "movd %5,%%xmm4 \n" // mask for R10 B10 + "movd %6,%%xmm5 \n" // mask for AG + "movd %7,%%xmm6 \n" // multipler for AG + "pshufd $0x0,%%xmm3,%%xmm3 \n" + "pshufd $0x0,%%xmm4,%%xmm4 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" + "pshufd $0x0,%%xmm6,%%xmm6 \n" + "sub %0,%1 \n" + + "1: \n" + "movdqu (%0),%%xmm0 \n" // fetch 4 ABGR pixels + "movdqa %%xmm0,%%xmm1 \n" + "pshufb %%xmm2,%%xmm1 \n" // R0B0 + "pand %%xmm5,%%xmm0 \n" // A0G0 + "pmulhuw %%xmm3,%%xmm1 \n" // X2 R16 X4 B10 + "pmulhuw %%xmm6,%%xmm0 \n" // X10 A2 X10 G10 + "pand %%xmm4,%%xmm1 \n" // X2 R10 X10 B10 + "pslld $10,%%xmm0 \n" // A2 x10 G10 x10 + "por %%xmm1,%%xmm0 \n" // A2 R10 G10 B10 + "movdqu %%xmm0,(%1,%0) \n" // store 4 AR30 pixels + "add $0x10,%0 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "m"(kShuffleBR30), // %3 reversed shuffler + "m"(kMulRB10), // %4 + "m"(kMaskRB10), // %5 + "m"(kMaskAG10), // %6 + "m"(kMulAG10) // %7 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); +} + +#ifdef HAS_ARGBTOAR30ROW_AVX2 +void ARGBToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "vbroadcastf128 %3,%%ymm2 \n" // shuffler for RB + "vbroadcastss %4,%%ymm3 \n" // multipler for RB + "vbroadcastss %5,%%ymm4 \n" // mask for R10 B10 + "vbroadcastss %6,%%ymm5 \n" // mask for AG + "vbroadcastss %7,%%ymm6 \n" // multipler for AG + "sub %0,%1 \n" + + "1: \n" + "vmovdqu (%0),%%ymm0 \n" // fetch 8 ARGB pixels + "vpshufb %%ymm2,%%ymm0,%%ymm1 \n" // R0B0 + "vpand %%ymm5,%%ymm0,%%ymm0 \n" // A0G0 + "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" // X2 R16 X4 B10 + "vpmulhuw %%ymm6,%%ymm0,%%ymm0 \n" // X10 A2 X10 G10 + "vpand %%ymm4,%%ymm1,%%ymm1 \n" // X2 R10 X10 
B10 + "vpslld $10,%%ymm0,%%ymm0 \n" // A2 x10 G10 x10 + "vpor %%ymm1,%%ymm0,%%ymm0 \n" // A2 R10 G10 B10 + "vmovdqu %%ymm0,(%1,%0) \n" // store 8 AR30 pixels + "add $0x20,%0 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + "vzeroupper \n" + + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "m"(kShuffleRB30), // %3 + "m"(kMulRB10), // %4 + "m"(kMaskRB10), // %5 + "m"(kMaskAG10), // %6 + "m"(kMulAG10) // %7 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); +} +#endif + +#ifdef HAS_ABGRTOAR30ROW_AVX2 +void ABGRToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "vbroadcastf128 %3,%%ymm2 \n" // shuffler for RB + "vbroadcastss %4,%%ymm3 \n" // multipler for RB + "vbroadcastss %5,%%ymm4 \n" // mask for R10 B10 + "vbroadcastss %6,%%ymm5 \n" // mask for AG + "vbroadcastss %7,%%ymm6 \n" // multipler for AG + "sub %0,%1 \n" + + "1: \n" + "vmovdqu (%0),%%ymm0 \n" // fetch 8 ABGR pixels + "vpshufb %%ymm2,%%ymm0,%%ymm1 \n" // R0B0 + "vpand %%ymm5,%%ymm0,%%ymm0 \n" // A0G0 + "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" // X2 R16 X4 B10 + "vpmulhuw %%ymm6,%%ymm0,%%ymm0 \n" // X10 A2 X10 G10 + "vpand %%ymm4,%%ymm1,%%ymm1 \n" // X2 R10 X10 B10 + "vpslld $10,%%ymm0,%%ymm0 \n" // A2 x10 G10 x10 + "vpor %%ymm1,%%ymm0,%%ymm0 \n" // A2 R10 G10 B10 + "vmovdqu %%ymm0,(%1,%0) \n" // store 8 AR30 pixels + "add $0x20,%0 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + "vzeroupper \n" + + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "m"(kShuffleBR30), // %3 reversed shuffler + "m"(kMulRB10), // %4 + "m"(kMaskRB10), // %5 + "m"(kMaskAG10), // %6 + "m"(kMulAG10) // %7 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); +} +#endif + +#ifdef HAS_ARGBTOYROW_SSSE3 +// Convert 16 ARGB pixels (64 bytes) to 16 Y values. +void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) { + asm volatile( + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm3 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "lea 0x40(%0),%0 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm3,%%xmm2 \n" + "psrlw $0x7,%%xmm0 \n" + "psrlw $0x7,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "m"(kARGBToY), // %3 + "m"(kAddY16) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} +#endif // HAS_ARGBTOYROW_SSSE3 + +#ifdef HAS_ARGBTOYJROW_SSSE3 +// Convert 16 ARGB pixels (64 bytes) to 16 YJ values. +// Same as ARGBToYRow but different coefficients, no add 16, but do rounding. 
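// Added sketch (illustrative only; helper names are not upstream's): the
// per-pixel arithmetic the two Y rows implement with pmaddubsw + psrlw,
// matching kARGBToY/kAddY16 and kARGBToYJ/kAddYJ64 respectively.

static inline uint8_t ARGBToY_sketch(uint8_t b, uint8_t g, uint8_t r) {
  return (uint8_t)(((13 * b + 65 * g + 33 * r) >> 7) + 16);  // studio range
}

static inline uint8_t ARGBToYJ_sketch(uint8_t b, uint8_t g, uint8_t r) {
  return (uint8_t)((15 * b + 75 * g + 38 * r + 64) >> 7);  // full range
}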
+void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) { + asm volatile( + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm3 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "lea 0x40(%0),%0 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm3,%%xmm2 \n" + "paddw %%xmm5,%%xmm0 \n" + "paddw %%xmm5,%%xmm2 \n" + "psrlw $0x7,%%xmm0 \n" + "psrlw $0x7,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "m"(kARGBToYJ), // %3 + "m"(kAddYJ64) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} +#endif // HAS_ARGBTOYJROW_SSSE3 + +#ifdef HAS_ARGBTOYROW_AVX2 +// vpermd for vphaddw + vpackuswb vpermd. +static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7}; + +// Convert 32 ARGB pixels (128 bytes) to 32 Y values. +void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) { + asm volatile( + "vbroadcastf128 %3,%%ymm4 \n" + "vbroadcastf128 %4,%%ymm5 \n" + "vmovdqu %5,%%ymm6 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vmovdqu 0x40(%0),%%ymm2 \n" + "vmovdqu 0x60(%0),%%ymm3 \n" + "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" + "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" + "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" + "lea 0x80(%0),%0 \n" + "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" // mutates. + "vphaddw %%ymm3,%%ymm2,%%ymm2 \n" + "vpsrlw $0x7,%%ymm0,%%ymm0 \n" + "vpsrlw $0x7,%%ymm2,%%ymm2 \n" + "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates. + "vpermd %%ymm0,%%ymm6,%%ymm0 \n" // unmutate. + "vpaddb %%ymm5,%%ymm0,%%ymm0 \n" // add 16 for Y + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "m"(kARGBToY), // %3 + "m"(kAddY16), // %4 + "m"(kPermdARGBToY_AVX) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); +} +#endif // HAS_ARGBTOYROW_AVX2 + +#ifdef HAS_ARGBTOYJROW_AVX2 +// Convert 32 ARGB pixels (128 bytes) to 32 Y values. +void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) { + asm volatile( + "vbroadcastf128 %3,%%ymm4 \n" + "vbroadcastf128 %4,%%ymm5 \n" + "vmovdqu %5,%%ymm6 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vmovdqu 0x40(%0),%%ymm2 \n" + "vmovdqu 0x60(%0),%%ymm3 \n" + "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" + "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" + "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" + "lea 0x80(%0),%0 \n" + "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" // mutates. + "vphaddw %%ymm3,%%ymm2,%%ymm2 \n" + "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" // Add .5 for rounding. + "vpaddw %%ymm5,%%ymm2,%%ymm2 \n" + "vpsrlw $0x7,%%ymm0,%%ymm0 \n" + "vpsrlw $0x7,%%ymm2,%%ymm2 \n" + "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates. + "vpermd %%ymm0,%%ymm6,%%ymm0 \n" // unmutate. 
+ "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "m"(kARGBToYJ), // %3 + "m"(kAddYJ64), // %4 + "m"(kPermdARGBToY_AVX) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); +} +#endif // HAS_ARGBTOYJROW_AVX2 + +#ifdef HAS_ARGBTOUVROW_SSSE3 +void ARGBToUVRow_SSSE3(const uint8_t* src_argb0, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "movdqa %5,%%xmm3 \n" + "movdqa %6,%%xmm4 \n" + "movdqa %7,%%xmm5 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x10(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x20(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqu 0x30(%0),%%xmm6 \n" + "movdqu 0x30(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + + "lea 0x40(%0),%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + "shufps $0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw %%xmm6,%%xmm1 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movlps %%xmm0,(%1) \n" + "movhps %%xmm0,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_argb0), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"((intptr_t)(src_stride_argb)), // %4 + "m"(kARGBToV), // %5 + "m"(kARGBToU), // %6 + "m"(kAddUV128) // %7 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"); +} +#endif // HAS_ARGBTOUVROW_SSSE3 + +#ifdef HAS_ARGBTOUVROW_AVX2 +// vpshufb for vphaddw + vpackuswb packed to shorts. 
+static const lvec8 kShufARGBToUV_AVX = { + 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, + 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15}; +void ARGBToUVRow_AVX2(const uint8_t* src_argb0, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "vbroadcastf128 %5,%%ymm5 \n" + "vbroadcastf128 %6,%%ymm6 \n" + "vbroadcastf128 %7,%%ymm7 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vmovdqu 0x40(%0),%%ymm2 \n" + "vmovdqu 0x60(%0),%%ymm3 \n" + "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n" + "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n" + "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n" + "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n" + "lea 0x80(%0),%0 \n" + "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n" + "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n" + "vpavgb %%ymm4,%%ymm0,%%ymm0 \n" + "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n" + "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n" + "vpavgb %%ymm4,%%ymm2,%%ymm2 \n" + + "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n" + "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n" + "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n" + "vphaddw %%ymm3,%%ymm1,%%ymm1 \n" + "vphaddw %%ymm2,%%ymm0,%%ymm0 \n" + "vpsraw $0x8,%%ymm1,%%ymm1 \n" + "vpsraw $0x8,%%ymm0,%%ymm0 \n" + "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpshufb %8,%%ymm0,%%ymm0 \n" + "vpaddb %%ymm5,%%ymm0,%%ymm0 \n" + + "vextractf128 $0x0,%%ymm0,(%1) \n" + "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb0), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"((intptr_t)(src_stride_argb)), // %4 + "m"(kAddUV128), // %5 + "m"(kARGBToV), // %6 + "m"(kARGBToU), // %7 + "m"(kShufARGBToUV_AVX) // %8 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} +#endif // HAS_ARGBTOUVROW_AVX2 + +#ifdef HAS_ARGBTOUVJROW_AVX2 +void ARGBToUVJRow_AVX2(const uint8_t* src_argb0, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "vbroadcastf128 %5,%%ymm5 \n" + "vbroadcastf128 %6,%%ymm6 \n" + "vbroadcastf128 %7,%%ymm7 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vmovdqu 0x40(%0),%%ymm2 \n" + "vmovdqu 0x60(%0),%%ymm3 \n" + "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n" + "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n" + "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n" + "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n" + "lea 0x80(%0),%0 \n" + "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n" + "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n" + "vpavgb %%ymm4,%%ymm0,%%ymm0 \n" + "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n" + "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n" + "vpavgb %%ymm4,%%ymm2,%%ymm2 \n" + + "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n" + "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n" + "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n" + "vphaddw %%ymm3,%%ymm1,%%ymm1 \n" + "vphaddw %%ymm2,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm5,%%ymm1,%%ymm1 \n" + "vpsraw $0x8,%%ymm1,%%ymm1 \n" + "vpsraw $0x8,%%ymm0,%%ymm0 \n" + "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpshufb %8,%%ymm0,%%ymm0 \n" + + "vextractf128 $0x0,%%ymm0,(%1) \n" + "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb0), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"((intptr_t)(src_stride_argb)), // %4 + 
"m"(kAddUVJ128), // %5 + "m"(kARGBToVJ), // %6 + "m"(kARGBToUJ), // %7 + "m"(kShufARGBToUV_AVX) // %8 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} +#endif // HAS_ARGBTOUVJROW_AVX2 + +#ifdef HAS_ARGBTOUVJROW_SSSE3 +void ARGBToUVJRow_SSSE3(const uint8_t* src_argb0, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "movdqa %5,%%xmm3 \n" + "movdqa %6,%%xmm4 \n" + "movdqa %7,%%xmm5 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x10(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x20(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqu 0x30(%0),%%xmm6 \n" + "movdqu 0x30(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + + "lea 0x40(%0),%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + "shufps $0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw %%xmm6,%%xmm1 \n" + "paddw %%xmm5,%%xmm0 \n" + "paddw %%xmm5,%%xmm1 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "movlps %%xmm0,(%1) \n" + "movhps %%xmm0,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_argb0), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"((intptr_t)(src_stride_argb)), // %4 + "m"(kARGBToVJ), // %5 + "m"(kARGBToUJ), // %6 + "m"(kAddUVJ128) // %7 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"); +} +#endif // HAS_ARGBTOUVJROW_SSSE3 + +#ifdef HAS_ARGBTOUV444ROW_SSSE3 +void ARGBToUV444Row_SSSE3(const uint8_t* src_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "movdqa %4,%%xmm3 \n" + "movdqa %5,%%xmm4 \n" + "movdqa %6,%%xmm5 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm6 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm6,%%xmm2 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm2 \n" + "packsswb %%xmm2,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm6 \n" + "pmaddubsw %%xmm3,%%xmm0 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm6,%%xmm2 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm2 \n" + "packsswb %%xmm2,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "lea 0x40(%0),%0 \n" + "movdqu %%xmm0,0x00(%1,%2,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "m"(kARGBToV), // %4 + "m"(kARGBToU), // %5 + "m"(kAddUV128) // %6 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6"); +} +#endif // HAS_ARGBTOUV444ROW_SSSE3 + +void BGRAToYRow_SSSE3(const uint8_t* src_bgra, uint8_t* dst_y, int width) { + asm volatile( + "movdqa %4,%%xmm5 \n" + "movdqa %3,%%xmm4 \n" + + 
LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm3 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "lea 0x40(%0),%0 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm3,%%xmm2 \n" + "psrlw $0x7,%%xmm0 \n" + "psrlw $0x7,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_bgra), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "m"(kBGRAToY), // %3 + "m"(kAddY16) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} + +void BGRAToUVRow_SSSE3(const uint8_t* src_bgra0, + int src_stride_bgra, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "movdqa %5,%%xmm3 \n" + "movdqa %6,%%xmm4 \n" + "movdqa %7,%%xmm5 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x10(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x20(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqu 0x30(%0),%%xmm6 \n" + "movdqu 0x30(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + + "lea 0x40(%0),%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + "shufps $0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw %%xmm6,%%xmm1 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movlps %%xmm0,(%1) \n" + "movhps %%xmm0,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_bgra0), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"((intptr_t)(src_stride_bgra)), // %4 + "m"(kBGRAToV), // %5 + "m"(kBGRAToU), // %6 + "m"(kAddUV128) // %7 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"); +} + +void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width) { + asm volatile( + "movdqa %4,%%xmm5 \n" + "movdqa %3,%%xmm4 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm3 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "lea 0x40(%0),%0 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm3,%%xmm2 \n" + "psrlw $0x7,%%xmm0 \n" + "psrlw $0x7,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_abgr), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "m"(kABGRToY), // %3 + "m"(kAddY16) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} + +void RGBAToYRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) { + asm volatile( + "movdqa %4,%%xmm5 \n" + "movdqa %3,%%xmm4 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm3 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw 
%%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "lea 0x40(%0),%0 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm3,%%xmm2 \n" + "psrlw $0x7,%%xmm0 \n" + "psrlw $0x7,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_rgba), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "m"(kRGBAToY), // %3 + "m"(kAddY16) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} + +void ABGRToUVRow_SSSE3(const uint8_t* src_abgr0, + int src_stride_abgr, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "movdqa %5,%%xmm3 \n" + "movdqa %6,%%xmm4 \n" + "movdqa %7,%%xmm5 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x10(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x20(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqu 0x30(%0),%%xmm6 \n" + "movdqu 0x30(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + + "lea 0x40(%0),%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + "shufps $0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw %%xmm6,%%xmm1 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movlps %%xmm0,(%1) \n" + "movhps %%xmm0,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_abgr0), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"((intptr_t)(src_stride_abgr)), // %4 + "m"(kABGRToV), // %5 + "m"(kABGRToU), // %6 + "m"(kAddUV128) // %7 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"); +} + +void RGBAToUVRow_SSSE3(const uint8_t* src_rgba0, + int src_stride_rgba, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "movdqa %5,%%xmm3 \n" + "movdqa %6,%%xmm4 \n" + "movdqa %7,%%xmm5 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x10(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x20(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqu 0x30(%0),%%xmm6 \n" + "movdqu 0x30(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + + "lea 0x40(%0),%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + "shufps $0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw %%xmm6,%%xmm1 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movlps %%xmm0,(%1) \n" + "movhps %%xmm0,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_rgba0), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) 
// %3 + : "r"((intptr_t)(src_stride_rgba)), // %4 + "m"(kRGBAToV), // %5 + "m"(kRGBAToU), // %6 + "m"(kAddUV128) // %7 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"); +} + +#if defined(HAS_I422TOARGBROW_SSSE3) || defined(HAS_I422TOARGBROW_AVX2) + +// Read 8 UV from 444 +#define READYUV444 \ + "movq (%[u_buf]),%%xmm0 \n" \ + "movq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ + "lea 0x8(%[u_buf]),%[u_buf] \n" \ + "punpcklbw %%xmm1,%%xmm0 \n" \ + "movq (%[y_buf]),%%xmm4 \n" \ + "punpcklbw %%xmm4,%%xmm4 \n" \ + "lea 0x8(%[y_buf]),%[y_buf] \n" + +// Read 4 UV from 422, upsample to 8 UV +#define READYUV422 \ + "movd (%[u_buf]),%%xmm0 \n" \ + "movd 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ + "lea 0x4(%[u_buf]),%[u_buf] \n" \ + "punpcklbw %%xmm1,%%xmm0 \n" \ + "punpcklwd %%xmm0,%%xmm0 \n" \ + "movq (%[y_buf]),%%xmm4 \n" \ + "punpcklbw %%xmm4,%%xmm4 \n" \ + "lea 0x8(%[y_buf]),%[y_buf] \n" + +// Read 4 UV from 422 10 bit, upsample to 8 UV +// TODO(fbarchard): Consider shufb to replace pack/unpack +// TODO(fbarchard): Consider pmulhuw to replace psraw +// TODO(fbarchard): Consider pmullw to replace psllw and allow different bits. +#define READYUV210 \ + "movq (%[u_buf]),%%xmm0 \n" \ + "movq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ + "lea 0x8(%[u_buf]),%[u_buf] \n" \ + "punpcklwd %%xmm1,%%xmm0 \n" \ + "psraw $0x2,%%xmm0 \n" \ + "packuswb %%xmm0,%%xmm0 \n" \ + "punpcklwd %%xmm0,%%xmm0 \n" \ + "movdqu (%[y_buf]),%%xmm4 \n" \ + "psllw $0x6,%%xmm4 \n" \ + "lea 0x10(%[y_buf]),%[y_buf] \n" + +// Read 4 UV from 422, upsample to 8 UV. With 8 Alpha. +#define READYUVA422 \ + "movd (%[u_buf]),%%xmm0 \n" \ + "movd 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ + "lea 0x4(%[u_buf]),%[u_buf] \n" \ + "punpcklbw %%xmm1,%%xmm0 \n" \ + "punpcklwd %%xmm0,%%xmm0 \n" \ + "movq (%[y_buf]),%%xmm4 \n" \ + "punpcklbw %%xmm4,%%xmm4 \n" \ + "lea 0x8(%[y_buf]),%[y_buf] \n" \ + "movq (%[a_buf]),%%xmm5 \n" \ + "lea 0x8(%[a_buf]),%[a_buf] \n" + +// Read 4 UV from NV12, upsample to 8 UV +#define READNV12 \ + "movq (%[uv_buf]),%%xmm0 \n" \ + "lea 0x8(%[uv_buf]),%[uv_buf] \n" \ + "punpcklwd %%xmm0,%%xmm0 \n" \ + "movq (%[y_buf]),%%xmm4 \n" \ + "punpcklbw %%xmm4,%%xmm4 \n" \ + "lea 0x8(%[y_buf]),%[y_buf] \n" + +// Read 4 VU from NV21, upsample to 8 UV +#define READNV21 \ + "movq (%[vu_buf]),%%xmm0 \n" \ + "lea 0x8(%[vu_buf]),%[vu_buf] \n" \ + "pshufb %[kShuffleNV21], %%xmm0 \n" \ + "movq (%[y_buf]),%%xmm4 \n" \ + "punpcklbw %%xmm4,%%xmm4 \n" \ + "lea 0x8(%[y_buf]),%[y_buf] \n" + +// Read 4 YUY2 with 8 Y and update 4 UV to 8 UV. +#define READYUY2 \ + "movdqu (%[yuy2_buf]),%%xmm4 \n" \ + "pshufb %[kShuffleYUY2Y], %%xmm4 \n" \ + "movdqu (%[yuy2_buf]),%%xmm0 \n" \ + "pshufb %[kShuffleYUY2UV], %%xmm0 \n" \ + "lea 0x10(%[yuy2_buf]),%[yuy2_buf] \n" + +// Read 4 UYVY with 8 Y and update 4 UV to 8 UV. 
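// Added note: in memory YUY2 is {Y0, U, Y1, V} and UYVY is {U, Y0, V, Y1}
// per pixel pair; the kShuffleYUY2*/kShuffleUYVY* tables above duplicate
// each Y and each U/V pair so the same YUVTORGB path can consume either
// packed 4:2:2 layout.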
+#define READUYVY \ + "movdqu (%[uyvy_buf]),%%xmm4 \n" \ + "pshufb %[kShuffleUYVYY], %%xmm4 \n" \ + "movdqu (%[uyvy_buf]),%%xmm0 \n" \ + "pshufb %[kShuffleUYVYUV], %%xmm0 \n" \ + "lea 0x10(%[uyvy_buf]),%[uyvy_buf] \n" + +#if defined(__x86_64__) +#define YUVTORGB_SETUP(yuvconstants) \ + "movdqa (%[yuvconstants]),%%xmm8 \n" \ + "movdqa 32(%[yuvconstants]),%%xmm9 \n" \ + "movdqa 64(%[yuvconstants]),%%xmm10 \n" \ + "movdqa 96(%[yuvconstants]),%%xmm11 \n" \ + "movdqa 128(%[yuvconstants]),%%xmm12 \n" \ + "movdqa 160(%[yuvconstants]),%%xmm13 \n" \ + "movdqa 192(%[yuvconstants]),%%xmm14 \n" +// Convert 8 pixels: 8 UV and 8 Y +#define YUVTORGB16(yuvconstants) \ + "movdqa %%xmm0,%%xmm1 \n" \ + "movdqa %%xmm0,%%xmm2 \n" \ + "movdqa %%xmm0,%%xmm3 \n" \ + "movdqa %%xmm11,%%xmm0 \n" \ + "pmaddubsw %%xmm8,%%xmm1 \n" \ + "psubw %%xmm1,%%xmm0 \n" \ + "movdqa %%xmm12,%%xmm1 \n" \ + "pmaddubsw %%xmm9,%%xmm2 \n" \ + "psubw %%xmm2,%%xmm1 \n" \ + "movdqa %%xmm13,%%xmm2 \n" \ + "pmaddubsw %%xmm10,%%xmm3 \n" \ + "psubw %%xmm3,%%xmm2 \n" \ + "pmulhuw %%xmm14,%%xmm4 \n" \ + "paddsw %%xmm4,%%xmm0 \n" \ + "paddsw %%xmm4,%%xmm1 \n" \ + "paddsw %%xmm4,%%xmm2 \n" +#define YUVTORGB_REGS \ + "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", + +#else +#define YUVTORGB_SETUP(yuvconstants) +// Convert 8 pixels: 8 UV and 8 Y +#define YUVTORGB16(yuvconstants) \ + "movdqa %%xmm0,%%xmm1 \n" \ + "movdqa %%xmm0,%%xmm2 \n" \ + "movdqa %%xmm0,%%xmm3 \n" \ + "movdqa 96(%[yuvconstants]),%%xmm0 \n" \ + "pmaddubsw (%[yuvconstants]),%%xmm1 \n" \ + "psubw %%xmm1,%%xmm0 \n" \ + "movdqa 128(%[yuvconstants]),%%xmm1 \n" \ + "pmaddubsw 32(%[yuvconstants]),%%xmm2 \n" \ + "psubw %%xmm2,%%xmm1 \n" \ + "movdqa 160(%[yuvconstants]),%%xmm2 \n" \ + "pmaddubsw 64(%[yuvconstants]),%%xmm3 \n" \ + "psubw %%xmm3,%%xmm2 \n" \ + "pmulhuw 192(%[yuvconstants]),%%xmm4 \n" \ + "paddsw %%xmm4,%%xmm0 \n" \ + "paddsw %%xmm4,%%xmm1 \n" \ + "paddsw %%xmm4,%%xmm2 \n" +#define YUVTORGB_REGS +#endif + +#define YUVTORGB(yuvconstants) \ + YUVTORGB16(yuvconstants) \ + "psraw $0x6,%%xmm0 \n" \ + "psraw $0x6,%%xmm1 \n" \ + "psraw $0x6,%%xmm2 \n" \ + "packuswb %%xmm0,%%xmm0 \n" \ + "packuswb %%xmm1,%%xmm1 \n" \ + "packuswb %%xmm2,%%xmm2 \n" + +// Store 8 ARGB values. +#define STOREARGB \ + "punpcklbw %%xmm1,%%xmm0 \n" \ + "punpcklbw %%xmm5,%%xmm2 \n" \ + "movdqa %%xmm0,%%xmm1 \n" \ + "punpcklwd %%xmm2,%%xmm0 \n" \ + "punpckhwd %%xmm2,%%xmm1 \n" \ + "movdqu %%xmm0,(%[dst_argb]) \n" \ + "movdqu %%xmm1,0x10(%[dst_argb]) \n" \ + "lea 0x20(%[dst_argb]), %[dst_argb] \n" + +// Store 8 RGBA values. +#define STORERGBA \ + "pcmpeqb %%xmm5,%%xmm5 \n" \ + "punpcklbw %%xmm2,%%xmm1 \n" \ + "punpcklbw %%xmm0,%%xmm5 \n" \ + "movdqa %%xmm5,%%xmm0 \n" \ + "punpcklwd %%xmm1,%%xmm5 \n" \ + "punpckhwd %%xmm1,%%xmm0 \n" \ + "movdqu %%xmm5,(%[dst_rgba]) \n" \ + "movdqu %%xmm0,0x10(%[dst_rgba]) \n" \ + "lea 0x20(%[dst_rgba]),%[dst_rgba] \n" + +// Store 8 AR30 values. 
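// Added sketch (illustrative only; helper names are not upstream's): STOREAR30
// below packs three signed 16-bit channels, whose useful range becomes 0..1023
// after the >>4 and clamp, into the 2:10:10:10 AR30 layout with opaque alpha
// in the top two bits.

static inline int Clamp1023_sketch(int v) {
  return v < 0 ? 0 : (v > 1023 ? 1023 : v);
}

static inline uint32_t PackAR30_sketch(int r, int g, int b) {
  return 0xC0000000u | ((uint32_t)Clamp1023_sketch(r) << 20) |
         ((uint32_t)Clamp1023_sketch(g) << 10) | (uint32_t)Clamp1023_sketch(b);
}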
+#define STOREAR30 \ + "psraw $0x4,%%xmm0 \n" \ + "psraw $0x4,%%xmm1 \n" \ + "psraw $0x4,%%xmm2 \n" \ + "pminsw %%xmm7,%%xmm0 \n" \ + "pminsw %%xmm7,%%xmm1 \n" \ + "pminsw %%xmm7,%%xmm2 \n" \ + "pmaxsw %%xmm6,%%xmm0 \n" \ + "pmaxsw %%xmm6,%%xmm1 \n" \ + "pmaxsw %%xmm6,%%xmm2 \n" \ + "psllw $0x4,%%xmm2 \n" \ + "movdqa %%xmm0,%%xmm3 \n" \ + "punpcklwd %%xmm2,%%xmm0 \n" \ + "punpckhwd %%xmm2,%%xmm3 \n" \ + "movdqa %%xmm1,%%xmm2 \n" \ + "punpcklwd %%xmm5,%%xmm1 \n" \ + "punpckhwd %%xmm5,%%xmm2 \n" \ + "pslld $0xa,%%xmm1 \n" \ + "pslld $0xa,%%xmm2 \n" \ + "por %%xmm1,%%xmm0 \n" \ + "por %%xmm2,%%xmm3 \n" \ + "movdqu %%xmm0,(%[dst_ar30]) \n" \ + "movdqu %%xmm3,0x10(%[dst_ar30]) \n" \ + "lea 0x20(%[dst_ar30]), %[dst_ar30] \n" + +void OMITFP I444ToARGBRow_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + + LABELALIGN + "1: \n" + READYUV444 + YUVTORGB(yuvconstants) + STOREARGB + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); +} + +void OMITFP I422ToRGB24Row_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP(yuvconstants) + "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n" + "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n" + "sub %[u_buf],%[v_buf] \n" + + LABELALIGN + "1: \n" + READYUV422 + YUVTORGB(yuvconstants) + "punpcklbw %%xmm1,%%xmm0 \n" + "punpcklbw %%xmm2,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm2,%%xmm0 \n" + "punpckhwd %%xmm2,%%xmm1 \n" + "pshufb %%xmm5,%%xmm0 \n" + "pshufb %%xmm6,%%xmm1 \n" + "palignr $0xc,%%xmm0,%%xmm1 \n" + "movq %%xmm0,(%[dst_rgb24]) \n" + "movdqu %%xmm1,0x8(%[dst_rgb24]) \n" + "lea 0x18(%[dst_rgb24]),%[dst_rgb24] \n" + "subl $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_rgb24]"+r"(dst_rgb24), // %[dst_rgb24] +#if defined(__i386__) + [width]"+m"(width) // %[width] +#else + [width]"+rm"(width) // %[width] +#endif + : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] + [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0), + [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24) + : "memory", "cc", YUVTORGB_REGS + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" + ); +} + +void OMITFP I422ToARGBRow_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + + LABELALIGN + "1: \n" + READYUV422 + YUVTORGB(yuvconstants) + STOREARGB + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); +} + +void OMITFP 
I422ToAR30Row_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" // AR30 constants + "psrlw $14,%%xmm5 \n" + "psllw $4,%%xmm5 \n" // 2 alpha bits + "pxor %%xmm6,%%xmm6 \n" + "pcmpeqb %%xmm7,%%xmm7 \n" // 0 for min + "psrlw $6,%%xmm7 \n" // 1023 for max + + LABELALIGN + "1: \n" + READYUV422 + YUVTORGB16(yuvconstants) + STOREAR30 + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_ar30]"+r"(dst_ar30), // %[dst_ar30] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" + ); +} + +// 10 bit YUV to ARGB +void OMITFP I210ToARGBRow_SSSE3(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + + LABELALIGN + "1: \n" + READYUV210 + YUVTORGB(yuvconstants) + STOREARGB + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); +} + +// 10 bit YUV to AR30 +void OMITFP I210ToAR30Row_SSSE3(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $14,%%xmm5 \n" + "psllw $4,%%xmm5 \n" // 2 alpha bits + "pxor %%xmm6,%%xmm6 \n" + "pcmpeqb %%xmm7,%%xmm7 \n" // 0 for min + "psrlw $6,%%xmm7 \n" // 1023 for max + + LABELALIGN + "1: \n" + READYUV210 + YUVTORGB16(yuvconstants) + STOREAR30 + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_ar30]"+r"(dst_ar30), // %[dst_ar30] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" + ); +} + +#ifdef HAS_I422ALPHATOARGBROW_SSSE3 +void OMITFP I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + const uint8_t* a_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + // clang-format off + asm volatile ( + YUVTORGB_SETUP(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + + LABELALIGN + "1: \n" + READYUVA422 + YUVTORGB(yuvconstants) + STOREARGB + "subl $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [a_buf]"+r"(a_buf), // %[a_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] +#if defined(__i386__) + [width]"+m"(width) // %[width] +#else + [width]"+rm"(width) // %[width] +#endif + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); + // clang-format on +} +#endif // 
HAS_I422ALPHATOARGBROW_SSSE3 + +void OMITFP NV12ToARGBRow_SSSE3(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + // clang-format off + asm volatile ( + YUVTORGB_SETUP(yuvconstants) + "pcmpeqb %%xmm5,%%xmm5 \n" + + LABELALIGN + "1: \n" + READNV12 + YUVTORGB(yuvconstants) + STOREARGB + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [uv_buf]"+r"(uv_buf), // %[uv_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); + // clang-format on +} + +void OMITFP NV21ToARGBRow_SSSE3(const uint8_t* y_buf, + const uint8_t* vu_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + // clang-format off + asm volatile ( + YUVTORGB_SETUP(yuvconstants) + "pcmpeqb %%xmm5,%%xmm5 \n" + + LABELALIGN + "1: \n" + READNV21 + YUVTORGB(yuvconstants) + STOREARGB + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [vu_buf]"+r"(vu_buf), // %[vu_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] + [kShuffleNV21]"m"(kShuffleNV21) + : "memory", "cc", YUVTORGB_REGS + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); + // clang-format on +} + +void OMITFP YUY2ToARGBRow_SSSE3(const uint8_t* yuy2_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + // clang-format off + asm volatile ( + YUVTORGB_SETUP(yuvconstants) + "pcmpeqb %%xmm5,%%xmm5 \n" + + LABELALIGN + "1: \n" + READYUY2 + YUVTORGB(yuvconstants) + STOREARGB + "sub $0x8,%[width] \n" + "jg 1b \n" + : [yuy2_buf]"+r"(yuy2_buf), // %[yuy2_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] + [kShuffleYUY2Y]"m"(kShuffleYUY2Y), + [kShuffleYUY2UV]"m"(kShuffleYUY2UV) + : "memory", "cc", YUVTORGB_REGS + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); + // clang-format on +} + +void OMITFP UYVYToARGBRow_SSSE3(const uint8_t* uyvy_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + // clang-format off + asm volatile ( + YUVTORGB_SETUP(yuvconstants) + "pcmpeqb %%xmm5,%%xmm5 \n" + + LABELALIGN + "1: \n" + READUYVY + YUVTORGB(yuvconstants) + STOREARGB + "sub $0x8,%[width] \n" + "jg 1b \n" + : [uyvy_buf]"+r"(uyvy_buf), // %[uyvy_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] + [kShuffleUYVYY]"m"(kShuffleUYVYY), + [kShuffleUYVYUV]"m"(kShuffleUYVYUV) + : "memory", "cc", YUVTORGB_REGS + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); + // clang-format on +} + +void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_rgba, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + + LABELALIGN + "1: \n" + READYUV422 + YUVTORGB(yuvconstants) + STORERGBA + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_rgba]"+r"(dst_rgba), // %[dst_rgba] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS + "xmm0", 
"xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); +} + +#endif // HAS_I422TOARGBROW_SSSE3 + +// Read 16 UV from 444 +#define READYUV444_AVX2 \ + "vmovdqu (%[u_buf]),%%xmm0 \n" \ + "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ + "lea 0x10(%[u_buf]),%[u_buf] \n" \ + "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ + "vpermq $0xd8,%%ymm1,%%ymm1 \n" \ + "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ + "vmovdqu (%[y_buf]),%%xmm4 \n" \ + "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ + "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ + "lea 0x10(%[y_buf]),%[y_buf] \n" + +// Read 8 UV from 422, upsample to 16 UV. +#define READYUV422_AVX2 \ + "vmovq (%[u_buf]),%%xmm0 \n" \ + "vmovq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ + "lea 0x8(%[u_buf]),%[u_buf] \n" \ + "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ + "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ + "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ + "vmovdqu (%[y_buf]),%%xmm4 \n" \ + "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ + "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ + "lea 0x10(%[y_buf]),%[y_buf] \n" + +// Read 8 UV from 210 10 bit, upsample to 16 UV +// TODO(fbarchard): Consider vshufb to replace pack/unpack +// TODO(fbarchard): Consider vunpcklpd to combine the 2 registers into 1. +#define READYUV210_AVX2 \ + "vmovdqu (%[u_buf]),%%xmm0 \n" \ + "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ + "lea 0x10(%[u_buf]),%[u_buf] \n" \ + "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ + "vpermq $0xd8,%%ymm1,%%ymm1 \n" \ + "vpunpcklwd %%ymm1,%%ymm0,%%ymm0 \n" \ + "vpsraw $0x2,%%ymm0,%%ymm0 \n" \ + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \ + "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ + "vmovdqu (%[y_buf]),%%ymm4 \n" \ + "vpsllw $0x6,%%ymm4,%%ymm4 \n" \ + "lea 0x20(%[y_buf]),%[y_buf] \n" + +// Read 8 UV from 422, upsample to 16 UV. With 16 Alpha. +#define READYUVA422_AVX2 \ + "vmovq (%[u_buf]),%%xmm0 \n" \ + "vmovq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ + "lea 0x8(%[u_buf]),%[u_buf] \n" \ + "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ + "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ + "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ + "vmovdqu (%[y_buf]),%%xmm4 \n" \ + "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ + "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ + "lea 0x10(%[y_buf]),%[y_buf] \n" \ + "vmovdqu (%[a_buf]),%%xmm5 \n" \ + "vpermq $0xd8,%%ymm5,%%ymm5 \n" \ + "lea 0x10(%[a_buf]),%[a_buf] \n" + +// Read 8 UV from NV12, upsample to 16 UV. +#define READNV12_AVX2 \ + "vmovdqu (%[uv_buf]),%%xmm0 \n" \ + "lea 0x10(%[uv_buf]),%[uv_buf] \n" \ + "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ + "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ + "vmovdqu (%[y_buf]),%%xmm4 \n" \ + "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ + "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ + "lea 0x10(%[y_buf]),%[y_buf] \n" + +// Read 8 VU from NV21, upsample to 16 UV. +#define READNV21_AVX2 \ + "vmovdqu (%[vu_buf]),%%xmm0 \n" \ + "lea 0x10(%[vu_buf]),%[vu_buf] \n" \ + "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ + "vpshufb %[kShuffleNV21], %%ymm0, %%ymm0 \n" \ + "vmovdqu (%[y_buf]),%%xmm4 \n" \ + "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ + "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ + "lea 0x10(%[y_buf]),%[y_buf] \n" + +// Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV. +#define READYUY2_AVX2 \ + "vmovdqu (%[yuy2_buf]),%%ymm4 \n" \ + "vpshufb %[kShuffleYUY2Y], %%ymm4, %%ymm4 \n" \ + "vmovdqu (%[yuy2_buf]),%%ymm0 \n" \ + "vpshufb %[kShuffleYUY2UV], %%ymm0, %%ymm0 \n" \ + "lea 0x20(%[yuy2_buf]),%[yuy2_buf] \n" + +// Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV. 
+#define READUYVY_AVX2 \ + "vmovdqu (%[uyvy_buf]),%%ymm4 \n" \ + "vpshufb %[kShuffleUYVYY], %%ymm4, %%ymm4 \n" \ + "vmovdqu (%[uyvy_buf]),%%ymm0 \n" \ + "vpshufb %[kShuffleUYVYUV], %%ymm0, %%ymm0 \n" \ + "lea 0x20(%[uyvy_buf]),%[uyvy_buf] \n" + +#if defined(__x86_64__) +#define YUVTORGB_SETUP_AVX2(yuvconstants) \ + "vmovdqa (%[yuvconstants]),%%ymm8 \n" \ + "vmovdqa 32(%[yuvconstants]),%%ymm9 \n" \ + "vmovdqa 64(%[yuvconstants]),%%ymm10 \n" \ + "vmovdqa 96(%[yuvconstants]),%%ymm11 \n" \ + "vmovdqa 128(%[yuvconstants]),%%ymm12 \n" \ + "vmovdqa 160(%[yuvconstants]),%%ymm13 \n" \ + "vmovdqa 192(%[yuvconstants]),%%ymm14 \n" + +#define YUVTORGB16_AVX2(yuvconstants) \ + "vpmaddubsw %%ymm10,%%ymm0,%%ymm2 \n" \ + "vpmaddubsw %%ymm9,%%ymm0,%%ymm1 \n" \ + "vpmaddubsw %%ymm8,%%ymm0,%%ymm0 \n" \ + "vpsubw %%ymm2,%%ymm13,%%ymm2 \n" \ + "vpsubw %%ymm1,%%ymm12,%%ymm1 \n" \ + "vpsubw %%ymm0,%%ymm11,%%ymm0 \n" \ + "vpmulhuw %%ymm14,%%ymm4,%%ymm4 \n" \ + "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \ + "vpaddsw %%ymm4,%%ymm1,%%ymm1 \n" \ + "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n" + +#define YUVTORGB_REGS_AVX2 \ + "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", + +#else // Convert 16 pixels: 16 UV and 16 Y. + +#define YUVTORGB_SETUP_AVX2(yuvconstants) +#define YUVTORGB16_AVX2(yuvconstants) \ + "vpmaddubsw 64(%[yuvconstants]),%%ymm0,%%ymm2 \n" \ + "vpmaddubsw 32(%[yuvconstants]),%%ymm0,%%ymm1 \n" \ + "vpmaddubsw (%[yuvconstants]),%%ymm0,%%ymm0 \n" \ + "vmovdqu 160(%[yuvconstants]),%%ymm3 \n" \ + "vpsubw %%ymm2,%%ymm3,%%ymm2 \n" \ + "vmovdqu 128(%[yuvconstants]),%%ymm3 \n" \ + "vpsubw %%ymm1,%%ymm3,%%ymm1 \n" \ + "vmovdqu 96(%[yuvconstants]),%%ymm3 \n" \ + "vpsubw %%ymm0,%%ymm3,%%ymm0 \n" \ + "vpmulhuw 192(%[yuvconstants]),%%ymm4,%%ymm4 \n" \ + "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \ + "vpaddsw %%ymm4,%%ymm1,%%ymm1 \n" \ + "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n" +#define YUVTORGB_REGS_AVX2 +#endif + +#define YUVTORGB_AVX2(yuvconstants) \ + YUVTORGB16_AVX2(yuvconstants) \ + "vpsraw $0x6,%%ymm0,%%ymm0 \n" \ + "vpsraw $0x6,%%ymm1,%%ymm1 \n" \ + "vpsraw $0x6,%%ymm2,%%ymm2 \n" \ + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \ + "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \ + "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n" + +// Store 16 ARGB values. +#define STOREARGB_AVX2 \ + "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ + "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ + "vpunpcklbw %%ymm5,%%ymm2,%%ymm2 \n" \ + "vpermq $0xd8,%%ymm2,%%ymm2 \n" \ + "vpunpcklwd %%ymm2,%%ymm0,%%ymm1 \n" \ + "vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n" \ + "vmovdqu %%ymm1,(%[dst_argb]) \n" \ + "vmovdqu %%ymm0,0x20(%[dst_argb]) \n" \ + "lea 0x40(%[dst_argb]), %[dst_argb] \n" + +// Store 16 AR30 values. 
+#define STOREAR30_AVX2 \ + "vpsraw $0x4,%%ymm0,%%ymm0 \n" \ + "vpsraw $0x4,%%ymm1,%%ymm1 \n" \ + "vpsraw $0x4,%%ymm2,%%ymm2 \n" \ + "vpminsw %%ymm7,%%ymm0,%%ymm0 \n" \ + "vpminsw %%ymm7,%%ymm1,%%ymm1 \n" \ + "vpminsw %%ymm7,%%ymm2,%%ymm2 \n" \ + "vpmaxsw %%ymm6,%%ymm0,%%ymm0 \n" \ + "vpmaxsw %%ymm6,%%ymm1,%%ymm1 \n" \ + "vpmaxsw %%ymm6,%%ymm2,%%ymm2 \n" \ + "vpsllw $0x4,%%ymm2,%%ymm2 \n" \ + "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ + "vpermq $0xd8,%%ymm1,%%ymm1 \n" \ + "vpermq $0xd8,%%ymm2,%%ymm2 \n" \ + "vpunpckhwd %%ymm2,%%ymm0,%%ymm3 \n" \ + "vpunpcklwd %%ymm2,%%ymm0,%%ymm0 \n" \ + "vpunpckhwd %%ymm5,%%ymm1,%%ymm2 \n" \ + "vpunpcklwd %%ymm5,%%ymm1,%%ymm1 \n" \ + "vpslld $0xa,%%ymm1,%%ymm1 \n" \ + "vpslld $0xa,%%ymm2,%%ymm2 \n" \ + "vpor %%ymm1,%%ymm0,%%ymm0 \n" \ + "vpor %%ymm2,%%ymm3,%%ymm3 \n" \ + "vmovdqu %%ymm0,(%[dst_ar30]) \n" \ + "vmovdqu %%ymm3,0x20(%[dst_ar30]) \n" \ + "lea 0x40(%[dst_ar30]), %[dst_ar30] \n" + +#ifdef HAS_I444TOARGBROW_AVX2 +// 16 pixels +// 16 UV values with 16 Y producing 16 ARGB (64 bytes). +void OMITFP I444ToARGBRow_AVX2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP_AVX2(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + + LABELALIGN + "1: \n" + READYUV444_AVX2 + YUVTORGB_AVX2(yuvconstants) + STOREARGB_AVX2 + "sub $0x10,%[width] \n" + "jg 1b \n" + "vzeroupper \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS_AVX2 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); +} +#endif // HAS_I444TOARGBROW_AVX2 + +#if defined(HAS_I422TOARGBROW_AVX2) +// 16 pixels +// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). +void OMITFP I422ToARGBRow_AVX2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP_AVX2(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + + LABELALIGN + "1: \n" + READYUV422_AVX2 + YUVTORGB_AVX2(yuvconstants) + STOREARGB_AVX2 + "sub $0x10,%[width] \n" + "jg 1b \n" + + "vzeroupper \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS_AVX2 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); +} +#endif // HAS_I422TOARGBROW_AVX2 + +#if defined(HAS_I422TOAR30ROW_AVX2) +// 16 pixels +// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes). 
+void OMITFP I422ToAR30Row_AVX2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP_AVX2(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants + "vpsrlw $14,%%ymm5,%%ymm5 \n" + "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits + "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min + "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max + "vpsrlw $6,%%ymm7,%%ymm7 \n" + + LABELALIGN + "1: \n" + READYUV422_AVX2 + YUVTORGB16_AVX2(yuvconstants) + STOREAR30_AVX2 + "sub $0x10,%[width] \n" + "jg 1b \n" + + "vzeroupper \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_ar30]"+r"(dst_ar30), // %[dst_ar30] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS_AVX2 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" + ); +} +#endif // HAS_I422TOAR30ROW_AVX2 + +#if defined(HAS_I210TOARGBROW_AVX2) +// 16 pixels +// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). +void OMITFP I210ToARGBRow_AVX2(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP_AVX2(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + + LABELALIGN + "1: \n" + READYUV210_AVX2 + YUVTORGB_AVX2(yuvconstants) + STOREARGB_AVX2 + "sub $0x10,%[width] \n" + "jg 1b \n" + + "vzeroupper \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS_AVX2 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); +} +#endif // HAS_I210TOARGBROW_AVX2 + +#if defined(HAS_I210TOAR30ROW_AVX2) +// 16 pixels +// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes). +void OMITFP I210ToAR30Row_AVX2(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP_AVX2(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants + "vpsrlw $14,%%ymm5,%%ymm5 \n" + "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits + "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min + "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max + "vpsrlw $6,%%ymm7,%%ymm7 \n" + + LABELALIGN + "1: \n" + READYUV210_AVX2 + YUVTORGB16_AVX2(yuvconstants) + STOREAR30_AVX2 + "sub $0x10,%[width] \n" + "jg 1b \n" + + "vzeroupper \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_ar30]"+r"(dst_ar30), // %[dst_ar30] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS_AVX2 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); +} +#endif // HAS_I210TOAR30ROW_AVX2 + +#if defined(HAS_I422ALPHATOARGBROW_AVX2) +// 16 pixels +// 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB. 
+void OMITFP I422AlphaToARGBRow_AVX2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + const uint8_t* a_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + // clang-format off + asm volatile ( + YUVTORGB_SETUP_AVX2(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + + LABELALIGN + "1: \n" + READYUVA422_AVX2 + YUVTORGB_AVX2(yuvconstants) + STOREARGB_AVX2 + "subl $0x10,%[width] \n" + "jg 1b \n" + "vzeroupper \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [a_buf]"+r"(a_buf), // %[a_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] +#if defined(__i386__) + [width]"+m"(width) // %[width] +#else + [width]"+rm"(width) // %[width] +#endif + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS_AVX2 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); + // clang-format on +} +#endif // HAS_I422ALPHATOARGBROW_AVX2 + +#if defined(HAS_I422TORGBAROW_AVX2) +// 16 pixels +// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes). +void OMITFP I422ToRGBARow_AVX2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP_AVX2(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + + LABELALIGN + "1: \n" + READYUV422_AVX2 + YUVTORGB_AVX2(yuvconstants) + + // Step 3: Weave into RGBA + "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n" + "vpermq $0xd8,%%ymm1,%%ymm1 \n" + "vpunpcklbw %%ymm0,%%ymm5,%%ymm2 \n" + "vpermq $0xd8,%%ymm2,%%ymm2 \n" + "vpunpcklwd %%ymm1,%%ymm2,%%ymm0 \n" + "vpunpckhwd %%ymm1,%%ymm2,%%ymm1 \n" + "vmovdqu %%ymm0,(%[dst_argb]) \n" + "vmovdqu %%ymm1,0x20(%[dst_argb]) \n" + "lea 0x40(%[dst_argb]),%[dst_argb] \n" + "sub $0x10,%[width] \n" + "jg 1b \n" + "vzeroupper \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS_AVX2 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); +} +#endif // HAS_I422TORGBAROW_AVX2 + +#if defined(HAS_NV12TOARGBROW_AVX2) +// 16 pixels. +// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). +void OMITFP NV12ToARGBRow_AVX2(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + // clang-format off + asm volatile ( + YUVTORGB_SETUP_AVX2(yuvconstants) + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + + LABELALIGN + "1: \n" + READNV12_AVX2 + YUVTORGB_AVX2(yuvconstants) + STOREARGB_AVX2 + "sub $0x10,%[width] \n" + "jg 1b \n" + "vzeroupper \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [uv_buf]"+r"(uv_buf), // %[uv_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS_AVX2 + "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); + // clang-format on +} +#endif // HAS_NV12TOARGBROW_AVX2 + +#if defined(HAS_NV21TOARGBROW_AVX2) +// 16 pixels. +// 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). 
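// Note: NV21 carries its chroma as interleaved V,U pairs (NV12 is U,V), so the
// READNV21 / READNV21_AVX2 macros shuffle the bytes with kShuffleNV21 into the
// U,V order the shared YUVTORGB math expects; everything after that point is
// identical to the NV12 path.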
+void OMITFP NV21ToARGBRow_AVX2(const uint8_t* y_buf, + const uint8_t* vu_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + // clang-format off + asm volatile ( + YUVTORGB_SETUP_AVX2(yuvconstants) + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + + LABELALIGN + "1: \n" + READNV21_AVX2 + YUVTORGB_AVX2(yuvconstants) + STOREARGB_AVX2 + "sub $0x10,%[width] \n" + "jg 1b \n" + "vzeroupper \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [vu_buf]"+r"(vu_buf), // %[vu_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] + [kShuffleNV21]"m"(kShuffleNV21) + : "memory", "cc", YUVTORGB_REGS_AVX2 + "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); + // clang-format on +} +#endif // HAS_NV21TOARGBROW_AVX2 + +#if defined(HAS_YUY2TOARGBROW_AVX2) +// 16 pixels. +// 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes). +void OMITFP YUY2ToARGBRow_AVX2(const uint8_t* yuy2_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + // clang-format off + asm volatile ( + YUVTORGB_SETUP_AVX2(yuvconstants) + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + + LABELALIGN + "1: \n" + READYUY2_AVX2 + YUVTORGB_AVX2(yuvconstants) + STOREARGB_AVX2 + "sub $0x10,%[width] \n" + "jg 1b \n" + "vzeroupper \n" + : [yuy2_buf]"+r"(yuy2_buf), // %[yuy2_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] + [kShuffleYUY2Y]"m"(kShuffleYUY2Y), + [kShuffleYUY2UV]"m"(kShuffleYUY2UV) + : "memory", "cc", YUVTORGB_REGS_AVX2 + "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); + // clang-format on +} +#endif // HAS_YUY2TOARGBROW_AVX2 + +#if defined(HAS_UYVYTOARGBROW_AVX2) +// 16 pixels. +// 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes). +void OMITFP UYVYToARGBRow_AVX2(const uint8_t* uyvy_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + // clang-format off + asm volatile ( + YUVTORGB_SETUP_AVX2(yuvconstants) + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + + LABELALIGN + "1: \n" + READUYVY_AVX2 + YUVTORGB_AVX2(yuvconstants) + STOREARGB_AVX2 + "sub $0x10,%[width] \n" + "jg 1b \n" + "vzeroupper \n" + : [uyvy_buf]"+r"(uyvy_buf), // %[uyvy_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] + [kShuffleUYVYY]"m"(kShuffleUYVYY), + [kShuffleUYVYUV]"m"(kShuffleUYVYUV) + : "memory", "cc", YUVTORGB_REGS_AVX2 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); + // clang-format on +} +#endif // HAS_UYVYTOARGBROW_AVX2 + +#ifdef HAS_I400TOARGBROW_SSE2 +void I400ToARGBRow_SSE2(const uint8_t* y_buf, uint8_t* dst_argb, int width) { + asm volatile( + "mov $0x4a354a35,%%eax \n" // 4a35 = 18997 = 1.164 + "movd %%eax,%%xmm2 \n" + "pshufd $0x0,%%xmm2,%%xmm2 \n" + "mov $0x04880488,%%eax \n" // 0488 = 1160 = 1.164 * + // 16 + "movd %%eax,%%xmm3 \n" + "pshufd $0x0,%%xmm3,%%xmm3 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "pslld $0x18,%%xmm4 \n" + + LABELALIGN + "1: \n" + // Step 1: Scale Y contribution to 8 G values. 
G = (y - 16) * 1.164 + "movq (%0),%%xmm0 \n" + "lea 0x8(%0),%0 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "pmulhuw %%xmm2,%%xmm0 \n" + "psubusw %%xmm3,%%xmm0 \n" + "psrlw $6, %%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + + // Step 2: Weave into ARGB + "punpcklbw %%xmm0,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm0,%%xmm0 \n" + "punpckhwd %%xmm1,%%xmm1 \n" + "por %%xmm4,%%xmm0 \n" + "por %%xmm4,%%xmm1 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm1,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(y_buf), // %0 + "+r"(dst_argb), // %1 + "+rm"(width) // %2 + : + : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); +} +#endif // HAS_I400TOARGBROW_SSE2 + +#ifdef HAS_I400TOARGBROW_AVX2 +// 16 pixels of Y converted to 16 pixels of ARGB (64 bytes). +// note: vpunpcklbw mutates and vpackuswb unmutates. +void I400ToARGBRow_AVX2(const uint8_t* y_buf, uint8_t* dst_argb, int width) { + asm volatile( + "mov $0x4a354a35,%%eax \n" // 0488 = 1160 = 1.164 * + // 16 + "vmovd %%eax,%%xmm2 \n" + "vbroadcastss %%xmm2,%%ymm2 \n" + "mov $0x4880488,%%eax \n" // 4a35 = 18997 = 1.164 + "vmovd %%eax,%%xmm3 \n" + "vbroadcastss %%xmm3,%%ymm3 \n" + "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" + "vpslld $0x18,%%ymm4,%%ymm4 \n" + + LABELALIGN + "1: \n" + // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164 + "vmovdqu (%0),%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n" + "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" + "vpsubusw %%ymm3,%%ymm0,%%ymm0 \n" + "vpsrlw $0x6,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" + "vpunpcklbw %%ymm0,%%ymm0,%%ymm1 \n" + "vpermq $0xd8,%%ymm1,%%ymm1 \n" + "vpunpcklwd %%ymm1,%%ymm1,%%ymm0 \n" + "vpunpckhwd %%ymm1,%%ymm1,%%ymm1 \n" + "vpor %%ymm4,%%ymm0,%%ymm0 \n" + "vpor %%ymm4,%%ymm1,%%ymm1 \n" + "vmovdqu %%ymm0,(%1) \n" + "vmovdqu %%ymm1,0x20(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(y_buf), // %0 + "+r"(dst_argb), // %1 + "+rm"(width) // %2 + : + : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); +} +#endif // HAS_I400TOARGBROW_AVX2 + +#ifdef HAS_MIRRORROW_SSSE3 +// Shuffle table for reversing the bytes. +static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, + 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u}; + +void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) { + intptr_t temp_width = (intptr_t)(width); + asm volatile( + + "movdqa %3,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu -0x10(%0,%2,1),%%xmm0 \n" + "pshufb %%xmm5,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(temp_width) // %2 + : "m"(kShuffleMirror) // %3 + : "memory", "cc", "xmm0", "xmm5"); +} +#endif // HAS_MIRRORROW_SSSE3 + +#ifdef HAS_MIRRORROW_AVX2 +void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { + intptr_t temp_width = (intptr_t)(width); + asm volatile( + + "vbroadcastf128 %3,%%ymm5 \n" + + LABELALIGN + "1: \n" + "vmovdqu -0x20(%0,%2,1),%%ymm0 \n" + "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" + "vpermq $0x4e,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(temp_width) // %2 + : "m"(kShuffleMirror) // %3 + : "memory", "cc", "xmm0", "xmm5"); +} +#endif // HAS_MIRRORROW_AVX2 + +#ifdef HAS_MIRRORUVROW_SSSE3 +// Shuffle table for reversing the bytes of UV channels. 
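kShuffleMirrorUV below reverses eight interleaved U,V pairs, leaving the U bytes in the low half of the register and the V bytes in the high half, which MirrorUVRow_SSSE3 then stores to dst_u and dst_v. The scalar equivalent, where width is the number of UV pairs (a sketch, not the project's C fallback):

// Mirror an interleaved UV row while splitting it into separate U and V planes.
static void MirrorUVRow_Scalar(const uint8_t* src_uv,
                               uint8_t* dst_u, uint8_t* dst_v, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_u[x] = src_uv[(width - 1 - x) * 2 + 0];
    dst_v[x] = src_uv[(width - 1 - x) * 2 + 1];
  }
}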
+static const uvec8 kShuffleMirrorUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, + 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u}; +void MirrorUVRow_SSSE3(const uint8_t* src, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + intptr_t temp_width = (intptr_t)(width); + asm volatile( + "movdqa %4,%%xmm1 \n" + "lea -0x10(%0,%3,2),%0 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "lea -0x10(%0),%0 \n" + "pshufb %%xmm1,%%xmm0 \n" + "movlpd %%xmm0,(%1) \n" + "movhpd %%xmm0,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $8,%3 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(temp_width) // %3 + : "m"(kShuffleMirrorUV) // %4 + : "memory", "cc", "xmm0", "xmm1"); +} +#endif // HAS_MIRRORUVROW_SSSE3 + +#ifdef HAS_ARGBMIRRORROW_SSE2 + +void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { + intptr_t temp_width = (intptr_t)(width); + asm volatile( + + "lea -0x10(%0,%2,4),%0 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "pshufd $0x1b,%%xmm0,%%xmm0 \n" + "lea -0x10(%0),%0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(temp_width) // %2 + : + : "memory", "cc", "xmm0"); +} +#endif // HAS_ARGBMIRRORROW_SSE2 + +#ifdef HAS_ARGBMIRRORROW_AVX2 +// Shuffle table for reversing the bytes. +static const ulvec32 kARGBShuffleMirror_AVX2 = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u}; +void ARGBMirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { + intptr_t temp_width = (intptr_t)(width); + asm volatile( + + "vmovdqu %3,%%ymm5 \n" + + LABELALIGN + "1: \n" + "vpermd -0x20(%0,%2,4),%%ymm5,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(temp_width) // %2 + : "m"(kARGBShuffleMirror_AVX2) // %3 + : "memory", "cc", "xmm0", "xmm5"); +} +#endif // HAS_ARGBMIRRORROW_AVX2 + +#ifdef HAS_SPLITUVROW_AVX2 +void SplitUVRow_AVX2(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpsrlw $0x8,%%ymm5,%%ymm5 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpsrlw $0x8,%%ymm0,%%ymm2 \n" + "vpsrlw $0x8,%%ymm1,%%ymm3 \n" + "vpand %%ymm5,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm3,%%ymm2,%%ymm2 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm2,%%ymm2 \n" + "vmovdqu %%ymm0,(%1) \n" + "vmovdqu %%ymm2,0x00(%1,%2,1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_uv), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); +} +#endif // HAS_SPLITUVROW_AVX2 + +#ifdef HAS_SPLITUVROW_SSE2 +void SplitUVRow_SSE2(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "movdqa %%xmm1,%%xmm3 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "psrlw $0x8,%%xmm2 \n" + "psrlw $0x8,%%xmm3 \n" + "packuswb %%xmm3,%%xmm2 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm2,0x00(%1,%2,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_uv), // %0 + 
"+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); +} +#endif // HAS_SPLITUVROW_SSE2 + +#ifdef HAS_MERGEUVROW_AVX2 +void MergeUVRow_AVX2(const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uv, + int width) { + asm volatile( + + "sub %0,%1 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x00(%0,%1,1),%%ymm1 \n" + "lea 0x20(%0),%0 \n" + "vpunpcklbw %%ymm1,%%ymm0,%%ymm2 \n" + "vpunpckhbw %%ymm1,%%ymm0,%%ymm0 \n" + "vextractf128 $0x0,%%ymm2,(%2) \n" + "vextractf128 $0x0,%%ymm0,0x10(%2) \n" + "vextractf128 $0x1,%%ymm2,0x20(%2) \n" + "vextractf128 $0x1,%%ymm0,0x30(%2) \n" + "lea 0x40(%2),%2 \n" + "sub $0x20,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_u), // %0 + "+r"(src_v), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); +} +#endif // HAS_MERGEUVROW_AVX2 + +#ifdef HAS_MERGEUVROW_SSE2 +void MergeUVRow_SSE2(const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uv, + int width) { + asm volatile( + + "sub %0,%1 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%1,1),%%xmm1 \n" + "lea 0x10(%0),%0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "punpcklbw %%xmm1,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm2 \n" + "movdqu %%xmm0,(%2) \n" + "movdqu %%xmm2,0x10(%2) \n" + "lea 0x20(%2),%2 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_u), // %0 + "+r"(src_v), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); +} +#endif // HAS_MERGEUVROW_SSE2 + +// Use scale to convert lsb formats to msb, depending how many bits there are: +// 128 = 9 bits +// 64 = 10 bits +// 16 = 12 bits +// 1 = 16 bits +#ifdef HAS_MERGEUVROW_16_AVX2 +void MergeUVRow_16_AVX2(const uint16_t* src_u, + const uint16_t* src_v, + uint16_t* dst_uv, + int scale, + int width) { + // clang-format off + asm volatile ( + "vmovd %4,%%xmm3 \n" + "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n" + "vbroadcastss %%xmm3,%%ymm3 \n" + "sub %0,%1 \n" + + // 16 pixels per loop. + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu (%0,%1,1),%%ymm1 \n" + "add $0x20,%0 \n" + + "vpmullw %%ymm3,%%ymm0,%%ymm0 \n" + "vpmullw %%ymm3,%%ymm1,%%ymm1 \n" + "vpunpcklwd %%ymm1,%%ymm0,%%ymm2 \n" // mutates + "vpunpckhwd %%ymm1,%%ymm0,%%ymm0 \n" + "vextractf128 $0x0,%%ymm2,(%2) \n" + "vextractf128 $0x0,%%ymm0,0x10(%2) \n" + "vextractf128 $0x1,%%ymm2,0x20(%2) \n" + "vextractf128 $0x1,%%ymm0,0x30(%2) \n" + "add $0x40,%2 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_u), // %0 + "+r"(src_v), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 + : "r"(scale) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3"); + // clang-format on +} +#endif // HAS_MERGEUVROW_AVX2 + +// Use scale to convert lsb formats to msb, depending how many bits there are: +// 128 = 9 bits +// 64 = 10 bits +// 16 = 12 bits +// 1 = 16 bits +#ifdef HAS_MULTIPLYROW_16_AVX2 +void MultiplyRow_16_AVX2(const uint16_t* src_y, + uint16_t* dst_y, + int scale, + int width) { + // clang-format off + asm volatile ( + "vmovd %3,%%xmm3 \n" + "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n" + "vbroadcastss %%xmm3,%%ymm3 \n" + "sub %0,%1 \n" + + // 16 pixels per loop. 
+ LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vpmullw %%ymm3,%%ymm0,%%ymm0 \n" + "vpmullw %%ymm3,%%ymm1,%%ymm1 \n" + "vmovdqu %%ymm0,(%0,%1) \n" + "vmovdqu %%ymm1,0x20(%0,%1) \n" + "add $0x40,%0 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_y), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "r"(scale) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm3"); + // clang-format on +} +#endif // HAS_MULTIPLYROW_16_AVX2 + +// Use scale to convert lsb formats to msb, depending how many bits there are: +// 32768 = 9 bits +// 16384 = 10 bits +// 4096 = 12 bits +// 256 = 16 bits +void Convert16To8Row_SSSE3(const uint16_t* src_y, + uint8_t* dst_y, + int scale, + int width) { + // clang-format off + asm volatile ( + "movd %3,%%xmm2 \n" + "punpcklwd %%xmm2,%%xmm2 \n" + "pshufd $0x0,%%xmm2,%%xmm2 \n" + + // 32 pixels per loop. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "add $0x20,%0 \n" + "pmulhuw %%xmm2,%%xmm0 \n" + "pmulhuw %%xmm2,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "add $0x10,%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_y), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "r"(scale) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2"); + // clang-format on +} + +#ifdef HAS_CONVERT16TO8ROW_AVX2 +void Convert16To8Row_AVX2(const uint16_t* src_y, + uint8_t* dst_y, + int scale, + int width) { + // clang-format off + asm volatile ( + "vmovd %3,%%xmm2 \n" + "vpunpcklwd %%xmm2,%%xmm2,%%xmm2 \n" + "vbroadcastss %%xmm2,%%ymm2 \n" + + // 32 pixels per loop. + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "add $0x40,%0 \n" + "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" + "vpmulhuw %%ymm2,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" // mutates + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "add $0x20,%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_y), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "r"(scale) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2"); + // clang-format on +} +#endif // HAS_CONVERT16TO8ROW_AVX2 + +// Use scale to convert to lsb formats depending how many bits there are: +// 512 = 9 bits +// 1024 = 10 bits +// 4096 = 12 bits +// TODO(fbarchard): reduce to SSE2 +void Convert8To16Row_SSE2(const uint8_t* src_y, + uint16_t* dst_y, + int scale, + int width) { + // clang-format off + asm volatile ( + "movd %3,%%xmm2 \n" + "punpcklwd %%xmm2,%%xmm2 \n" + "pshufd $0x0,%%xmm2,%%xmm2 \n" + + // 32 pixels per loop. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm1 \n" + "add $0x10,%0 \n" + "pmulhuw %%xmm2,%%xmm0 \n" + "pmulhuw %%xmm2,%%xmm1 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm1,0x10(%1) \n" + "add $0x20,%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_y), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "r"(scale) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2"); + // clang-format on +} + +#ifdef HAS_CONVERT8TO16ROW_AVX2 +void Convert8To16Row_AVX2(const uint8_t* src_y, + uint16_t* dst_y, + int scale, + int width) { + // clang-format off + asm volatile ( + "vmovd %3,%%xmm2 \n" + "vpunpcklwd %%xmm2,%%xmm2,%%xmm2 \n" + "vbroadcastss %%xmm2,%%ymm2 \n" + + // 32 pixels per loop. 
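  // e.g. scale = 1024 for a 10-bit result: vpunpcklbw widens each byte v to
  // v * 257, and (v * 257 * 1024) >> 16 then maps 0..255 onto 0..1023.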
+ LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "add $0x20,%0 \n" + "vpunpckhbw %%ymm0,%%ymm0,%%ymm1 \n" + "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n" + "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" + "vpmulhuw %%ymm2,%%ymm1,%%ymm1 \n" + "vmovdqu %%ymm0,(%1) \n" + "vmovdqu %%ymm1,0x20(%1) \n" + "add $0x40,%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_y), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "r"(scale) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2"); + // clang-format on +} +#endif // HAS_CONVERT8TO16ROW_AVX2 + +#ifdef HAS_SPLITRGBROW_SSSE3 + +// Shuffle table for converting RGB to Planar. +static const uvec8 kShuffleMaskRGBToR0 = {0u, 3u, 6u, 9u, 12u, 15u, + 128u, 128u, 128u, 128u, 128u, 128u, + 128u, 128u, 128u, 128u}; +static const uvec8 kShuffleMaskRGBToR1 = {128u, 128u, 128u, 128u, 128u, 128u, + 2u, 5u, 8u, 11u, 14u, 128u, + 128u, 128u, 128u, 128u}; +static const uvec8 kShuffleMaskRGBToR2 = {128u, 128u, 128u, 128u, 128u, 128u, + 128u, 128u, 128u, 128u, 128u, 1u, + 4u, 7u, 10u, 13u}; + +static const uvec8 kShuffleMaskRGBToG0 = {1u, 4u, 7u, 10u, 13u, 128u, + 128u, 128u, 128u, 128u, 128u, 128u, + 128u, 128u, 128u, 128u}; +static const uvec8 kShuffleMaskRGBToG1 = {128u, 128u, 128u, 128u, 128u, 0u, + 3u, 6u, 9u, 12u, 15u, 128u, + 128u, 128u, 128u, 128u}; +static const uvec8 kShuffleMaskRGBToG2 = {128u, 128u, 128u, 128u, 128u, 128u, + 128u, 128u, 128u, 128u, 128u, 2u, + 5u, 8u, 11u, 14u}; + +static const uvec8 kShuffleMaskRGBToB0 = {2u, 5u, 8u, 11u, 14u, 128u, + 128u, 128u, 128u, 128u, 128u, 128u, + 128u, 128u, 128u, 128u}; +static const uvec8 kShuffleMaskRGBToB1 = {128u, 128u, 128u, 128u, 128u, 1u, + 4u, 7u, 10u, 13u, 128u, 128u, + 128u, 128u, 128u, 128u}; +static const uvec8 kShuffleMaskRGBToB2 = {128u, 128u, 128u, 128u, 128u, 128u, + 128u, 128u, 128u, 128u, 0u, 3u, + 6u, 9u, 12u, 15u}; + +void SplitRGBRow_SSSE3(const uint8_t* src_rgb, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + int width) { + asm volatile( + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "pshufb %5, %%xmm0 \n" + "pshufb %6, %%xmm1 \n" + "pshufb %7, %%xmm2 \n" + "por %%xmm1,%%xmm0 \n" + "por %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "pshufb %8, %%xmm0 \n" + "pshufb %9, %%xmm1 \n" + "pshufb %10, %%xmm2 \n" + "por %%xmm1,%%xmm0 \n" + "por %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "pshufb %11, %%xmm0 \n" + "pshufb %12, %%xmm1 \n" + "pshufb %13, %%xmm2 \n" + "por %%xmm1,%%xmm0 \n" + "por %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%3) \n" + "lea 0x10(%3),%3 \n" + "lea 0x30(%0),%0 \n" + "sub $0x10,%4 \n" + "jg 1b \n" + : "+r"(src_rgb), // %0 + "+r"(dst_r), // %1 + "+r"(dst_g), // %2 + "+r"(dst_b), // %3 + "+r"(width) // %4 + : "m"(kShuffleMaskRGBToR0), // %5 + "m"(kShuffleMaskRGBToR1), // %6 + "m"(kShuffleMaskRGBToR2), // %7 + "m"(kShuffleMaskRGBToG0), // %8 + "m"(kShuffleMaskRGBToG1), // %9 + "m"(kShuffleMaskRGBToG2), // %10 + "m"(kShuffleMaskRGBToB0), // %11 + "m"(kShuffleMaskRGBToB1), // %12 + "m"(kShuffleMaskRGBToB2) // %13 + : "memory", "cc", "xmm0", "xmm1", "xmm2"); +} +#endif // HAS_SPLITRGBROW_SSSE3 + +#ifdef HAS_MERGERGBROW_SSSE3 + +// Shuffle table for converting RGB to Planar. 
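The masks above pull every 3rd byte of packed RGB into one plane; the masks below do the inverse, interleaving three planes back into packed RGB for MergeRGBRow_SSSE3. Scalar equivalents of both directions (sketches, not the library's C reference versions):

// Split packed RGB into three planes: every 3rd byte goes to one plane.
static void SplitRGBRow_Scalar(const uint8_t* src_rgb, uint8_t* dst_r,
                               uint8_t* dst_g, uint8_t* dst_b, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_r[x] = src_rgb[x * 3 + 0];
    dst_g[x] = src_rgb[x * 3 + 1];
    dst_b[x] = src_rgb[x * 3 + 2];
  }
}

// Merge three planes back into packed RGB (the inverse interleave).
static void MergeRGBRow_Scalar(const uint8_t* src_r, const uint8_t* src_g,
                               const uint8_t* src_b, uint8_t* dst_rgb,
                               int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_rgb[x * 3 + 0] = src_r[x];
    dst_rgb[x * 3 + 1] = src_g[x];
    dst_rgb[x * 3 + 2] = src_b[x];
  }
}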
+static const uvec8 kShuffleMaskRToRGB0 = {0u, 128u, 128u, 1u, 128u, 128u, + 2u, 128u, 128u, 3u, 128u, 128u, + 4u, 128u, 128u, 5u}; +static const uvec8 kShuffleMaskGToRGB0 = {128u, 0u, 128u, 128u, 1u, 128u, + 128u, 2u, 128u, 128u, 3u, 128u, + 128u, 4u, 128u, 128u}; +static const uvec8 kShuffleMaskBToRGB0 = {128u, 128u, 0u, 128u, 128u, 1u, + 128u, 128u, 2u, 128u, 128u, 3u, + 128u, 128u, 4u, 128u}; + +static const uvec8 kShuffleMaskGToRGB1 = {5u, 128u, 128u, 6u, 128u, 128u, + 7u, 128u, 128u, 8u, 128u, 128u, + 9u, 128u, 128u, 10u}; +static const uvec8 kShuffleMaskBToRGB1 = {128u, 5u, 128u, 128u, 6u, 128u, + 128u, 7u, 128u, 128u, 8u, 128u, + 128u, 9u, 128u, 128u}; +static const uvec8 kShuffleMaskRToRGB1 = {128u, 128u, 6u, 128u, 128u, 7u, + 128u, 128u, 8u, 128u, 128u, 9u, + 128u, 128u, 10u, 128u}; + +static const uvec8 kShuffleMaskBToRGB2 = {10u, 128u, 128u, 11u, 128u, 128u, + 12u, 128u, 128u, 13u, 128u, 128u, + 14u, 128u, 128u, 15u}; +static const uvec8 kShuffleMaskRToRGB2 = {128u, 11u, 128u, 128u, 12u, 128u, + 128u, 13u, 128u, 128u, 14u, 128u, + 128u, 15u, 128u, 128u}; +static const uvec8 kShuffleMaskGToRGB2 = {128u, 128u, 11u, 128u, 128u, 12u, + 128u, 128u, 13u, 128u, 128u, 14u, + 128u, 128u, 15u, 128u}; + +void MergeRGBRow_SSSE3(const uint8_t* src_r, + const uint8_t* src_g, + const uint8_t* src_b, + uint8_t* dst_rgb, + int width) { + asm volatile( + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu (%1),%%xmm1 \n" + "movdqu (%2),%%xmm2 \n" + "pshufb %5, %%xmm0 \n" + "pshufb %6, %%xmm1 \n" + "pshufb %7, %%xmm2 \n" + "por %%xmm1,%%xmm0 \n" + "por %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%3) \n" + + "movdqu (%0),%%xmm0 \n" + "movdqu (%1),%%xmm1 \n" + "movdqu (%2),%%xmm2 \n" + "pshufb %8, %%xmm0 \n" + "pshufb %9, %%xmm1 \n" + "pshufb %10, %%xmm2 \n" + "por %%xmm1,%%xmm0 \n" + "por %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,16(%3) \n" + + "movdqu (%0),%%xmm0 \n" + "movdqu (%1),%%xmm1 \n" + "movdqu (%2),%%xmm2 \n" + "pshufb %11, %%xmm0 \n" + "pshufb %12, %%xmm1 \n" + "pshufb %13, %%xmm2 \n" + "por %%xmm1,%%xmm0 \n" + "por %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,32(%3) \n" + + "lea 0x10(%0),%0 \n" + "lea 0x10(%1),%1 \n" + "lea 0x10(%2),%2 \n" + "lea 0x30(%3),%3 \n" + "sub $0x10,%4 \n" + "jg 1b \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(dst_rgb), // %3 + "+r"(width) // %4 + : "m"(kShuffleMaskRToRGB0), // %5 + "m"(kShuffleMaskGToRGB0), // %6 + "m"(kShuffleMaskBToRGB0), // %7 + "m"(kShuffleMaskRToRGB1), // %8 + "m"(kShuffleMaskGToRGB1), // %9 + "m"(kShuffleMaskBToRGB1), // %10 + "m"(kShuffleMaskRToRGB2), // %11 + "m"(kShuffleMaskGToRGB2), // %12 + "m"(kShuffleMaskBToRGB2) // %13 + : "memory", "cc", "xmm0", "xmm1", "xmm2"); +} +#endif // HAS_MERGERGBROW_SSSE3 + +#ifdef HAS_COPYROW_SSE2 +void CopyRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "test $0xf,%0 \n" + "jne 2f \n" + "test $0xf,%1 \n" + "jne 2f \n" + + LABELALIGN + "1: \n" + "movdqa (%0),%%xmm0 \n" + "movdqa 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "movdqa %%xmm0,(%1) \n" + "movdqa %%xmm1,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "jmp 9f \n" + + LABELALIGN + "2: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm1,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 2b \n" + + LABELALIGN "9: \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1"); +} +#endif // HAS_COPYROW_SSE2 + +#ifdef HAS_COPYROW_AVX +void CopyRow_AVX(const 
uint8_t* src, uint8_t* dst, int width) { + asm volatile( + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vmovdqu %%ymm0,(%1) \n" + "vmovdqu %%ymm1,0x20(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x40,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1"); +} +#endif // HAS_COPYROW_AVX + +#ifdef HAS_COPYROW_ERMS +// Multiple of 1. +void CopyRow_ERMS(const uint8_t* src, uint8_t* dst, int width) { + size_t width_tmp = (size_t)(width); + asm volatile( + + "rep movsb \n" + : "+S"(src), // %0 + "+D"(dst), // %1 + "+c"(width_tmp) // %2 + : + : "memory", "cc"); +} +#endif // HAS_COPYROW_ERMS + +#ifdef HAS_ARGBCOPYALPHAROW_SSE2 +// width in pixels +void ARGBCopyAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "pcmpeqb %%xmm0,%%xmm0 \n" + "pslld $0x18,%%xmm0 \n" + "pcmpeqb %%xmm1,%%xmm1 \n" + "psrld $0x8,%%xmm1 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm2 \n" + "movdqu 0x10(%0),%%xmm3 \n" + "lea 0x20(%0),%0 \n" + "movdqu (%1),%%xmm4 \n" + "movdqu 0x10(%1),%%xmm5 \n" + "pand %%xmm0,%%xmm2 \n" + "pand %%xmm0,%%xmm3 \n" + "pand %%xmm1,%%xmm4 \n" + "pand %%xmm1,%%xmm5 \n" + "por %%xmm4,%%xmm2 \n" + "por %%xmm5,%%xmm3 \n" + "movdqu %%xmm2,(%1) \n" + "movdqu %%xmm3,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} +#endif // HAS_ARGBCOPYALPHAROW_SSE2 + +#ifdef HAS_ARGBCOPYALPHAROW_AVX2 +// width in pixels +void ARGBCopyAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n" + "vpsrld $0x8,%%ymm0,%%ymm0 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm1 \n" + "vmovdqu 0x20(%0),%%ymm2 \n" + "lea 0x40(%0),%0 \n" + "vpblendvb %%ymm0,(%1),%%ymm1,%%ymm1 \n" + "vpblendvb %%ymm0,0x20(%1),%%ymm2,%%ymm2 \n" + "vmovdqu %%ymm1,(%1) \n" + "vmovdqu %%ymm2,0x20(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); +} +#endif // HAS_ARGBCOPYALPHAROW_AVX2 + +#ifdef HAS_ARGBEXTRACTALPHAROW_SSE2 +// width in pixels +void ARGBExtractAlphaRow_SSE2(const uint8_t* src_argb, + uint8_t* dst_a, + int width) { + asm volatile( + + LABELALIGN + "1: \n" + "movdqu (%0), %%xmm0 \n" + "movdqu 0x10(%0), %%xmm1 \n" + "lea 0x20(%0), %0 \n" + "psrld $0x18, %%xmm0 \n" + "psrld $0x18, %%xmm1 \n" + "packssdw %%xmm1, %%xmm0 \n" + "packuswb %%xmm0, %%xmm0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1), %1 \n" + "sub $0x8, %2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_a), // %1 + "+rm"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1"); +} +#endif // HAS_ARGBEXTRACTALPHAROW_SSE2 + +#ifdef HAS_ARGBEXTRACTALPHAROW_AVX2 +static const uvec8 kShuffleAlphaShort_AVX2 = { + 3u, 128u, 128u, 128u, 7u, 128u, 128u, 128u, + 11u, 128u, 128u, 128u, 15u, 128u, 128u, 128u}; + +void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_a, + int width) { + asm volatile( + "vmovdqa %3,%%ymm4 \n" + "vbroadcastf128 %4,%%ymm5 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0), %%ymm0 \n" + "vmovdqu 0x20(%0), %%ymm1 \n" + "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" // vpsrld $0x18, %%ymm0 + "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" + "vmovdqu 0x40(%0), %%ymm2 \n" + "vmovdqu 0x60(%0), %%ymm3 \n" + "lea 0x80(%0), %0 \n" + "vpackssdw %%ymm1, %%ymm0, %%ymm0 \n" // mutates + 
"vpshufb %%ymm5,%%ymm2,%%ymm2 \n" + "vpshufb %%ymm5,%%ymm3,%%ymm3 \n" + "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n" // mutates + "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates. + "vpermd %%ymm0,%%ymm4,%%ymm0 \n" // unmutate. + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20, %2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb), // %0 + "+r"(dst_a), // %1 + "+rm"(width) // %2 + : "m"(kPermdARGBToY_AVX), // %3 + "m"(kShuffleAlphaShort_AVX2) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} +#endif // HAS_ARGBEXTRACTALPHAROW_AVX2 + +#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2 +// width in pixels +void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "pcmpeqb %%xmm0,%%xmm0 \n" + "pslld $0x18,%%xmm0 \n" + "pcmpeqb %%xmm1,%%xmm1 \n" + "psrld $0x8,%%xmm1 \n" + + LABELALIGN + "1: \n" + "movq (%0),%%xmm2 \n" + "lea 0x8(%0),%0 \n" + "punpcklbw %%xmm2,%%xmm2 \n" + "punpckhwd %%xmm2,%%xmm3 \n" + "punpcklwd %%xmm2,%%xmm2 \n" + "movdqu (%1),%%xmm4 \n" + "movdqu 0x10(%1),%%xmm5 \n" + "pand %%xmm0,%%xmm2 \n" + "pand %%xmm0,%%xmm3 \n" + "pand %%xmm1,%%xmm4 \n" + "pand %%xmm1,%%xmm5 \n" + "por %%xmm4,%%xmm2 \n" + "por %%xmm5,%%xmm3 \n" + "movdqu %%xmm2,(%1) \n" + "movdqu %%xmm3,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} +#endif // HAS_ARGBCOPYYTOALPHAROW_SSE2 + +#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2 +// width in pixels +void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n" + "vpsrld $0x8,%%ymm0,%%ymm0 \n" + + LABELALIGN + "1: \n" + "vpmovzxbd (%0),%%ymm1 \n" + "vpmovzxbd 0x8(%0),%%ymm2 \n" + "lea 0x10(%0),%0 \n" + "vpslld $0x18,%%ymm1,%%ymm1 \n" + "vpslld $0x18,%%ymm2,%%ymm2 \n" + "vpblendvb %%ymm0,(%1),%%ymm1,%%ymm1 \n" + "vpblendvb %%ymm0,0x20(%1),%%ymm2,%%ymm2 \n" + "vmovdqu %%ymm1,(%1) \n" + "vmovdqu %%ymm2,0x20(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); +} +#endif // HAS_ARGBCOPYYTOALPHAROW_AVX2 + +#ifdef HAS_SETROW_X86 +void SetRow_X86(uint8_t* dst, uint8_t v8, int width) { + size_t width_tmp = (size_t)(width >> 2); + const uint32_t v32 = v8 * 0x01010101u; // Duplicate byte to all bytes. 
+ asm volatile( + + "rep stosl \n" + : "+D"(dst), // %0 + "+c"(width_tmp) // %1 + : "a"(v32) // %2 + : "memory", "cc"); +} + +void SetRow_ERMS(uint8_t* dst, uint8_t v8, int width) { + size_t width_tmp = (size_t)(width); + asm volatile( + + "rep stosb \n" + : "+D"(dst), // %0 + "+c"(width_tmp) // %1 + : "a"(v8) // %2 + : "memory", "cc"); +} + +void ARGBSetRow_X86(uint8_t* dst_argb, uint32_t v32, int width) { + size_t width_tmp = (size_t)(width); + asm volatile( + + "rep stosl \n" + : "+D"(dst_argb), // %0 + "+c"(width_tmp) // %1 + : "a"(v32) // %2 + : "memory", "cc"); +} +#endif // HAS_SETROW_X86 + +#ifdef HAS_YUY2TOYROW_SSE2 +void YUY2ToYRow_SSE2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { + asm volatile( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm5"); +} + +void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2, + int stride_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x00(%0,%4,1),%%xmm2 \n" + "movdqu 0x10(%0,%4,1),%%xmm3 \n" + "lea 0x20(%0),%0 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pand %%xmm5,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movq %%xmm0,(%1) \n" + "movq %%xmm1,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : "r"((intptr_t)(stride_yuy2)) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); +} + +void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pand %%xmm5,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movq %%xmm0,(%1) \n" + "movq %%xmm1,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm5"); +} + +void UYVYToYRow_SSE2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { + asm volatile( + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1"); +} + +void UYVYToUVRow_SSE2(const uint8_t* src_uyvy, + int stride_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + 
"sub %1,%2 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x00(%0,%4,1),%%xmm2 \n" + "movdqu 0x10(%0,%4,1),%%xmm3 \n" + "lea 0x20(%0),%0 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pand %%xmm5,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movq %%xmm0,(%1) \n" + "movq %%xmm1,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : "r"((intptr_t)(stride_uyvy)) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); +} + +void UYVYToUV422Row_SSE2(const uint8_t* src_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pand %%xmm5,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movq %%xmm0,(%1) \n" + "movq %%xmm1,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm5"); +} +#endif // HAS_YUY2TOYROW_SSE2 + +#ifdef HAS_YUY2TOYROW_AVX2 +void YUY2ToYRow_AVX2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { + asm volatile( + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpsrlw $0x8,%%ymm5,%%ymm5 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpand %%ymm5,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm5"); +} + +void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2, + int stride_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpsrlw $0x8,%%ymm5,%%ymm5 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n" + "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm0,%%ymm1 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm1,%%ymm1 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vextractf128 $0x0,%%ymm1,(%1) \n" + "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : "r"((intptr_t)(stride_yuy2)) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm5"); +} + +void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpsrlw $0x8,%%ymm5,%%ymm5 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + 
"vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm0,%%ymm1 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm1,%%ymm1 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vextractf128 $0x0,%%ymm1,(%1) \n" + "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm5"); +} + +void UYVYToYRow_AVX2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { + asm volatile( + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm5"); +} +void UYVYToUVRow_AVX2(const uint8_t* src_uyvy, + int stride_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpsrlw $0x8,%%ymm5,%%ymm5 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n" + "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpand %%ymm5,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm0,%%ymm1 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm1,%%ymm1 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vextractf128 $0x0,%%ymm1,(%1) \n" + "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : "r"((intptr_t)(stride_uyvy)) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm5"); +} + +void UYVYToUV422Row_AVX2(const uint8_t* src_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpsrlw $0x8,%%ymm5,%%ymm5 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpand %%ymm5,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm0,%%ymm1 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm1,%%ymm1 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vextractf128 $0x0,%%ymm1,(%1) \n" + "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm5"); +} +#endif // HAS_YUY2TOYROW_AVX2 + +#ifdef HAS_ARGBBLENDROW_SSSE3 +// Shuffle table for isolating alpha. 
+static const uvec8 kShuffleAlpha = {3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80, + 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80}; + +// Blend 8 pixels at a time +void ARGBBlendRow_SSSE3(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + asm volatile( + "pcmpeqb %%xmm7,%%xmm7 \n" + "psrlw $0xf,%%xmm7 \n" + "pcmpeqb %%xmm6,%%xmm6 \n" + "psrlw $0x8,%%xmm6 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "psllw $0x8,%%xmm5 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "pslld $0x18,%%xmm4 \n" + "sub $0x4,%3 \n" + "jl 49f \n" + + // 4 pixel loop. + LABELALIGN + "40: \n" + "movdqu (%0),%%xmm3 \n" + "lea 0x10(%0),%0 \n" + "movdqa %%xmm3,%%xmm0 \n" + "pxor %%xmm4,%%xmm3 \n" + "movdqu (%1),%%xmm2 \n" + "pshufb %4,%%xmm3 \n" + "pand %%xmm6,%%xmm2 \n" + "paddw %%xmm7,%%xmm3 \n" + "pmullw %%xmm3,%%xmm2 \n" + "movdqu (%1),%%xmm1 \n" + "lea 0x10(%1),%1 \n" + "psrlw $0x8,%%xmm1 \n" + "por %%xmm4,%%xmm0 \n" + "pmullw %%xmm3,%%xmm1 \n" + "psrlw $0x8,%%xmm2 \n" + "paddusb %%xmm2,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "paddusb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x4,%3 \n" + "jge 40b \n" + + "49: \n" + "add $0x3,%3 \n" + "jl 99f \n" + + // 1 pixel loop. + "91: \n" + "movd (%0),%%xmm3 \n" + "lea 0x4(%0),%0 \n" + "movdqa %%xmm3,%%xmm0 \n" + "pxor %%xmm4,%%xmm3 \n" + "movd (%1),%%xmm2 \n" + "pshufb %4,%%xmm3 \n" + "pand %%xmm6,%%xmm2 \n" + "paddw %%xmm7,%%xmm3 \n" + "pmullw %%xmm3,%%xmm2 \n" + "movd (%1),%%xmm1 \n" + "lea 0x4(%1),%1 \n" + "psrlw $0x8,%%xmm1 \n" + "por %%xmm4,%%xmm0 \n" + "pmullw %%xmm3,%%xmm1 \n" + "psrlw $0x8,%%xmm2 \n" + "paddusb %%xmm2,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "paddusb %%xmm1,%%xmm0 \n" + "movd %%xmm0,(%2) \n" + "lea 0x4(%2),%2 \n" + "sub $0x1,%3 \n" + "jge 91b \n" + "99: \n" + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : "m"(kShuffleAlpha) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} +#endif // HAS_ARGBBLENDROW_SSSE3 + +#ifdef HAS_BLENDPLANEROW_SSSE3 +// Blend 8 pixels at a time. +// unsigned version of math +// =((A2*C2)+(B2*(255-C2))+255)/256 +// signed version of math +// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256 +void BlendPlaneRow_SSSE3(const uint8_t* src0, + const uint8_t* src1, + const uint8_t* alpha, + uint8_t* dst, + int width) { + asm volatile( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psllw $0x8,%%xmm5 \n" + "mov $0x80808080,%%eax \n" + "movd %%eax,%%xmm6 \n" + "pshufd $0x0,%%xmm6,%%xmm6 \n" + "mov $0x807f807f,%%eax \n" + "movd %%eax,%%xmm7 \n" + "pshufd $0x0,%%xmm7,%%xmm7 \n" + "sub %2,%0 \n" + "sub %2,%1 \n" + "sub %2,%3 \n" + + // 8 pixel loop. + LABELALIGN + "1: \n" + "movq (%2),%%xmm0 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "pxor %%xmm5,%%xmm0 \n" + "movq (%0,%2,1),%%xmm1 \n" + "movq (%1,%2,1),%%xmm2 \n" + "punpcklbw %%xmm2,%%xmm1 \n" + "psubb %%xmm6,%%xmm1 \n" + "pmaddubsw %%xmm1,%%xmm0 \n" + "paddw %%xmm7,%%xmm0 \n" + "psrlw $0x8,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,(%3,%2,1) \n" + "lea 0x8(%2),%2 \n" + "sub $0x8,%4 \n" + "jg 1b \n" + : "+r"(src0), // %0 + "+r"(src1), // %1 + "+r"(alpha), // %2 + "+r"(dst), // %3 + "+rm"(width) // %4 + ::"memory", + "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm5", "xmm6", "xmm7"); +} +#endif // HAS_BLENDPLANEROW_SSSE3 + +#ifdef HAS_BLENDPLANEROW_AVX2 +// Blend 32 pixels at a time. 
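+// Worked example of the blend math below (illustrative): with C2=192, A2=200,
+// B2=100 the unsigned form gives (200*192 + 100*63 + 255)/256 = 175.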
+// unsigned version of math +// =((A2*C2)+(B2*(255-C2))+255)/256 +// signed version of math +// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256 +void BlendPlaneRow_AVX2(const uint8_t* src0, + const uint8_t* src1, + const uint8_t* alpha, + uint8_t* dst, + int width) { + asm volatile( + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpsllw $0x8,%%ymm5,%%ymm5 \n" + "mov $0x80808080,%%eax \n" + "vmovd %%eax,%%xmm6 \n" + "vbroadcastss %%xmm6,%%ymm6 \n" + "mov $0x807f807f,%%eax \n" + "vmovd %%eax,%%xmm7 \n" + "vbroadcastss %%xmm7,%%ymm7 \n" + "sub %2,%0 \n" + "sub %2,%1 \n" + "sub %2,%3 \n" + + // 32 pixel loop. + LABELALIGN + "1: \n" + "vmovdqu (%2),%%ymm0 \n" + "vpunpckhbw %%ymm0,%%ymm0,%%ymm3 \n" + "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n" + "vpxor %%ymm5,%%ymm3,%%ymm3 \n" + "vpxor %%ymm5,%%ymm0,%%ymm0 \n" + "vmovdqu (%0,%2,1),%%ymm1 \n" + "vmovdqu (%1,%2,1),%%ymm2 \n" + "vpunpckhbw %%ymm2,%%ymm1,%%ymm4 \n" + "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n" + "vpsubb %%ymm6,%%ymm4,%%ymm4 \n" + "vpsubb %%ymm6,%%ymm1,%%ymm1 \n" + "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" + "vpmaddubsw %%ymm1,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm7,%%ymm3,%%ymm3 \n" + "vpaddw %%ymm7,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm3,%%ymm3 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm3,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%3,%2,1) \n" + "lea 0x20(%2),%2 \n" + "sub $0x20,%4 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src0), // %0 + "+r"(src1), // %1 + "+r"(alpha), // %2 + "+r"(dst), // %3 + "+rm"(width) // %4 + ::"memory", + "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} +#endif // HAS_BLENDPLANEROW_AVX2 + +#ifdef HAS_ARGBATTENUATEROW_SSSE3 +// Shuffle table duplicating alpha +static const uvec8 kShuffleAlpha0 = {3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, + 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u}; +static const uvec8 kShuffleAlpha1 = {11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u, + 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u}; +// Attenuate 4 pixels at a time. +void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_argb, + int width) { + asm volatile( + "pcmpeqb %%xmm3,%%xmm3 \n" + "pslld $0x18,%%xmm3 \n" + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" + + // 4 pixel loop. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "pshufb %%xmm4,%%xmm0 \n" + "movdqu (%0),%%xmm1 \n" + "punpcklbw %%xmm1,%%xmm1 \n" + "pmulhuw %%xmm1,%%xmm0 \n" + "movdqu (%0),%%xmm1 \n" + "pshufb %%xmm5,%%xmm1 \n" + "movdqu (%0),%%xmm2 \n" + "punpckhbw %%xmm2,%%xmm2 \n" + "pmulhuw %%xmm2,%%xmm1 \n" + "movdqu (%0),%%xmm2 \n" + "lea 0x10(%0),%0 \n" + "pand %%xmm3,%%xmm2 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "por %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "m"(kShuffleAlpha0), // %3 + "m"(kShuffleAlpha1) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} +#endif // HAS_ARGBATTENUATEROW_SSSE3 + +#ifdef HAS_ARGBATTENUATEROW_AVX2 +// Shuffle table duplicating alpha. +static const uvec8 kShuffleAlpha_AVX2 = {6u, 7u, 6u, 7u, 6u, 7u, + 128u, 128u, 14u, 15u, 14u, 15u, + 14u, 15u, 128u, 128u}; +// Attenuate 8 pixels at a time. +void ARGBAttenuateRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_argb, + int width) { + asm volatile( + "vbroadcastf128 %3,%%ymm4 \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpslld $0x18,%%ymm5,%%ymm5 \n" + "sub %0,%1 \n" + + // 8 pixel loop. 
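+      // The loop below scales B, G and R by alpha (roughly c * a / 255 via a
+      // 16-bit multiply-high) and keeps the original alpha byte.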
+ LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm6 \n" + "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n" + "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n" + "vpshufb %%ymm4,%%ymm0,%%ymm2 \n" + "vpshufb %%ymm4,%%ymm1,%%ymm3 \n" + "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" + "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" + "vpand %%ymm5,%%ymm6,%%ymm6 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpor %%ymm6,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,0x00(%0,%1,1) \n" + "lea 0x20(%0),%0 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "m"(kShuffleAlpha_AVX2) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); +} +#endif // HAS_ARGBATTENUATEROW_AVX2 + +#ifdef HAS_ARGBUNATTENUATEROW_SSE2 +// Unattenuate 4 pixels at a time. +void ARGBUnattenuateRow_SSE2(const uint8_t* src_argb, + uint8_t* dst_argb, + int width) { + uintptr_t alpha; + asm volatile( + // 4 pixel loop. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movzb 0x03(%0),%3 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "movd 0x00(%4,%3,4),%%xmm2 \n" + "movzb 0x07(%0),%3 \n" + "movd 0x00(%4,%3,4),%%xmm3 \n" + "pshuflw $0x40,%%xmm2,%%xmm2 \n" + "pshuflw $0x40,%%xmm3,%%xmm3 \n" + "movlhps %%xmm3,%%xmm2 \n" + "pmulhuw %%xmm2,%%xmm0 \n" + "movdqu (%0),%%xmm1 \n" + "movzb 0x0b(%0),%3 \n" + "punpckhbw %%xmm1,%%xmm1 \n" + "movd 0x00(%4,%3,4),%%xmm2 \n" + "movzb 0x0f(%0),%3 \n" + "movd 0x00(%4,%3,4),%%xmm3 \n" + "pshuflw $0x40,%%xmm2,%%xmm2 \n" + "pshuflw $0x40,%%xmm3,%%xmm3 \n" + "movlhps %%xmm3,%%xmm2 \n" + "pmulhuw %%xmm2,%%xmm1 \n" + "lea 0x10(%0),%0 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width), // %2 + "=&r"(alpha) // %3 + : "r"(fixed_invtbl8) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} +#endif // HAS_ARGBUNATTENUATEROW_SSE2 + +#ifdef HAS_ARGBUNATTENUATEROW_AVX2 +// Shuffle table duplicating alpha. +static const uvec8 kUnattenShuffleAlpha_AVX2 = { + 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u}; +// Unattenuate 8 pixels at a time. +void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_argb, + int width) { + uintptr_t alpha; + asm volatile( + "sub %0,%1 \n" + "vbroadcastf128 %5,%%ymm5 \n" + + // 8 pixel loop. 
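+      // The loop below fetches per-alpha fixed-point reciprocals from
+      // fixed_invtbl8 with scalar loads (standing in for vpgather), then
+      // scales B, G and R back up with a 16-bit multiply-high.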
+ LABELALIGN + "1: \n" + // replace VPGATHER + "movzb 0x03(%0),%3 \n" + "vmovd 0x00(%4,%3,4),%%xmm0 \n" + "movzb 0x07(%0),%3 \n" + "vmovd 0x00(%4,%3,4),%%xmm1 \n" + "movzb 0x0b(%0),%3 \n" + "vpunpckldq %%xmm1,%%xmm0,%%xmm6 \n" + "vmovd 0x00(%4,%3,4),%%xmm2 \n" + "movzb 0x0f(%0),%3 \n" + "vmovd 0x00(%4,%3,4),%%xmm3 \n" + "movzb 0x13(%0),%3 \n" + "vpunpckldq %%xmm3,%%xmm2,%%xmm7 \n" + "vmovd 0x00(%4,%3,4),%%xmm0 \n" + "movzb 0x17(%0),%3 \n" + "vmovd 0x00(%4,%3,4),%%xmm1 \n" + "movzb 0x1b(%0),%3 \n" + "vpunpckldq %%xmm1,%%xmm0,%%xmm0 \n" + "vmovd 0x00(%4,%3,4),%%xmm2 \n" + "movzb 0x1f(%0),%3 \n" + "vmovd 0x00(%4,%3,4),%%xmm3 \n" + "vpunpckldq %%xmm3,%%xmm2,%%xmm2 \n" + "vpunpcklqdq %%xmm7,%%xmm6,%%xmm3 \n" + "vpunpcklqdq %%xmm2,%%xmm0,%%xmm0 \n" + "vinserti128 $0x1,%%xmm0,%%ymm3,%%ymm3 \n" + // end of VPGATHER + + "vmovdqu (%0),%%ymm6 \n" + "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n" + "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n" + "vpunpcklwd %%ymm3,%%ymm3,%%ymm2 \n" + "vpunpckhwd %%ymm3,%%ymm3,%%ymm3 \n" + "vpshufb %%ymm5,%%ymm2,%%ymm2 \n" + "vpshufb %%ymm5,%%ymm3,%%ymm3 \n" + "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" + "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,0x00(%0,%1,1) \n" + "lea 0x20(%0),%0 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width), // %2 + "=&r"(alpha) // %3 + : "r"(fixed_invtbl8), // %4 + "m"(kUnattenShuffleAlpha_AVX2) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} +#endif // HAS_ARGBUNATTENUATEROW_AVX2 + +#ifdef HAS_ARGBGRAYROW_SSSE3 +// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels +void ARGBGrayRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, int width) { + asm volatile( + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" + + // 8 pixel loop. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "phaddw %%xmm1,%%xmm0 \n" + "paddw %%xmm5,%%xmm0 \n" + "psrlw $0x7,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movdqu (%0),%%xmm2 \n" + "movdqu 0x10(%0),%%xmm3 \n" + "lea 0x20(%0),%0 \n" + "psrld $0x18,%%xmm2 \n" + "psrld $0x18,%%xmm3 \n" + "packuswb %%xmm3,%%xmm2 \n" + "packuswb %%xmm2,%%xmm2 \n" + "movdqa %%xmm0,%%xmm3 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "punpcklbw %%xmm2,%%xmm3 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm3,%%xmm0 \n" + "punpckhwd %%xmm3,%%xmm1 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm1,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "m"(kARGBToYJ), // %3 + "m"(kAddYJ64) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} +#endif // HAS_ARGBGRAYROW_SSSE3 + +#ifdef HAS_ARGBSEPIAROW_SSSE3 +// b = (r * 35 + g * 68 + b * 17) >> 7 +// g = (r * 45 + g * 88 + b * 22) >> 7 +// r = (r * 50 + g * 98 + b * 24) >> 7 +// Constant for ARGB color to sepia tone +static const vec8 kARGBToSepiaB = {17, 68, 35, 0, 17, 68, 35, 0, + 17, 68, 35, 0, 17, 68, 35, 0}; + +static const vec8 kARGBToSepiaG = {22, 88, 45, 0, 22, 88, 45, 0, + 22, 88, 45, 0, 22, 88, 45, 0}; + +static const vec8 kARGBToSepiaR = {24, 98, 50, 0, 24, 98, 50, 0, + 24, 98, 50, 0, 24, 98, 50, 0}; + +// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. +void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width) { + asm volatile( + "movdqa %2,%%xmm2 \n" + "movdqa %3,%%xmm3 \n" + "movdqa %4,%%xmm4 \n" + + // 8 pixel loop. 
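+      // Worked example of the sepia weights above (illustrative): a grey
+      // pixel with b=g=r=128 maps to b=(128*120)>>7=120, g=(128*155)>>7=155,
+      // r=(128*172)>>7=172; brighter inputs saturate at 255 via packuswb.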
+ LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm6 \n" + "pmaddubsw %%xmm2,%%xmm0 \n" + "pmaddubsw %%xmm2,%%xmm6 \n" + "phaddw %%xmm6,%%xmm0 \n" + "psrlw $0x7,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movdqu (%0),%%xmm5 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm5 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "phaddw %%xmm1,%%xmm5 \n" + "psrlw $0x7,%%xmm5 \n" + "packuswb %%xmm5,%%xmm5 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "movdqu (%0),%%xmm5 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm5 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "phaddw %%xmm1,%%xmm5 \n" + "psrlw $0x7,%%xmm5 \n" + "packuswb %%xmm5,%%xmm5 \n" + "movdqu (%0),%%xmm6 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "psrld $0x18,%%xmm6 \n" + "psrld $0x18,%%xmm1 \n" + "packuswb %%xmm1,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "punpcklbw %%xmm6,%%xmm5 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm5,%%xmm0 \n" + "punpckhwd %%xmm5,%%xmm1 \n" + "movdqu %%xmm0,(%0) \n" + "movdqu %%xmm1,0x10(%0) \n" + "lea 0x20(%0),%0 \n" + "sub $0x8,%1 \n" + "jg 1b \n" + : "+r"(dst_argb), // %0 + "+r"(width) // %1 + : "m"(kARGBToSepiaB), // %2 + "m"(kARGBToSepiaG), // %3 + "m"(kARGBToSepiaR) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); +} +#endif // HAS_ARGBSEPIAROW_SSSE3 + +#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3 +// Tranform 8 ARGB pixels (32 bytes) with color matrix. +// Same as Sepia except matrix is provided. +void ARGBColorMatrixRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_argb, + const int8_t* matrix_argb, + int width) { + asm volatile( + "movdqu (%3),%%xmm5 \n" + "pshufd $0x00,%%xmm5,%%xmm2 \n" + "pshufd $0x55,%%xmm5,%%xmm3 \n" + "pshufd $0xaa,%%xmm5,%%xmm4 \n" + "pshufd $0xff,%%xmm5,%%xmm5 \n" + + // 8 pixel loop. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm7 \n" + "pmaddubsw %%xmm2,%%xmm0 \n" + "pmaddubsw %%xmm2,%%xmm7 \n" + "movdqu (%0),%%xmm6 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "phaddsw %%xmm7,%%xmm0 \n" + "phaddsw %%xmm1,%%xmm6 \n" + "psraw $0x6,%%xmm0 \n" + "psraw $0x6,%%xmm6 \n" + "packuswb %%xmm0,%%xmm0 \n" + "packuswb %%xmm6,%%xmm6 \n" + "punpcklbw %%xmm6,%%xmm0 \n" + "movdqu (%0),%%xmm1 \n" + "movdqu 0x10(%0),%%xmm7 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm7 \n" + "phaddsw %%xmm7,%%xmm1 \n" + "movdqu (%0),%%xmm6 \n" + "movdqu 0x10(%0),%%xmm7 \n" + "pmaddubsw %%xmm5,%%xmm6 \n" + "pmaddubsw %%xmm5,%%xmm7 \n" + "phaddsw %%xmm7,%%xmm6 \n" + "psraw $0x6,%%xmm1 \n" + "psraw $0x6,%%xmm6 \n" + "packuswb %%xmm1,%%xmm1 \n" + "packuswb %%xmm6,%%xmm6 \n" + "punpcklbw %%xmm6,%%xmm1 \n" + "movdqa %%xmm0,%%xmm6 \n" + "punpcklwd %%xmm1,%%xmm0 \n" + "punpckhwd %%xmm1,%%xmm6 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm6,0x10(%1) \n" + "lea 0x20(%0),%0 \n" + "lea 0x20(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(matrix_argb) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} +#endif // HAS_ARGBCOLORMATRIXROW_SSSE3 + +#ifdef HAS_ARGBQUANTIZEROW_SSE2 +// Quantize 4 ARGB pixels (16 bytes). 
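As a reading aid, a scalar sketch of the per-channel math the SSE2 loop below
implements (inferred from the instruction sequence; the helper name is
illustrative and not part of libyuv):

#include <stdint.h>

// Illustrative only: scale behaves as a 16-bit fixed-point factor, so
// (v * scale) >> 16 picks the quantization bucket, which is then widened by
// interval_size and shifted by interval_offset. The assembly additionally
// merges the original alpha bytes back into the result.
static inline uint8_t QuantizeChannel(uint8_t v, int scale, int interval_size,
                                      int interval_offset) {
  return (uint8_t)(((v * scale) >> 16) * interval_size + interval_offset);
}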
+void ARGBQuantizeRow_SSE2(uint8_t* dst_argb, + int scale, + int interval_size, + int interval_offset, + int width) { + asm volatile( + "movd %2,%%xmm2 \n" + "movd %3,%%xmm3 \n" + "movd %4,%%xmm4 \n" + "pshuflw $0x40,%%xmm2,%%xmm2 \n" + "pshufd $0x44,%%xmm2,%%xmm2 \n" + "pshuflw $0x40,%%xmm3,%%xmm3 \n" + "pshufd $0x44,%%xmm3,%%xmm3 \n" + "pshuflw $0x40,%%xmm4,%%xmm4 \n" + "pshufd $0x44,%%xmm4,%%xmm4 \n" + "pxor %%xmm5,%%xmm5 \n" + "pcmpeqb %%xmm6,%%xmm6 \n" + "pslld $0x18,%%xmm6 \n" + + // 4 pixel loop. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "pmulhuw %%xmm2,%%xmm0 \n" + "movdqu (%0),%%xmm1 \n" + "punpckhbw %%xmm5,%%xmm1 \n" + "pmulhuw %%xmm2,%%xmm1 \n" + "pmullw %%xmm3,%%xmm0 \n" + "movdqu (%0),%%xmm7 \n" + "pmullw %%xmm3,%%xmm1 \n" + "pand %%xmm6,%%xmm7 \n" + "paddw %%xmm4,%%xmm0 \n" + "paddw %%xmm4,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "por %%xmm7,%%xmm0 \n" + "movdqu %%xmm0,(%0) \n" + "lea 0x10(%0),%0 \n" + "sub $0x4,%1 \n" + "jg 1b \n" + : "+r"(dst_argb), // %0 + "+r"(width) // %1 + : "r"(scale), // %2 + "r"(interval_size), // %3 + "r"(interval_offset) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} +#endif // HAS_ARGBQUANTIZEROW_SSE2 + +#ifdef HAS_ARGBSHADEROW_SSE2 +// Shade 4 pixels at a time by specified value. +void ARGBShadeRow_SSE2(const uint8_t* src_argb, + uint8_t* dst_argb, + int width, + uint32_t value) { + asm volatile( + "movd %3,%%xmm2 \n" + "punpcklbw %%xmm2,%%xmm2 \n" + "punpcklqdq %%xmm2,%%xmm2 \n" + + // 4 pixel loop. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm1 \n" + "pmulhuw %%xmm2,%%xmm0 \n" + "pmulhuw %%xmm2,%%xmm1 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(value) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2"); +} +#endif // HAS_ARGBSHADEROW_SSE2 + +#ifdef HAS_ARGBMULTIPLYROW_SSE2 +// Multiply 2 rows of ARGB pixels together, 4 pixels at a time. +void ARGBMultiplyRow_SSE2(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + asm volatile( + + "pxor %%xmm5,%%xmm5 \n" + + // 4 pixel loop. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "movdqu (%1),%%xmm2 \n" + "lea 0x10(%1),%1 \n" + "movdqu %%xmm0,%%xmm1 \n" + "movdqu %%xmm2,%%xmm3 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "punpckhbw %%xmm5,%%xmm3 \n" + "pmulhuw %%xmm2,%%xmm0 \n" + "pmulhuw %%xmm3,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x4,%3 \n" + "jg 1b \n" + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); +} +#endif // HAS_ARGBMULTIPLYROW_SSE2 + +#ifdef HAS_ARGBMULTIPLYROW_AVX2 +// Multiply 2 rows of ARGB pixels together, 8 pixels at a time. +void ARGBMultiplyRow_AVX2(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + asm volatile( + + "vpxor %%ymm5,%%ymm5,%%ymm5 \n" + + // 4 pixel loop. 
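+      // (This AVX2 path handles 8 ARGB pixels per iteration.) Per channel the
+      // result is roughly (src0 * src1) / 255: one operand is widened by
+      // duplicating each byte, then an unsigned multiply-high is taken.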
+ LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm1 \n" + "lea 0x20(%0),%0 \n" + "vmovdqu (%1),%%ymm3 \n" + "lea 0x20(%1),%1 \n" + "vpunpcklbw %%ymm1,%%ymm1,%%ymm0 \n" + "vpunpckhbw %%ymm1,%%ymm1,%%ymm1 \n" + "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n" + "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n" + "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" + "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%2) \n" + "lea 0x20(%2),%2 \n" + "sub $0x8,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "memory", "cc" +#if defined(__AVX2__) + , + "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" +#endif + ); +} +#endif // HAS_ARGBMULTIPLYROW_AVX2 + +#ifdef HAS_ARGBADDROW_SSE2 +// Add 2 rows of ARGB pixels together, 4 pixels at a time. +void ARGBAddRow_SSE2(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + asm volatile( + // 4 pixel loop. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "movdqu (%1),%%xmm1 \n" + "lea 0x10(%1),%1 \n" + "paddusb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x4,%3 \n" + "jg 1b \n" + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1"); +} +#endif // HAS_ARGBADDROW_SSE2 + +#ifdef HAS_ARGBADDROW_AVX2 +// Add 2 rows of ARGB pixels together, 4 pixels at a time. +void ARGBAddRow_AVX2(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + asm volatile( + // 4 pixel loop. + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "lea 0x20(%0),%0 \n" + "vpaddusb (%1),%%ymm0,%%ymm0 \n" + "lea 0x20(%1),%1 \n" + "vmovdqu %%ymm0,(%2) \n" + "lea 0x20(%2),%2 \n" + "sub $0x8,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0"); +} +#endif // HAS_ARGBADDROW_AVX2 + +#ifdef HAS_ARGBSUBTRACTROW_SSE2 +// Subtract 2 rows of ARGB pixels, 4 pixels at a time. +void ARGBSubtractRow_SSE2(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + asm volatile( + // 4 pixel loop. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "movdqu (%1),%%xmm1 \n" + "lea 0x10(%1),%1 \n" + "psubusb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x4,%3 \n" + "jg 1b \n" + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1"); +} +#endif // HAS_ARGBSUBTRACTROW_SSE2 + +#ifdef HAS_ARGBSUBTRACTROW_AVX2 +// Subtract 2 rows of ARGB pixels, 8 pixels at a time. +void ARGBSubtractRow_AVX2(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + asm volatile( + // 4 pixel loop. 
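+      // (8 ARGB pixels per iteration; vpsubusb below is a saturating per-byte
+      // subtract, so results clamp at 0.)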
+ LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "lea 0x20(%0),%0 \n" + "vpsubusb (%1),%%ymm0,%%ymm0 \n" + "lea 0x20(%1),%1 \n" + "vmovdqu %%ymm0,(%2) \n" + "lea 0x20(%2),%2 \n" + "sub $0x8,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0"); +} +#endif // HAS_ARGBSUBTRACTROW_AVX2 + +#ifdef HAS_SOBELXROW_SSE2 +// SobelX as a matrix is +// -1 0 1 +// -2 0 2 +// -1 0 1 +void SobelXRow_SSE2(const uint8_t* src_y0, + const uint8_t* src_y1, + const uint8_t* src_y2, + uint8_t* dst_sobelx, + int width) { + asm volatile( + "sub %0,%1 \n" + "sub %0,%2 \n" + "sub %0,%3 \n" + "pxor %%xmm5,%%xmm5 \n" + + // 8 pixel loop. + LABELALIGN + "1: \n" + "movq (%0),%%xmm0 \n" + "movq 0x2(%0),%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "punpcklbw %%xmm5,%%xmm1 \n" + "psubw %%xmm1,%%xmm0 \n" + "movq 0x00(%0,%1,1),%%xmm1 \n" + "movq 0x02(%0,%1,1),%%xmm2 \n" + "punpcklbw %%xmm5,%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "psubw %%xmm2,%%xmm1 \n" + "movq 0x00(%0,%2,1),%%xmm2 \n" + "movq 0x02(%0,%2,1),%%xmm3 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "punpcklbw %%xmm5,%%xmm3 \n" + "psubw %%xmm3,%%xmm2 \n" + "paddw %%xmm2,%%xmm0 \n" + "paddw %%xmm1,%%xmm0 \n" + "paddw %%xmm1,%%xmm0 \n" + "pxor %%xmm1,%%xmm1 \n" + "psubw %%xmm0,%%xmm1 \n" + "pmaxsw %%xmm1,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,0x00(%0,%3,1) \n" + "lea 0x8(%0),%0 \n" + "sub $0x8,%4 \n" + "jg 1b \n" + : "+r"(src_y0), // %0 + "+r"(src_y1), // %1 + "+r"(src_y2), // %2 + "+r"(dst_sobelx), // %3 + "+r"(width) // %4 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); +} +#endif // HAS_SOBELXROW_SSE2 + +#ifdef HAS_SOBELYROW_SSE2 +// SobelY as a matrix is +// -1 -2 -1 +// 0 0 0 +// 1 2 1 +void SobelYRow_SSE2(const uint8_t* src_y0, + const uint8_t* src_y1, + uint8_t* dst_sobely, + int width) { + asm volatile( + "sub %0,%1 \n" + "sub %0,%2 \n" + "pxor %%xmm5,%%xmm5 \n" + + // 8 pixel loop. + LABELALIGN + "1: \n" + "movq (%0),%%xmm0 \n" + "movq 0x00(%0,%1,1),%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "punpcklbw %%xmm5,%%xmm1 \n" + "psubw %%xmm1,%%xmm0 \n" + "movq 0x1(%0),%%xmm1 \n" + "movq 0x01(%0,%1,1),%%xmm2 \n" + "punpcklbw %%xmm5,%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "psubw %%xmm2,%%xmm1 \n" + "movq 0x2(%0),%%xmm2 \n" + "movq 0x02(%0,%1,1),%%xmm3 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "punpcklbw %%xmm5,%%xmm3 \n" + "psubw %%xmm3,%%xmm2 \n" + "paddw %%xmm2,%%xmm0 \n" + "paddw %%xmm1,%%xmm0 \n" + "paddw %%xmm1,%%xmm0 \n" + "pxor %%xmm1,%%xmm1 \n" + "psubw %%xmm0,%%xmm1 \n" + "pmaxsw %%xmm1,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,0x00(%0,%2,1) \n" + "lea 0x8(%0),%0 \n" + "sub $0x8,%3 \n" + "jg 1b \n" + : "+r"(src_y0), // %0 + "+r"(src_y1), // %1 + "+r"(dst_sobely), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); +} +#endif // HAS_SOBELYROW_SSE2 + +#ifdef HAS_SOBELROW_SSE2 +// Adds Sobel X and Sobel Y and stores Sobel into ARGB. +// A = 255 +// R = Sobel +// G = Sobel +// B = Sobel +void SobelRow_SSE2(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, + int width) { + asm volatile( + "sub %0,%1 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "pslld $0x18,%%xmm5 \n" + + // 8 pixel loop. 
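+      // The loop adds the X and Y magnitudes with byte saturation, then
+      // replicates the value into B, G and R while xmm5 forces A to 255.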
+ LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%1,1),%%xmm1 \n" + "lea 0x10(%0),%0 \n" + "paddusb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "punpcklbw %%xmm0,%%xmm2 \n" + "punpckhbw %%xmm0,%%xmm0 \n" + "movdqa %%xmm2,%%xmm1 \n" + "punpcklwd %%xmm2,%%xmm1 \n" + "punpckhwd %%xmm2,%%xmm2 \n" + "por %%xmm5,%%xmm1 \n" + "por %%xmm5,%%xmm2 \n" + "movdqa %%xmm0,%%xmm3 \n" + "punpcklwd %%xmm0,%%xmm3 \n" + "punpckhwd %%xmm0,%%xmm0 \n" + "por %%xmm5,%%xmm3 \n" + "por %%xmm5,%%xmm0 \n" + "movdqu %%xmm1,(%2) \n" + "movdqu %%xmm2,0x10(%2) \n" + "movdqu %%xmm3,0x20(%2) \n" + "movdqu %%xmm0,0x30(%2) \n" + "lea 0x40(%2),%2 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_sobelx), // %0 + "+r"(src_sobely), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); +} +#endif // HAS_SOBELROW_SSE2 + +#ifdef HAS_SOBELTOPLANEROW_SSE2 +// Adds Sobel X and Sobel Y and stores Sobel into a plane. +void SobelToPlaneRow_SSE2(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_y, + int width) { + asm volatile( + "sub %0,%1 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "pslld $0x18,%%xmm5 \n" + + // 8 pixel loop. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%1,1),%%xmm1 \n" + "lea 0x10(%0),%0 \n" + "paddusb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_sobelx), // %0 + "+r"(src_sobely), // %1 + "+r"(dst_y), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1"); +} +#endif // HAS_SOBELTOPLANEROW_SSE2 + +#ifdef HAS_SOBELXYROW_SSE2 +// Mixes Sobel X, Sobel Y and Sobel into ARGB. +// A = 255 +// R = Sobel X +// G = Sobel +// B = Sobel Y +void SobelXYRow_SSE2(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, + int width) { + asm volatile( + "sub %0,%1 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + + // 8 pixel loop. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%1,1),%%xmm1 \n" + "lea 0x10(%0),%0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "paddusb %%xmm1,%%xmm2 \n" + "movdqa %%xmm0,%%xmm3 \n" + "punpcklbw %%xmm5,%%xmm3 \n" + "punpckhbw %%xmm5,%%xmm0 \n" + "movdqa %%xmm1,%%xmm4 \n" + "punpcklbw %%xmm2,%%xmm4 \n" + "punpckhbw %%xmm2,%%xmm1 \n" + "movdqa %%xmm4,%%xmm6 \n" + "punpcklwd %%xmm3,%%xmm6 \n" + "punpckhwd %%xmm3,%%xmm4 \n" + "movdqa %%xmm1,%%xmm7 \n" + "punpcklwd %%xmm0,%%xmm7 \n" + "punpckhwd %%xmm0,%%xmm1 \n" + "movdqu %%xmm6,(%2) \n" + "movdqu %%xmm4,0x10(%2) \n" + "movdqu %%xmm7,0x20(%2) \n" + "movdqu %%xmm1,0x30(%2) \n" + "lea 0x40(%2),%2 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_sobelx), // %0 + "+r"(src_sobely), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} +#endif // HAS_SOBELXYROW_SSE2 + +#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2 +// Creates a table of cumulative sums where each value is a sum of all values +// above and to the left of the value, inclusive of the value. +void ComputeCumulativeSumRow_SSE2(const uint8_t* row, + int32_t* cumsum, + const int32_t* previous_cumsum, + int width) { + asm volatile( + "pxor %%xmm0,%%xmm0 \n" + "pxor %%xmm1,%%xmm1 \n" + "sub $0x4,%3 \n" + "jl 49f \n" + "test $0xf,%1 \n" + "jne 49f \n" + + // 4 pixel loop. 
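+      // xmm0 keeps the running per-channel sum across this row; each output
+      // is that running sum plus the entry above it from previous_cumsum,
+      // i.e. cumsum[x] = sum(row[0..x]) + previous_cumsum[x].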
+ LABELALIGN + "40: \n" + "movdqu (%0),%%xmm2 \n" + "lea 0x10(%0),%0 \n" + "movdqa %%xmm2,%%xmm4 \n" + "punpcklbw %%xmm1,%%xmm2 \n" + "movdqa %%xmm2,%%xmm3 \n" + "punpcklwd %%xmm1,%%xmm2 \n" + "punpckhwd %%xmm1,%%xmm3 \n" + "punpckhbw %%xmm1,%%xmm4 \n" + "movdqa %%xmm4,%%xmm5 \n" + "punpcklwd %%xmm1,%%xmm4 \n" + "punpckhwd %%xmm1,%%xmm5 \n" + "paddd %%xmm2,%%xmm0 \n" + "movdqu (%2),%%xmm2 \n" + "paddd %%xmm0,%%xmm2 \n" + "paddd %%xmm3,%%xmm0 \n" + "movdqu 0x10(%2),%%xmm3 \n" + "paddd %%xmm0,%%xmm3 \n" + "paddd %%xmm4,%%xmm0 \n" + "movdqu 0x20(%2),%%xmm4 \n" + "paddd %%xmm0,%%xmm4 \n" + "paddd %%xmm5,%%xmm0 \n" + "movdqu 0x30(%2),%%xmm5 \n" + "lea 0x40(%2),%2 \n" + "paddd %%xmm0,%%xmm5 \n" + "movdqu %%xmm2,(%1) \n" + "movdqu %%xmm3,0x10(%1) \n" + "movdqu %%xmm4,0x20(%1) \n" + "movdqu %%xmm5,0x30(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x4,%3 \n" + "jge 40b \n" + + "49: \n" + "add $0x3,%3 \n" + "jl 19f \n" + + // 1 pixel loop. + LABELALIGN + "10: \n" + "movd (%0),%%xmm2 \n" + "lea 0x4(%0),%0 \n" + "punpcklbw %%xmm1,%%xmm2 \n" + "punpcklwd %%xmm1,%%xmm2 \n" + "paddd %%xmm2,%%xmm0 \n" + "movdqu (%2),%%xmm2 \n" + "lea 0x10(%2),%2 \n" + "paddd %%xmm0,%%xmm2 \n" + "movdqu %%xmm2,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x1,%3 \n" + "jge 10b \n" + + "19: \n" + : "+r"(row), // %0 + "+r"(cumsum), // %1 + "+r"(previous_cumsum), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} +#endif // HAS_COMPUTECUMULATIVESUMROW_SSE2 + +#ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 +void CumulativeSumToAverageRow_SSE2(const int32_t* topleft, + const int32_t* botleft, + int width, + int area, + uint8_t* dst, + int count) { + asm volatile( + "movd %5,%%xmm5 \n" + "cvtdq2ps %%xmm5,%%xmm5 \n" + "rcpss %%xmm5,%%xmm4 \n" + "pshufd $0x0,%%xmm4,%%xmm4 \n" + "sub $0x4,%3 \n" + "jl 49f \n" + "cmpl $0x80,%5 \n" + "ja 40f \n" + + "pshufd $0x0,%%xmm5,%%xmm5 \n" + "pcmpeqb %%xmm6,%%xmm6 \n" + "psrld $0x10,%%xmm6 \n" + "cvtdq2ps %%xmm6,%%xmm6 \n" + "addps %%xmm6,%%xmm5 \n" + "mulps %%xmm4,%%xmm5 \n" + "cvtps2dq %%xmm5,%%xmm5 \n" + "packssdw %%xmm5,%%xmm5 \n" + + // 4 pixel small loop. 
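+      // Both loops form a box sum from the integral image, TL - TR - BL + BR,
+      // then scale by 1/area: areas <= 0x80 use the 16-bit reciprocal
+      // prepared above, larger areas take the float path at label 40.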
+ LABELALIGN + "4: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm3 \n" + "psubd 0x00(%0,%4,4),%%xmm0 \n" + "psubd 0x10(%0,%4,4),%%xmm1 \n" + "psubd 0x20(%0,%4,4),%%xmm2 \n" + "psubd 0x30(%0,%4,4),%%xmm3 \n" + "lea 0x40(%0),%0 \n" + "psubd (%1),%%xmm0 \n" + "psubd 0x10(%1),%%xmm1 \n" + "psubd 0x20(%1),%%xmm2 \n" + "psubd 0x30(%1),%%xmm3 \n" + "paddd 0x00(%1,%4,4),%%xmm0 \n" + "paddd 0x10(%1,%4,4),%%xmm1 \n" + "paddd 0x20(%1,%4,4),%%xmm2 \n" + "paddd 0x30(%1,%4,4),%%xmm3 \n" + "lea 0x40(%1),%1 \n" + "packssdw %%xmm1,%%xmm0 \n" + "packssdw %%xmm3,%%xmm2 \n" + "pmulhuw %%xmm5,%%xmm0 \n" + "pmulhuw %%xmm5,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x4,%3 \n" + "jge 4b \n" + "jmp 49f \n" + + // 4 pixel loop + LABELALIGN + "40: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm3 \n" + "psubd 0x00(%0,%4,4),%%xmm0 \n" + "psubd 0x10(%0,%4,4),%%xmm1 \n" + "psubd 0x20(%0,%4,4),%%xmm2 \n" + "psubd 0x30(%0,%4,4),%%xmm3 \n" + "lea 0x40(%0),%0 \n" + "psubd (%1),%%xmm0 \n" + "psubd 0x10(%1),%%xmm1 \n" + "psubd 0x20(%1),%%xmm2 \n" + "psubd 0x30(%1),%%xmm3 \n" + "paddd 0x00(%1,%4,4),%%xmm0 \n" + "paddd 0x10(%1,%4,4),%%xmm1 \n" + "paddd 0x20(%1,%4,4),%%xmm2 \n" + "paddd 0x30(%1,%4,4),%%xmm3 \n" + "lea 0x40(%1),%1 \n" + "cvtdq2ps %%xmm0,%%xmm0 \n" + "cvtdq2ps %%xmm1,%%xmm1 \n" + "mulps %%xmm4,%%xmm0 \n" + "mulps %%xmm4,%%xmm1 \n" + "cvtdq2ps %%xmm2,%%xmm2 \n" + "cvtdq2ps %%xmm3,%%xmm3 \n" + "mulps %%xmm4,%%xmm2 \n" + "mulps %%xmm4,%%xmm3 \n" + "cvtps2dq %%xmm0,%%xmm0 \n" + "cvtps2dq %%xmm1,%%xmm1 \n" + "cvtps2dq %%xmm2,%%xmm2 \n" + "cvtps2dq %%xmm3,%%xmm3 \n" + "packssdw %%xmm1,%%xmm0 \n" + "packssdw %%xmm3,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x4,%3 \n" + "jge 40b \n" + + "49: \n" + "add $0x3,%3 \n" + "jl 19f \n" + + // 1 pixel loop + LABELALIGN + "10: \n" + "movdqu (%0),%%xmm0 \n" + "psubd 0x00(%0,%4,4),%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "psubd (%1),%%xmm0 \n" + "paddd 0x00(%1,%4,4),%%xmm0 \n" + "lea 0x10(%1),%1 \n" + "cvtdq2ps %%xmm0,%%xmm0 \n" + "mulps %%xmm4,%%xmm0 \n" + "cvtps2dq %%xmm0,%%xmm0 \n" + "packssdw %%xmm0,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movd %%xmm0,(%2) \n" + "lea 0x4(%2),%2 \n" + "sub $0x1,%3 \n" + "jge 10b \n" + "19: \n" + : "+r"(topleft), // %0 + "+r"(botleft), // %1 + "+r"(dst), // %2 + "+rm"(count) // %3 + : "r"((intptr_t)(width)), // %4 + "rm"(area) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); +} +#endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 + +#ifdef HAS_ARGBAFFINEROW_SSE2 +// Copy ARGB pixels from source image with slope to a row of destination. 
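For orientation, a scalar sketch of the affine copy below (inferred from the
assembly; it assumes src_dudv holds {u, v, du, dv}, the starting source
coordinate followed by its per-pixel step, and the helper name is illustrative,
not part of libyuv):

#include <stdint.h>
#include <string.h>

// Illustrative only: step (u, v) by (du, dv) once per output pixel and copy
// one ARGB pixel from the source; the SSE2 loop does the same 4 at a time.
static void ARGBAffineRow_Sketch(const uint8_t* src_argb, int src_stride,
                                 uint8_t* dst_argb, const float* uv_dudv,
                                 int width) {
  float u = uv_dudv[0];
  float v = uv_dudv[1];
  for (int i = 0; i < width; ++i) {
    const int x = (int)u;  // truncation, matching cvttps2dq
    const int y = (int)v;
    memcpy(dst_argb + i * 4, src_argb + y * src_stride + x * 4, 4);
    u += uv_dudv[2];
    v += uv_dudv[3];
  }
}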
+LIBYUV_API +void ARGBAffineRow_SSE2(const uint8_t* src_argb, + int src_argb_stride, + uint8_t* dst_argb, + const float* src_dudv, + int width) { + intptr_t src_argb_stride_temp = src_argb_stride; + intptr_t temp; + asm volatile( + "movq (%3),%%xmm2 \n" + "movq 0x08(%3),%%xmm7 \n" + "shl $0x10,%1 \n" + "add $0x4,%1 \n" + "movd %1,%%xmm5 \n" + "sub $0x4,%4 \n" + "jl 49f \n" + + "pshufd $0x44,%%xmm7,%%xmm7 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" + "movdqa %%xmm2,%%xmm0 \n" + "addps %%xmm7,%%xmm0 \n" + "movlhps %%xmm0,%%xmm2 \n" + "movdqa %%xmm7,%%xmm4 \n" + "addps %%xmm4,%%xmm4 \n" + "movdqa %%xmm2,%%xmm3 \n" + "addps %%xmm4,%%xmm3 \n" + "addps %%xmm4,%%xmm4 \n" + + // 4 pixel loop + LABELALIGN + "40: \n" + "cvttps2dq %%xmm2,%%xmm0 \n" // x,y float->int first 2 + "cvttps2dq %%xmm3,%%xmm1 \n" // x,y float->int next 2 + "packssdw %%xmm1,%%xmm0 \n" // x, y as 8 shorts + "pmaddwd %%xmm5,%%xmm0 \n" // off = x*4 + y*stride + "movd %%xmm0,%k1 \n" + "pshufd $0x39,%%xmm0,%%xmm0 \n" + "movd %%xmm0,%k5 \n" + "pshufd $0x39,%%xmm0,%%xmm0 \n" + "movd 0x00(%0,%1,1),%%xmm1 \n" + "movd 0x00(%0,%5,1),%%xmm6 \n" + "punpckldq %%xmm6,%%xmm1 \n" + "addps %%xmm4,%%xmm2 \n" + "movq %%xmm1,(%2) \n" + "movd %%xmm0,%k1 \n" + "pshufd $0x39,%%xmm0,%%xmm0 \n" + "movd %%xmm0,%k5 \n" + "movd 0x00(%0,%1,1),%%xmm0 \n" + "movd 0x00(%0,%5,1),%%xmm6 \n" + "punpckldq %%xmm6,%%xmm0 \n" + "addps %%xmm4,%%xmm3 \n" + "movq %%xmm0,0x08(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x4,%4 \n" + "jge 40b \n" + + "49: \n" + "add $0x3,%4 \n" + "jl 19f \n" + + // 1 pixel loop + LABELALIGN + "10: \n" + "cvttps2dq %%xmm2,%%xmm0 \n" + "packssdw %%xmm0,%%xmm0 \n" + "pmaddwd %%xmm5,%%xmm0 \n" + "addps %%xmm7,%%xmm2 \n" + "movd %%xmm0,%k1 \n" + "movd 0x00(%0,%1,1),%%xmm0 \n" + "movd %%xmm0,(%2) \n" + "lea 0x04(%2),%2 \n" + "sub $0x1,%4 \n" + "jge 10b \n" + "19: \n" + : "+r"(src_argb), // %0 + "+r"(src_argb_stride_temp), // %1 + "+r"(dst_argb), // %2 + "+r"(src_dudv), // %3 + "+rm"(width), // %4 + "=&r"(temp) // %5 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} +#endif // HAS_ARGBAFFINEROW_SSE2 + +#ifdef HAS_INTERPOLATEROW_SSSE3 +// Bilinear filter 16x2 -> 16x1 +void InterpolateRow_SSSE3(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride, + int dst_width, + int source_y_fraction) { + asm volatile( + "sub %1,%0 \n" + "cmp $0x0,%3 \n" + "je 100f \n" + "cmp $0x80,%3 \n" + "je 50f \n" + + "movd %3,%%xmm0 \n" + "neg %3 \n" + "add $0x100,%3 \n" + "movd %3,%%xmm5 \n" + "punpcklbw %%xmm0,%%xmm5 \n" + "punpcklwd %%xmm5,%%xmm5 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" + "mov $0x80808080,%%eax \n" + "movd %%eax,%%xmm4 \n" + "pshufd $0x0,%%xmm4,%%xmm4 \n" + + // General purpose row blend. + LABELALIGN + "1: \n" + "movdqu (%1),%%xmm0 \n" + "movdqu 0x00(%1,%4,1),%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm2,%%xmm0 \n" + "punpckhbw %%xmm2,%%xmm1 \n" + "psubb %%xmm4,%%xmm0 \n" + "psubb %%xmm4,%%xmm1 \n" + "movdqa %%xmm5,%%xmm2 \n" + "movdqa %%xmm5,%%xmm3 \n" + "pmaddubsw %%xmm0,%%xmm2 \n" + "pmaddubsw %%xmm1,%%xmm3 \n" + "paddw %%xmm4,%%xmm2 \n" + "paddw %%xmm4,%%xmm3 \n" + "psrlw $0x8,%%xmm2 \n" + "psrlw $0x8,%%xmm3 \n" + "packuswb %%xmm3,%%xmm2 \n" + "movdqu %%xmm2,0x00(%1,%0,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + "jmp 99f \n" + + // Blend 50 / 50. 
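+    // (pavgb below averages the two source rows: (a + b + 1) >> 1 per byte.)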
+ LABELALIGN + "50: \n" + "movdqu (%1),%%xmm0 \n" + "movdqu 0x00(%1,%4,1),%%xmm1 \n" + "pavgb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,0x00(%1,%0,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 50b \n" + "jmp 99f \n" + + // Blend 100 / 0 - Copy row unchanged. + LABELALIGN + "100: \n" + "movdqu (%1),%%xmm0 \n" + "movdqu %%xmm0,0x00(%1,%0,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 100b \n" + + "99: \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+rm"(dst_width), // %2 + "+r"(source_y_fraction) // %3 + : "r"((intptr_t)(src_stride)) // %4 + : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} +#endif // HAS_INTERPOLATEROW_SSSE3 + +#ifdef HAS_INTERPOLATEROW_AVX2 +// Bilinear filter 32x2 -> 32x1 +void InterpolateRow_AVX2(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride, + int dst_width, + int source_y_fraction) { + asm volatile( + "cmp $0x0,%3 \n" + "je 100f \n" + "sub %1,%0 \n" + "cmp $0x80,%3 \n" + "je 50f \n" + + "vmovd %3,%%xmm0 \n" + "neg %3 \n" + "add $0x100,%3 \n" + "vmovd %3,%%xmm5 \n" + "vpunpcklbw %%xmm0,%%xmm5,%%xmm5 \n" + "vpunpcklwd %%xmm5,%%xmm5,%%xmm5 \n" + "vbroadcastss %%xmm5,%%ymm5 \n" + "mov $0x80808080,%%eax \n" + "vmovd %%eax,%%xmm4 \n" + "vbroadcastss %%xmm4,%%ymm4 \n" + + // General purpose row blend. + LABELALIGN + "1: \n" + "vmovdqu (%1),%%ymm0 \n" + "vmovdqu 0x00(%1,%4,1),%%ymm2 \n" + "vpunpckhbw %%ymm2,%%ymm0,%%ymm1 \n" + "vpunpcklbw %%ymm2,%%ymm0,%%ymm0 \n" + "vpsubb %%ymm4,%%ymm1,%%ymm1 \n" + "vpsubb %%ymm4,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm1,%%ymm5,%%ymm1 \n" + "vpmaddubsw %%ymm0,%%ymm5,%%ymm0 \n" + "vpaddw %%ymm4,%%ymm1,%%ymm1 \n" + "vpaddw %%ymm4,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm1,%%ymm1 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,0x00(%1,%0,1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "jmp 99f \n" + + // Blend 50 / 50. + LABELALIGN + "50: \n" + "vmovdqu (%1),%%ymm0 \n" + "vpavgb 0x00(%1,%4,1),%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,0x00(%1,%0,1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 50b \n" + "jmp 99f \n" + + // Blend 100 / 0 - Copy row unchanged. + LABELALIGN + "100: \n" + "rep movsb \n" + "jmp 999f \n" + + "99: \n" + "vzeroupper \n" + "999: \n" + : "+D"(dst_ptr), // %0 + "+S"(src_ptr), // %1 + "+cm"(dst_width), // %2 + "+r"(source_y_fraction) // %3 + : "r"((intptr_t)(src_stride)) // %4 + : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm4", "xmm5"); +} +#endif // HAS_INTERPOLATEROW_AVX2 + +#ifdef HAS_ARGBSHUFFLEROW_SSSE3 +// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. +void ARGBShuffleRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_argb, + const uint8_t* shuffler, + int width) { + asm volatile( + + "movdqu (%3),%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pshufb %%xmm5,%%xmm0 \n" + "pshufb %%xmm5,%%xmm1 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm1,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(shuffler) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm5"); +} +#endif // HAS_ARGBSHUFFLEROW_SSSE3 + +#ifdef HAS_ARGBSHUFFLEROW_AVX2 +// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. 
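+// (The shuffler is a 16-byte pshufb control; e.g. {2,1,0,3, 6,5,4,7, ...}
+// swaps the B and R bytes of every pixel.)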
+void ARGBShuffleRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_argb, + const uint8_t* shuffler, + int width) { + asm volatile( + + "vbroadcastf128 (%3),%%ymm5 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" + "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" + "vmovdqu %%ymm0,(%1) \n" + "vmovdqu %%ymm1,0x20(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(shuffler) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm5"); +} +#endif // HAS_ARGBSHUFFLEROW_AVX2 + +#ifdef HAS_I422TOYUY2ROW_SSE2 +void I422ToYUY2Row_SSE2(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_yuy2, + int width) { + asm volatile( + + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "movq (%1),%%xmm2 \n" + "movq 0x00(%1,%2,1),%%xmm1 \n" + "add $0x8,%1 \n" + "punpcklbw %%xmm1,%%xmm2 \n" + "movdqu (%0),%%xmm0 \n" + "add $0x10,%0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm2,%%xmm0 \n" + "punpckhbw %%xmm2,%%xmm1 \n" + "movdqu %%xmm0,(%3) \n" + "movdqu %%xmm1,0x10(%3) \n" + "lea 0x20(%3),%3 \n" + "sub $0x10,%4 \n" + "jg 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_yuy2), // %3 + "+rm"(width) // %4 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); +} +#endif // HAS_I422TOYUY2ROW_SSE2 + +#ifdef HAS_I422TOUYVYROW_SSE2 +void I422ToUYVYRow_SSE2(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uyvy, + int width) { + asm volatile( + + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "movq (%1),%%xmm2 \n" + "movq 0x00(%1,%2,1),%%xmm1 \n" + "add $0x8,%1 \n" + "punpcklbw %%xmm1,%%xmm2 \n" + "movdqu (%0),%%xmm0 \n" + "movdqa %%xmm2,%%xmm1 \n" + "add $0x10,%0 \n" + "punpcklbw %%xmm0,%%xmm1 \n" + "punpckhbw %%xmm0,%%xmm2 \n" + "movdqu %%xmm1,(%3) \n" + "movdqu %%xmm2,0x10(%3) \n" + "lea 0x20(%3),%3 \n" + "sub $0x10,%4 \n" + "jg 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_uyvy), // %3 + "+rm"(width) // %4 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); +} +#endif // HAS_I422TOUYVYROW_SSE2 + +#ifdef HAS_I422TOYUY2ROW_AVX2 +void I422ToYUY2Row_AVX2(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_yuy2, + int width) { + asm volatile( + + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "vpmovzxbw (%1),%%ymm1 \n" + "vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n" + "add $0x10,%1 \n" + "vpsllw $0x8,%%ymm2,%%ymm2 \n" + "vpor %%ymm1,%%ymm2,%%ymm2 \n" + "vmovdqu (%0),%%ymm0 \n" + "add $0x20,%0 \n" + "vpunpcklbw %%ymm2,%%ymm0,%%ymm1 \n" + "vpunpckhbw %%ymm2,%%ymm0,%%ymm2 \n" + "vextractf128 $0x0,%%ymm1,(%3) \n" + "vextractf128 $0x0,%%ymm2,0x10(%3) \n" + "vextractf128 $0x1,%%ymm1,0x20(%3) \n" + "vextractf128 $0x1,%%ymm2,0x30(%3) \n" + "lea 0x40(%3),%3 \n" + "sub $0x20,%4 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_yuy2), // %3 + "+rm"(width) // %4 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); +} +#endif // HAS_I422TOYUY2ROW_AVX2 + +#ifdef HAS_I422TOUYVYROW_AVX2 +void I422ToUYVYRow_AVX2(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uyvy, + int width) { + asm volatile( + + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "vpmovzxbw (%1),%%ymm1 \n" + "vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n" + "add $0x10,%1 \n" + "vpsllw $0x8,%%ymm2,%%ymm2 \n" + "vpor %%ymm1,%%ymm2,%%ymm2 \n" + "vmovdqu (%0),%%ymm0 \n" + "add $0x20,%0 
\n" + "vpunpcklbw %%ymm0,%%ymm2,%%ymm1 \n" + "vpunpckhbw %%ymm0,%%ymm2,%%ymm2 \n" + "vextractf128 $0x0,%%ymm1,(%3) \n" + "vextractf128 $0x0,%%ymm2,0x10(%3) \n" + "vextractf128 $0x1,%%ymm1,0x20(%3) \n" + "vextractf128 $0x1,%%ymm2,0x30(%3) \n" + "lea 0x40(%3),%3 \n" + "sub $0x20,%4 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_uyvy), // %3 + "+rm"(width) // %4 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); +} +#endif // HAS_I422TOUYVYROW_AVX2 + +#ifdef HAS_ARGBPOLYNOMIALROW_SSE2 +void ARGBPolynomialRow_SSE2(const uint8_t* src_argb, + uint8_t* dst_argb, + const float* poly, + int width) { + asm volatile( + + "pxor %%xmm3,%%xmm3 \n" + + // 2 pixel loop. + LABELALIGN + "1: \n" + "movq (%0),%%xmm0 \n" + "lea 0x8(%0),%0 \n" + "punpcklbw %%xmm3,%%xmm0 \n" + "movdqa %%xmm0,%%xmm4 \n" + "punpcklwd %%xmm3,%%xmm0 \n" + "punpckhwd %%xmm3,%%xmm4 \n" + "cvtdq2ps %%xmm0,%%xmm0 \n" + "cvtdq2ps %%xmm4,%%xmm4 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm4,%%xmm5 \n" + "mulps 0x10(%3),%%xmm0 \n" + "mulps 0x10(%3),%%xmm4 \n" + "addps (%3),%%xmm0 \n" + "addps (%3),%%xmm4 \n" + "movdqa %%xmm1,%%xmm2 \n" + "movdqa %%xmm5,%%xmm6 \n" + "mulps %%xmm1,%%xmm2 \n" + "mulps %%xmm5,%%xmm6 \n" + "mulps %%xmm2,%%xmm1 \n" + "mulps %%xmm6,%%xmm5 \n" + "mulps 0x20(%3),%%xmm2 \n" + "mulps 0x20(%3),%%xmm6 \n" + "mulps 0x30(%3),%%xmm1 \n" + "mulps 0x30(%3),%%xmm5 \n" + "addps %%xmm2,%%xmm0 \n" + "addps %%xmm6,%%xmm4 \n" + "addps %%xmm1,%%xmm0 \n" + "addps %%xmm5,%%xmm4 \n" + "cvttps2dq %%xmm0,%%xmm0 \n" + "cvttps2dq %%xmm4,%%xmm4 \n" + "packuswb %%xmm4,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x2,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(poly) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); +} +#endif // HAS_ARGBPOLYNOMIALROW_SSE2 + +#ifdef HAS_ARGBPOLYNOMIALROW_AVX2 +void ARGBPolynomialRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_argb, + const float* poly, + int width) { + asm volatile( + "vbroadcastf128 (%3),%%ymm4 \n" + "vbroadcastf128 0x10(%3),%%ymm5 \n" + "vbroadcastf128 0x20(%3),%%ymm6 \n" + "vbroadcastf128 0x30(%3),%%ymm7 \n" + + // 2 pixel loop. + LABELALIGN + "1: \n" + "vpmovzxbd (%0),%%ymm0 \n" // 2 ARGB pixels + "lea 0x8(%0),%0 \n" + "vcvtdq2ps %%ymm0,%%ymm0 \n" // X 8 floats + "vmulps %%ymm0,%%ymm0,%%ymm2 \n" // X * X + "vmulps %%ymm7,%%ymm0,%%ymm3 \n" // C3 * X + "vfmadd132ps %%ymm5,%%ymm4,%%ymm0 \n" // result = C0 + C1 * X + "vfmadd231ps %%ymm6,%%ymm2,%%ymm0 \n" // result += C2 * X * X + "vfmadd231ps %%ymm3,%%ymm2,%%ymm0 \n" // result += C3 * X * X * + // X + "vcvttps2dq %%ymm0,%%ymm0 \n" + "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpackuswb %%xmm0,%%xmm0,%%xmm0 \n" + "vmovq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x2,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(poly) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} +#endif // HAS_ARGBPOLYNOMIALROW_AVX2 + +#ifdef HAS_HALFFLOATROW_SSE2 +static float kScaleBias = 1.9259299444e-34f; +void HalfFloatRow_SSE2(const uint16_t* src, + uint16_t* dst, + float scale, + int width) { + scale *= kScaleBias; + asm volatile( + "movd %3,%%xmm4 \n" + "pshufd $0x0,%%xmm4,%%xmm4 \n" + "pxor %%xmm5,%%xmm5 \n" + "sub %0,%1 \n" + + // 16 pixel loop. 
+ LABELALIGN + "1: \n" + "movdqu (%0),%%xmm2 \n" // 8 shorts + "add $0x10,%0 \n" + "movdqa %%xmm2,%%xmm3 \n" + "punpcklwd %%xmm5,%%xmm2 \n" // 8 ints in xmm2/1 + "cvtdq2ps %%xmm2,%%xmm2 \n" // 8 floats + "punpckhwd %%xmm5,%%xmm3 \n" + "cvtdq2ps %%xmm3,%%xmm3 \n" + "mulps %%xmm4,%%xmm2 \n" + "mulps %%xmm4,%%xmm3 \n" + "psrld $0xd,%%xmm2 \n" + "psrld $0xd,%%xmm3 \n" + "packssdw %%xmm3,%%xmm2 \n" + "movdqu %%xmm2,-0x10(%0,%1,1) \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "m"(scale) // %3 + : "memory", "cc", "xmm2", "xmm3", "xmm4", "xmm5"); +} +#endif // HAS_HALFFLOATROW_SSE2 + +#ifdef HAS_HALFFLOATROW_AVX2 +void HalfFloatRow_AVX2(const uint16_t* src, + uint16_t* dst, + float scale, + int width) { + scale *= kScaleBias; + asm volatile( + "vbroadcastss %3, %%ymm4 \n" + "vpxor %%ymm5,%%ymm5,%%ymm5 \n" + "sub %0,%1 \n" + + // 16 pixel loop. + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm2 \n" // 16 shorts + "add $0x20,%0 \n" + "vpunpckhwd %%ymm5,%%ymm2,%%ymm3 \n" // mutates + "vpunpcklwd %%ymm5,%%ymm2,%%ymm2 \n" + "vcvtdq2ps %%ymm3,%%ymm3 \n" + "vcvtdq2ps %%ymm2,%%ymm2 \n" + "vmulps %%ymm3,%%ymm4,%%ymm3 \n" + "vmulps %%ymm2,%%ymm4,%%ymm2 \n" + "vpsrld $0xd,%%ymm3,%%ymm3 \n" + "vpsrld $0xd,%%ymm2,%%ymm2 \n" + "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n" // unmutates + "vmovdqu %%ymm2,-0x20(%0,%1,1) \n" + "sub $0x10,%2 \n" + "jg 1b \n" + + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 +#if defined(__x86_64__) + : "x"(scale) // %3 +#else + : "m"(scale) // %3 +#endif + : "memory", "cc", "xmm2", "xmm3", "xmm4", "xmm5"); +} +#endif // HAS_HALFFLOATROW_AVX2 + +#ifdef HAS_HALFFLOATROW_F16C +void HalfFloatRow_F16C(const uint16_t* src, + uint16_t* dst, + float scale, + int width) { + asm volatile( + "vbroadcastss %3, %%ymm4 \n" + "sub %0,%1 \n" + + // 16 pixel loop. + LABELALIGN + "1: \n" + "vpmovzxwd (%0),%%ymm2 \n" // 16 shorts -> 16 ints + "vpmovzxwd 0x10(%0),%%ymm3 \n" + "vcvtdq2ps %%ymm2,%%ymm2 \n" + "vcvtdq2ps %%ymm3,%%ymm3 \n" + "vmulps %%ymm2,%%ymm4,%%ymm2 \n" + "vmulps %%ymm3,%%ymm4,%%ymm3 \n" + "vcvtps2ph $3, %%ymm2, %%xmm2 \n" + "vcvtps2ph $3, %%ymm3, %%xmm3 \n" + "vmovdqu %%xmm2,0x00(%0,%1,1) \n" + "vmovdqu %%xmm3,0x10(%0,%1,1) \n" + "add $0x20,%0 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 +#if defined(__x86_64__) + : "x"(scale) // %3 +#else + : "m"(scale) // %3 +#endif + : "memory", "cc", "xmm2", "xmm3", "xmm4"); +} +#endif // HAS_HALFFLOATROW_F16C + +#ifdef HAS_HALFFLOATROW_F16C +void HalfFloat1Row_F16C(const uint16_t* src, uint16_t* dst, float, int width) { + asm volatile( + "sub %0,%1 \n" + // 16 pixel loop. + LABELALIGN + "1: \n" + "vpmovzxwd (%0),%%ymm2 \n" // 16 shorts -> 16 ints + "vpmovzxwd 0x10(%0),%%ymm3 \n" + "vcvtdq2ps %%ymm2,%%ymm2 \n" + "vcvtdq2ps %%ymm3,%%ymm3 \n" + "vcvtps2ph $3, %%ymm2, %%xmm2 \n" + "vcvtps2ph $3, %%ymm3, %%xmm3 \n" + "vmovdqu %%xmm2,0x00(%0,%1,1) \n" + "vmovdqu %%xmm3,0x10(%0,%1,1) \n" + "add $0x20,%0 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "xmm2", "xmm3"); +} +#endif // HAS_HALFFLOATROW_F16C + +#ifdef HAS_ARGBCOLORTABLEROW_X86 +// Tranform ARGB pixels with color table. +void ARGBColorTableRow_X86(uint8_t* dst_argb, + const uint8_t* table_argb, + int width) { + uintptr_t pixel_temp; + asm volatile( + // 1 pixel loop. 
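+      // Each channel indexes its own interleaved slice of the table:
+      // B = table[B*4+0], G = table[G*4+1], R = table[R*4+2], A = table[A*4+3].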
+ LABELALIGN + "1: \n" + "movzb (%0),%1 \n" + "lea 0x4(%0),%0 \n" + "movzb 0x00(%3,%1,4),%1 \n" + "mov %b1,-0x4(%0) \n" + "movzb -0x3(%0),%1 \n" + "movzb 0x01(%3,%1,4),%1 \n" + "mov %b1,-0x3(%0) \n" + "movzb -0x2(%0),%1 \n" + "movzb 0x02(%3,%1,4),%1 \n" + "mov %b1,-0x2(%0) \n" + "movzb -0x1(%0),%1 \n" + "movzb 0x03(%3,%1,4),%1 \n" + "mov %b1,-0x1(%0) \n" + "dec %2 \n" + "jg 1b \n" + : "+r"(dst_argb), // %0 + "=&d"(pixel_temp), // %1 + "+r"(width) // %2 + : "r"(table_argb) // %3 + : "memory", "cc"); +} +#endif // HAS_ARGBCOLORTABLEROW_X86 + +#ifdef HAS_RGBCOLORTABLEROW_X86 +// Tranform RGB pixels with color table. +void RGBColorTableRow_X86(uint8_t* dst_argb, + const uint8_t* table_argb, + int width) { + uintptr_t pixel_temp; + asm volatile( + // 1 pixel loop. + LABELALIGN + "1: \n" + "movzb (%0),%1 \n" + "lea 0x4(%0),%0 \n" + "movzb 0x00(%3,%1,4),%1 \n" + "mov %b1,-0x4(%0) \n" + "movzb -0x3(%0),%1 \n" + "movzb 0x01(%3,%1,4),%1 \n" + "mov %b1,-0x3(%0) \n" + "movzb -0x2(%0),%1 \n" + "movzb 0x02(%3,%1,4),%1 \n" + "mov %b1,-0x2(%0) \n" + "dec %2 \n" + "jg 1b \n" + : "+r"(dst_argb), // %0 + "=&d"(pixel_temp), // %1 + "+r"(width) // %2 + : "r"(table_argb) // %3 + : "memory", "cc"); +} +#endif // HAS_RGBCOLORTABLEROW_X86 + +#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3 +// Tranform RGB pixels with luma table. +void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_argb, + int width, + const uint8_t* luma, + uint32_t lumacoeff) { + uintptr_t pixel_temp; + uintptr_t table_temp; + asm volatile( + "movd %6,%%xmm3 \n" + "pshufd $0x0,%%xmm3,%%xmm3 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "psllw $0x8,%%xmm4 \n" + "pxor %%xmm5,%%xmm5 \n" + + // 4 pixel loop. + LABELALIGN + "1: \n" + "movdqu (%2),%%xmm0 \n" + "pmaddubsw %%xmm3,%%xmm0 \n" + "phaddw %%xmm0,%%xmm0 \n" + "pand %%xmm4,%%xmm0 \n" + "punpcklwd %%xmm5,%%xmm0 \n" + "movd %%xmm0,%k1 \n" // 32 bit offset + "add %5,%1 \n" + "pshufd $0x39,%%xmm0,%%xmm0 \n" + + "movzb (%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,(%3) \n" + "movzb 0x1(%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,0x1(%3) \n" + "movzb 0x2(%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,0x2(%3) \n" + "movzb 0x3(%2),%0 \n" + "mov %b0,0x3(%3) \n" + + "movd %%xmm0,%k1 \n" // 32 bit offset + "add %5,%1 \n" + "pshufd $0x39,%%xmm0,%%xmm0 \n" + + "movzb 0x4(%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,0x4(%3) \n" + "movzb 0x5(%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,0x5(%3) \n" + "movzb 0x6(%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,0x6(%3) \n" + "movzb 0x7(%2),%0 \n" + "mov %b0,0x7(%3) \n" + + "movd %%xmm0,%k1 \n" // 32 bit offset + "add %5,%1 \n" + "pshufd $0x39,%%xmm0,%%xmm0 \n" + + "movzb 0x8(%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,0x8(%3) \n" + "movzb 0x9(%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,0x9(%3) \n" + "movzb 0xa(%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,0xa(%3) \n" + "movzb 0xb(%2),%0 \n" + "mov %b0,0xb(%3) \n" + + "movd %%xmm0,%k1 \n" // 32 bit offset + "add %5,%1 \n" + + "movzb 0xc(%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,0xc(%3) \n" + "movzb 0xd(%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,0xd(%3) \n" + "movzb 0xe(%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,0xe(%3) \n" + "movzb 0xf(%2),%0 \n" + "mov %b0,0xf(%3) \n" + "lea 0x10(%2),%2 \n" + "lea 0x10(%3),%3 \n" + "sub $0x4,%4 \n" + "jg 1b \n" + : "=&d"(pixel_temp), // %0 + "=&a"(table_temp), // %1 + "+r"(src_argb), // %2 + "+r"(dst_argb), // %3 + "+rm"(width) // %4 + : "r"(luma), // %5 + "rm"(lumacoeff) // %6 + : "memory", 
"cc", "xmm0", "xmm3", "xmm4", "xmm5"); +} +#endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 + +#endif // defined(__x86_64__) || defined(__i386__) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/media/libyuv/libyuv/source/row_msa.cc b/media/libyuv/libyuv/source/row_msa.cc new file mode 100644 index 0000000000..66666cefcd --- /dev/null +++ b/media/libyuv/libyuv/source/row_msa.cc @@ -0,0 +1,3512 @@ +/* + * Copyright 2016 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <string.h> + +#include "libyuv/row.h" + +// This module is for GCC MSA +#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) +#include "libyuv/macros_msa.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#define ALPHA_VAL (-1) + +// Fill YUV -> RGB conversion constants into vectors +#define YUVTORGB_SETUP(yuvconst, ub, vr, ug, vg, bb, bg, br, yg) \ + { \ + ub = __msa_fill_w(yuvconst->kUVToB[0]); \ + vr = __msa_fill_w(yuvconst->kUVToR[1]); \ + ug = __msa_fill_w(yuvconst->kUVToG[0]); \ + vg = __msa_fill_w(yuvconst->kUVToG[1]); \ + bb = __msa_fill_w(yuvconst->kUVBiasB[0]); \ + bg = __msa_fill_w(yuvconst->kUVBiasG[0]); \ + br = __msa_fill_w(yuvconst->kUVBiasR[0]); \ + yg = __msa_fill_w(yuvconst->kYToRgb[0]); \ + } + +// Load YUV 422 pixel data +#define READYUV422(psrc_y, psrc_u, psrc_v, out_y, out_u, out_v) \ + { \ + uint64_t y_m; \ + uint32_t u_m, v_m; \ + v4i32 zero_m = {0}; \ + y_m = LD(psrc_y); \ + u_m = LW(psrc_u); \ + v_m = LW(psrc_v); \ + out_y = (v16u8)__msa_insert_d((v2i64)zero_m, 0, (int64_t)y_m); \ + out_u = (v16u8)__msa_insert_w(zero_m, 0, (int32_t)u_m); \ + out_v = (v16u8)__msa_insert_w(zero_m, 0, (int32_t)v_m); \ + } + +// Clip input vector elements between 0 to 255 +#define CLIP_0TO255(in0, in1, in2, in3, in4, in5) \ + { \ + v4i32 max_m = __msa_ldi_w(0xFF); \ + \ + in0 = __msa_maxi_s_w(in0, 0); \ + in1 = __msa_maxi_s_w(in1, 0); \ + in2 = __msa_maxi_s_w(in2, 0); \ + in3 = __msa_maxi_s_w(in3, 0); \ + in4 = __msa_maxi_s_w(in4, 0); \ + in5 = __msa_maxi_s_w(in5, 0); \ + in0 = __msa_min_s_w(max_m, in0); \ + in1 = __msa_min_s_w(max_m, in1); \ + in2 = __msa_min_s_w(max_m, in2); \ + in3 = __msa_min_s_w(max_m, in3); \ + in4 = __msa_min_s_w(max_m, in4); \ + in5 = __msa_min_s_w(max_m, in5); \ + } + +// Convert 8 pixels of YUV 420 to RGB. 
+#define YUVTORGB(in_y, in_uv, ubvr, ugvg, bb, bg, br, yg, out_b, out_g, out_r) \ + { \ + v8i16 vec0_m, vec1_m; \ + v4i32 reg0_m, reg1_m, reg2_m, reg3_m, reg4_m; \ + v4i32 reg5_m, reg6_m, reg7_m; \ + v16i8 zero_m = {0}; \ + \ + vec0_m = (v8i16)__msa_ilvr_b((v16i8)in_y, (v16i8)in_y); \ + vec1_m = (v8i16)__msa_ilvr_b((v16i8)zero_m, (v16i8)in_uv); \ + reg0_m = (v4i32)__msa_ilvr_h((v8i16)zero_m, (v8i16)vec0_m); \ + reg1_m = (v4i32)__msa_ilvl_h((v8i16)zero_m, (v8i16)vec0_m); \ + reg2_m = (v4i32)__msa_ilvr_h((v8i16)zero_m, (v8i16)vec1_m); \ + reg3_m = (v4i32)__msa_ilvl_h((v8i16)zero_m, (v8i16)vec1_m); \ + reg0_m *= yg; \ + reg1_m *= yg; \ + reg2_m *= ubvr; \ + reg3_m *= ubvr; \ + reg0_m = __msa_srai_w(reg0_m, 16); \ + reg1_m = __msa_srai_w(reg1_m, 16); \ + reg4_m = __msa_dotp_s_w((v8i16)vec1_m, (v8i16)ugvg); \ + reg5_m = __msa_ilvev_w(reg2_m, reg2_m); \ + reg6_m = __msa_ilvev_w(reg3_m, reg3_m); \ + reg7_m = __msa_ilvr_w(reg4_m, reg4_m); \ + reg2_m = __msa_ilvod_w(reg2_m, reg2_m); \ + reg3_m = __msa_ilvod_w(reg3_m, reg3_m); \ + reg4_m = __msa_ilvl_w(reg4_m, reg4_m); \ + reg5_m = reg0_m - reg5_m; \ + reg6_m = reg1_m - reg6_m; \ + reg2_m = reg0_m - reg2_m; \ + reg3_m = reg1_m - reg3_m; \ + reg7_m = reg0_m - reg7_m; \ + reg4_m = reg1_m - reg4_m; \ + reg5_m += bb; \ + reg6_m += bb; \ + reg7_m += bg; \ + reg4_m += bg; \ + reg2_m += br; \ + reg3_m += br; \ + reg5_m = __msa_srai_w(reg5_m, 6); \ + reg6_m = __msa_srai_w(reg6_m, 6); \ + reg7_m = __msa_srai_w(reg7_m, 6); \ + reg4_m = __msa_srai_w(reg4_m, 6); \ + reg2_m = __msa_srai_w(reg2_m, 6); \ + reg3_m = __msa_srai_w(reg3_m, 6); \ + CLIP_0TO255(reg5_m, reg6_m, reg7_m, reg4_m, reg2_m, reg3_m); \ + out_b = __msa_pckev_h((v8i16)reg6_m, (v8i16)reg5_m); \ + out_g = __msa_pckev_h((v8i16)reg4_m, (v8i16)reg7_m); \ + out_r = __msa_pckev_h((v8i16)reg3_m, (v8i16)reg2_m); \ + } + +// Pack and Store 8 ARGB values. +#define STOREARGB(in0, in1, in2, in3, pdst_argb) \ + { \ + v8i16 vec0_m, vec1_m; \ + v16u8 dst0_m, dst1_m; \ + vec0_m = (v8i16)__msa_ilvev_b((v16i8)in1, (v16i8)in0); \ + vec1_m = (v8i16)__msa_ilvev_b((v16i8)in3, (v16i8)in2); \ + dst0_m = (v16u8)__msa_ilvr_h(vec1_m, vec0_m); \ + dst1_m = (v16u8)__msa_ilvl_h(vec1_m, vec0_m); \ + ST_UB2(dst0_m, dst1_m, pdst_argb, 16); \ + } + +// Takes ARGB input and calculates Y. 
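Editorial note: the YUVTORGB and STOREARGB macros above follow libyuv's usual fixed-point layout. The Y sample is replicated into a 16-bit value and scaled (the multiply is taken >> 16, matching the srai.w by 16 in the macro), the U/V contributions are subtracted, the pre-folded bias terms bb/bg/br are added, and each channel is shifted right by 6 and clamped to [0, 255] before being interleaved with alpha. A minimal scalar model of that structure, with the coefficients left as parameters (the names YuvPixelModel and Clamp255 are illustrative only, not libyuv API, and the constants are whatever the YuvConstants table supplies):

#include <stdint.h>

static inline uint8_t Clamp255(int32_t v) {
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

// ub/ug/vg/vr are UV weights, bb/bg/br are pre-biased offsets, and yg scales
// the replicated Y sample; all values here are placeholders.
static void YuvPixelModel(uint8_t y, uint8_t u, uint8_t v,
                          int ub, int ug, int vg, int vr,
                          int bb, int bg, int br, int yg,
                          uint8_t* b, uint8_t* g, uint8_t* r) {
  // Replicate the Y byte into a 16-bit value (y * 0x0101), scale, drop 16 bits.
  int32_t y1 = (int32_t)(((uint32_t)y * 0x0101 * (uint32_t)yg) >> 16);
  *b = Clamp255((y1 - u * ub + bb) >> 6);
  *g = Clamp255((y1 - (u * ug + v * vg) + bg) >> 6);
  *r = Clamp255((y1 - v * vr + br) >> 6);
}

The ARGBTOY macro that the comment above introduces goes the other direction, weighting B, G and R with unsigned dot products; the concrete weights appear further down in this file in ARGBToYRow_MSA as 0x19, 0x81 and 0x42 with a 0x1080 bias. A scalar sketch of the same arithmetic (again with an illustrative name, and assuming libyuv's B, G, R, A byte order in memory):

static void ArgbToYScalar(const uint8_t* src_argb, uint8_t* dst_y, int width) {
  for (int x = 0; x < width; ++x) {
    uint32_t b = src_argb[0];  // bytes are B, G, R, A in memory
    uint32_t g = src_argb[1];
    uint32_t r = src_argb[2];
    // 25, 129 and 66 are 0x19, 0x81 and 0x42 in decimal; 0x1080 folds the
    // +16 luma offset and the rounding term into one constant before >> 8.
    dst_y[x] = (uint8_t)((25 * b + 129 * g + 66 * r + 0x1080) >> 8);
    src_argb += 4;
  }
}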
+#define ARGBTOY(argb0, argb1, argb2, argb3, const0, const1, const2, shift, \ + y_out) \ + { \ + v16u8 vec0_m, vec1_m, vec2_m, vec3_m; \ + v8u16 reg0_m, reg1_m; \ + \ + vec0_m = (v16u8)__msa_pckev_h((v8i16)argb1, (v8i16)argb0); \ + vec1_m = (v16u8)__msa_pckev_h((v8i16)argb3, (v8i16)argb2); \ + vec2_m = (v16u8)__msa_pckod_h((v8i16)argb1, (v8i16)argb0); \ + vec3_m = (v16u8)__msa_pckod_h((v8i16)argb3, (v8i16)argb2); \ + reg0_m = __msa_dotp_u_h(vec0_m, const0); \ + reg1_m = __msa_dotp_u_h(vec1_m, const0); \ + reg0_m = __msa_dpadd_u_h(reg0_m, vec2_m, const1); \ + reg1_m = __msa_dpadd_u_h(reg1_m, vec3_m, const1); \ + reg0_m += const2; \ + reg1_m += const2; \ + reg0_m = (v8u16)__msa_srai_h((v8i16)reg0_m, shift); \ + reg1_m = (v8u16)__msa_srai_h((v8i16)reg1_m, shift); \ + y_out = (v16u8)__msa_pckev_b((v16i8)reg1_m, (v16i8)reg0_m); \ + } + +// Loads current and next row of ARGB input and averages it to calculate U and V +#define READ_ARGB(s_ptr, t_ptr, argb0, argb1, argb2, argb3) \ + { \ + v16u8 src0_m, src1_m, src2_m, src3_m, src4_m, src5_m, src6_m, src7_m; \ + v16u8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ + v16u8 vec8_m, vec9_m; \ + v8u16 reg0_m, reg1_m, reg2_m, reg3_m, reg4_m, reg5_m, reg6_m, reg7_m; \ + v8u16 reg8_m, reg9_m; \ + \ + src0_m = (v16u8)__msa_ld_b((v16i8*)s, 0); \ + src1_m = (v16u8)__msa_ld_b((v16i8*)s, 16); \ + src2_m = (v16u8)__msa_ld_b((v16i8*)s, 32); \ + src3_m = (v16u8)__msa_ld_b((v16i8*)s, 48); \ + src4_m = (v16u8)__msa_ld_b((v16i8*)t, 0); \ + src5_m = (v16u8)__msa_ld_b((v16i8*)t, 16); \ + src6_m = (v16u8)__msa_ld_b((v16i8*)t, 32); \ + src7_m = (v16u8)__msa_ld_b((v16i8*)t, 48); \ + vec0_m = (v16u8)__msa_ilvr_b((v16i8)src0_m, (v16i8)src4_m); \ + vec1_m = (v16u8)__msa_ilvr_b((v16i8)src1_m, (v16i8)src5_m); \ + vec2_m = (v16u8)__msa_ilvr_b((v16i8)src2_m, (v16i8)src6_m); \ + vec3_m = (v16u8)__msa_ilvr_b((v16i8)src3_m, (v16i8)src7_m); \ + vec4_m = (v16u8)__msa_ilvl_b((v16i8)src0_m, (v16i8)src4_m); \ + vec5_m = (v16u8)__msa_ilvl_b((v16i8)src1_m, (v16i8)src5_m); \ + vec6_m = (v16u8)__msa_ilvl_b((v16i8)src2_m, (v16i8)src6_m); \ + vec7_m = (v16u8)__msa_ilvl_b((v16i8)src3_m, (v16i8)src7_m); \ + reg0_m = __msa_hadd_u_h(vec0_m, vec0_m); \ + reg1_m = __msa_hadd_u_h(vec1_m, vec1_m); \ + reg2_m = __msa_hadd_u_h(vec2_m, vec2_m); \ + reg3_m = __msa_hadd_u_h(vec3_m, vec3_m); \ + reg4_m = __msa_hadd_u_h(vec4_m, vec4_m); \ + reg5_m = __msa_hadd_u_h(vec5_m, vec5_m); \ + reg6_m = __msa_hadd_u_h(vec6_m, vec6_m); \ + reg7_m = __msa_hadd_u_h(vec7_m, vec7_m); \ + reg8_m = (v8u16)__msa_pckev_d((v2i64)reg4_m, (v2i64)reg0_m); \ + reg9_m = (v8u16)__msa_pckev_d((v2i64)reg5_m, (v2i64)reg1_m); \ + reg8_m += (v8u16)__msa_pckod_d((v2i64)reg4_m, (v2i64)reg0_m); \ + reg9_m += (v8u16)__msa_pckod_d((v2i64)reg5_m, (v2i64)reg1_m); \ + reg0_m = (v8u16)__msa_pckev_d((v2i64)reg6_m, (v2i64)reg2_m); \ + reg1_m = (v8u16)__msa_pckev_d((v2i64)reg7_m, (v2i64)reg3_m); \ + reg0_m += (v8u16)__msa_pckod_d((v2i64)reg6_m, (v2i64)reg2_m); \ + reg1_m += (v8u16)__msa_pckod_d((v2i64)reg7_m, (v2i64)reg3_m); \ + reg8_m = (v8u16)__msa_srai_h((v8i16)reg8_m, 2); \ + reg9_m = (v8u16)__msa_srai_h((v8i16)reg9_m, 2); \ + reg0_m = (v8u16)__msa_srai_h((v8i16)reg0_m, 2); \ + reg1_m = (v8u16)__msa_srai_h((v8i16)reg1_m, 2); \ + argb0 = (v16u8)__msa_pckev_b((v16i8)reg9_m, (v16i8)reg8_m); \ + argb1 = (v16u8)__msa_pckev_b((v16i8)reg1_m, (v16i8)reg0_m); \ + src0_m = (v16u8)__msa_ld_b((v16i8*)s, 64); \ + src1_m = (v16u8)__msa_ld_b((v16i8*)s, 80); \ + src2_m = (v16u8)__msa_ld_b((v16i8*)s, 96); \ + src3_m = 
(v16u8)__msa_ld_b((v16i8*)s, 112); \ + src4_m = (v16u8)__msa_ld_b((v16i8*)t, 64); \ + src5_m = (v16u8)__msa_ld_b((v16i8*)t, 80); \ + src6_m = (v16u8)__msa_ld_b((v16i8*)t, 96); \ + src7_m = (v16u8)__msa_ld_b((v16i8*)t, 112); \ + vec2_m = (v16u8)__msa_ilvr_b((v16i8)src0_m, (v16i8)src4_m); \ + vec3_m = (v16u8)__msa_ilvr_b((v16i8)src1_m, (v16i8)src5_m); \ + vec4_m = (v16u8)__msa_ilvr_b((v16i8)src2_m, (v16i8)src6_m); \ + vec5_m = (v16u8)__msa_ilvr_b((v16i8)src3_m, (v16i8)src7_m); \ + vec6_m = (v16u8)__msa_ilvl_b((v16i8)src0_m, (v16i8)src4_m); \ + vec7_m = (v16u8)__msa_ilvl_b((v16i8)src1_m, (v16i8)src5_m); \ + vec8_m = (v16u8)__msa_ilvl_b((v16i8)src2_m, (v16i8)src6_m); \ + vec9_m = (v16u8)__msa_ilvl_b((v16i8)src3_m, (v16i8)src7_m); \ + reg0_m = __msa_hadd_u_h(vec2_m, vec2_m); \ + reg1_m = __msa_hadd_u_h(vec3_m, vec3_m); \ + reg2_m = __msa_hadd_u_h(vec4_m, vec4_m); \ + reg3_m = __msa_hadd_u_h(vec5_m, vec5_m); \ + reg4_m = __msa_hadd_u_h(vec6_m, vec6_m); \ + reg5_m = __msa_hadd_u_h(vec7_m, vec7_m); \ + reg6_m = __msa_hadd_u_h(vec8_m, vec8_m); \ + reg7_m = __msa_hadd_u_h(vec9_m, vec9_m); \ + reg8_m = (v8u16)__msa_pckev_d((v2i64)reg4_m, (v2i64)reg0_m); \ + reg9_m = (v8u16)__msa_pckev_d((v2i64)reg5_m, (v2i64)reg1_m); \ + reg8_m += (v8u16)__msa_pckod_d((v2i64)reg4_m, (v2i64)reg0_m); \ + reg9_m += (v8u16)__msa_pckod_d((v2i64)reg5_m, (v2i64)reg1_m); \ + reg0_m = (v8u16)__msa_pckev_d((v2i64)reg6_m, (v2i64)reg2_m); \ + reg1_m = (v8u16)__msa_pckev_d((v2i64)reg7_m, (v2i64)reg3_m); \ + reg0_m += (v8u16)__msa_pckod_d((v2i64)reg6_m, (v2i64)reg2_m); \ + reg1_m += (v8u16)__msa_pckod_d((v2i64)reg7_m, (v2i64)reg3_m); \ + reg8_m = (v8u16)__msa_srai_h((v8i16)reg8_m, 2); \ + reg9_m = (v8u16)__msa_srai_h((v8i16)reg9_m, 2); \ + reg0_m = (v8u16)__msa_srai_h((v8i16)reg0_m, 2); \ + reg1_m = (v8u16)__msa_srai_h((v8i16)reg1_m, 2); \ + argb2 = (v16u8)__msa_pckev_b((v16i8)reg9_m, (v16i8)reg8_m); \ + argb3 = (v16u8)__msa_pckev_b((v16i8)reg1_m, (v16i8)reg0_m); \ + } + +// Takes ARGB input and calculates U and V. 
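Editorial note: the ARGBTOUV macro below applies the chroma weights to pixel values that have typically been averaged over 2x2 blocks by READ_ARGB above; keeping the odd (high) byte of each 16-bit sum with __msa_pckod_b stands in for the final >> 8 on little-endian builds. The concrete weights show up numerically further down in ARGBToUVRow_MSA: 0x70, 0x4A, 0x26 for U and 0x70, 0x5E, 0x12 for V, both biased by 0x8080. A scalar sketch of that per-sample computation (the helper name is illustrative, and b, g, r stand for the already-averaged channel values):

#include <stdint.h>

static void AveragedBGRToUV(uint32_t b, uint32_t g, uint32_t r,
                            uint8_t* u, uint8_t* v) {
  // 112/74/38 and 112/94/18 are the hex weights above in decimal; 0x8080
  // supplies the 128 chroma offset plus rounding ahead of the >> 8.
  *u = (uint8_t)((112 * b - 74 * g - 38 * r + 0x8080) >> 8);
  *v = (uint8_t)((112 * r - 94 * g - 18 * b + 0x8080) >> 8);
}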
+#define ARGBTOUV(argb0, argb1, argb2, argb3, const0, const1, const2, const3, \ + shf0, shf1, shf2, shf3, v_out, u_out) \ + { \ + v16u8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ + v8u16 reg0_m, reg1_m, reg2_m, reg3_m; \ + \ + vec0_m = (v16u8)__msa_vshf_b(shf0, (v16i8)argb1, (v16i8)argb0); \ + vec1_m = (v16u8)__msa_vshf_b(shf0, (v16i8)argb3, (v16i8)argb2); \ + vec2_m = (v16u8)__msa_vshf_b(shf1, (v16i8)argb1, (v16i8)argb0); \ + vec3_m = (v16u8)__msa_vshf_b(shf1, (v16i8)argb3, (v16i8)argb2); \ + vec4_m = (v16u8)__msa_vshf_b(shf2, (v16i8)argb1, (v16i8)argb0); \ + vec5_m = (v16u8)__msa_vshf_b(shf2, (v16i8)argb3, (v16i8)argb2); \ + vec6_m = (v16u8)__msa_vshf_b(shf3, (v16i8)argb1, (v16i8)argb0); \ + vec7_m = (v16u8)__msa_vshf_b(shf3, (v16i8)argb3, (v16i8)argb2); \ + reg0_m = __msa_dotp_u_h(vec0_m, const1); \ + reg1_m = __msa_dotp_u_h(vec1_m, const1); \ + reg2_m = __msa_dotp_u_h(vec4_m, const1); \ + reg3_m = __msa_dotp_u_h(vec5_m, const1); \ + reg0_m += const3; \ + reg1_m += const3; \ + reg2_m += const3; \ + reg3_m += const3; \ + reg0_m -= __msa_dotp_u_h(vec2_m, const0); \ + reg1_m -= __msa_dotp_u_h(vec3_m, const0); \ + reg2_m -= __msa_dotp_u_h(vec6_m, const2); \ + reg3_m -= __msa_dotp_u_h(vec7_m, const2); \ + v_out = (v16u8)__msa_pckod_b((v16i8)reg1_m, (v16i8)reg0_m); \ + u_out = (v16u8)__msa_pckod_b((v16i8)reg3_m, (v16i8)reg2_m); \ + } + +// Load I444 pixel data +#define READI444(psrc_y, psrc_u, psrc_v, out_y, out_u, out_v) \ + { \ + uint64_t y_m, u_m, v_m; \ + v2i64 zero_m = {0}; \ + y_m = LD(psrc_y); \ + u_m = LD(psrc_u); \ + v_m = LD(psrc_v); \ + out_y = (v16u8)__msa_insert_d(zero_m, 0, (int64_t)y_m); \ + out_u = (v16u8)__msa_insert_d(zero_m, 0, (int64_t)u_m); \ + out_v = (v16u8)__msa_insert_d(zero_m, 0, (int64_t)v_m); \ + } + +void MirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width) { + int x; + v16u8 src0, src1, src2, src3; + v16u8 dst0, dst1, dst2, dst3; + v16i8 shuffler = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}; + src += width - 64; + + for (x = 0; x < width; x += 64) { + LD_UB4(src, 16, src3, src2, src1, src0); + VSHF_B2_UB(src3, src3, src2, src2, shuffler, shuffler, dst3, dst2); + VSHF_B2_UB(src1, src1, src0, src0, shuffler, shuffler, dst1, dst0); + ST_UB4(dst0, dst1, dst2, dst3, dst, 16); + dst += 64; + src -= 64; + } +} + +void ARGBMirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width) { + int x; + v16u8 src0, src1, src2, src3; + v16u8 dst0, dst1, dst2, dst3; + v16i8 shuffler = {12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3}; + src += width * 4 - 64; + + for (x = 0; x < width; x += 16) { + LD_UB4(src, 16, src3, src2, src1, src0); + VSHF_B2_UB(src3, src3, src2, src2, shuffler, shuffler, dst3, dst2); + VSHF_B2_UB(src1, src1, src0, src0, shuffler, shuffler, dst1, dst0); + ST_UB4(dst0, dst1, dst2, dst3, dst, 16); + dst += 64; + src -= 64; + } +} + +void I422ToYUY2Row_MSA(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_yuy2, + int width) { + int x; + v16u8 src_u0, src_v0, src_y0, src_y1, vec_uv0, vec_uv1; + v16u8 dst_yuy2_0, dst_yuy2_1, dst_yuy2_2, dst_yuy2_3; + + for (x = 0; x < width; x += 32) { + src_u0 = LD_UB(src_u); + src_v0 = LD_UB(src_v); + LD_UB2(src_y, 16, src_y0, src_y1); + ILVRL_B2_UB(src_v0, src_u0, vec_uv0, vec_uv1); + ILVRL_B2_UB(vec_uv0, src_y0, dst_yuy2_0, dst_yuy2_1); + ILVRL_B2_UB(vec_uv1, src_y1, dst_yuy2_2, dst_yuy2_3); + ST_UB4(dst_yuy2_0, dst_yuy2_1, dst_yuy2_2, dst_yuy2_3, dst_yuy2, 16); + src_u += 16; + src_v += 16; + src_y += 32; + dst_yuy2 += 64; + } +} + +void 
I422ToUYVYRow_MSA(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uyvy, + int width) { + int x; + v16u8 src_u0, src_v0, src_y0, src_y1, vec_uv0, vec_uv1; + v16u8 dst_uyvy0, dst_uyvy1, dst_uyvy2, dst_uyvy3; + + for (x = 0; x < width; x += 32) { + src_u0 = LD_UB(src_u); + src_v0 = LD_UB(src_v); + LD_UB2(src_y, 16, src_y0, src_y1); + ILVRL_B2_UB(src_v0, src_u0, vec_uv0, vec_uv1); + ILVRL_B2_UB(src_y0, vec_uv0, dst_uyvy0, dst_uyvy1); + ILVRL_B2_UB(src_y1, vec_uv1, dst_uyvy2, dst_uyvy3); + ST_UB4(dst_uyvy0, dst_uyvy1, dst_uyvy2, dst_uyvy3, dst_uyvy, 16); + src_u += 16; + src_v += 16; + src_y += 32; + dst_uyvy += 64; + } +} + +void I422ToARGBRow_MSA(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + int x; + v16u8 src0, src1, src2; + v8i16 vec0, vec1, vec2; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; + v4i32 vec_ubvr, vec_ugvg; + v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, + vec_br, vec_yg); + vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); + vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); + + for (x = 0; x < width; x += 8) { + READYUV422(src_y, src_u, src_v, src0, src1, src2); + src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1); + YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, + vec0, vec1, vec2); + STOREARGB(vec0, vec1, vec2, alpha, dst_argb); + src_y += 8; + src_u += 4; + src_v += 4; + dst_argb += 32; + } +} + +void I422ToRGBARow_MSA(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + int x; + v16u8 src0, src1, src2; + v8i16 vec0, vec1, vec2; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; + v4i32 vec_ubvr, vec_ugvg; + v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, + vec_br, vec_yg); + vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); + vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); + + for (x = 0; x < width; x += 8) { + READYUV422(src_y, src_u, src_v, src0, src1, src2); + src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1); + YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, + vec0, vec1, vec2); + STOREARGB(alpha, vec0, vec1, vec2, dst_argb); + src_y += 8; + src_u += 4; + src_v += 4; + dst_argb += 32; + } +} + +void I422AlphaToARGBRow_MSA(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + const uint8_t* src_a, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + int x; + int64_t data_a; + v16u8 src0, src1, src2, src3; + v8i16 vec0, vec1, vec2; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; + v4i32 vec_ubvr, vec_ugvg; + v4i32 zero = {0}; + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, + vec_br, vec_yg); + vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); + vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); + + for (x = 0; x < width; x += 8) { + data_a = LD(src_a); + READYUV422(src_y, src_u, src_v, src0, src1, src2); + src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1); + src3 = (v16u8)__msa_insert_d((v2i64)zero, 0, data_a); + YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, + vec0, vec1, vec2); + src3 = (v16u8)__msa_ilvr_b((v16i8)src3, (v16i8)src3); + 
STOREARGB(vec0, vec1, vec2, src3, dst_argb); + src_y += 8; + src_u += 4; + src_v += 4; + src_a += 8; + dst_argb += 32; + } +} + +void I422ToRGB24Row_MSA(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int32_t width) { + int x; + int64_t data_u, data_v; + v16u8 src0, src1, src2, src3, src4, dst0, dst1, dst2; + v8i16 vec0, vec1, vec2, vec3, vec4, vec5; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; + v4i32 vec_ubvr, vec_ugvg; + v16u8 reg0, reg1, reg2, reg3; + v2i64 zero = {0}; + v16i8 shuffler0 = {0, 1, 16, 2, 3, 17, 4, 5, 18, 6, 7, 19, 8, 9, 20, 10}; + v16i8 shuffler1 = {0, 21, 1, 2, 22, 3, 4, 23, 5, 6, 24, 7, 8, 25, 9, 10}; + v16i8 shuffler2 = {26, 6, 7, 27, 8, 9, 28, 10, + 11, 29, 12, 13, 30, 14, 15, 31}; + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, + vec_br, vec_yg); + vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); + vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((v16u8*)src_y, 0); + data_u = LD(src_u); + data_v = LD(src_v); + src1 = (v16u8)__msa_insert_d(zero, 0, data_u); + src2 = (v16u8)__msa_insert_d(zero, 0, data_v); + src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1); + src3 = (v16u8)__msa_sldi_b((v16i8)src0, (v16i8)src0, 8); + src4 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src1, 8); + YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, + vec0, vec1, vec2); + YUVTORGB(src3, src4, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, + vec3, vec4, vec5); + reg0 = (v16u8)__msa_ilvev_b((v16i8)vec1, (v16i8)vec0); + reg2 = (v16u8)__msa_ilvev_b((v16i8)vec4, (v16i8)vec3); + reg3 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec2); + reg1 = (v16u8)__msa_sldi_b((v16i8)reg2, (v16i8)reg0, 11); + dst0 = (v16u8)__msa_vshf_b(shuffler0, (v16i8)reg3, (v16i8)reg0); + dst1 = (v16u8)__msa_vshf_b(shuffler1, (v16i8)reg3, (v16i8)reg1); + dst2 = (v16u8)__msa_vshf_b(shuffler2, (v16i8)reg3, (v16i8)reg2); + ST_UB2(dst0, dst1, dst_argb, 16); + ST_UB(dst2, (dst_argb + 32)); + src_y += 16; + src_u += 8; + src_v += 8; + dst_argb += 48; + } +} + +// TODO(fbarchard): Consider AND instead of shift to isolate 5 upper bits of R. +void I422ToRGB565Row_MSA(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb565, + const struct YuvConstants* yuvconstants, + int width) { + int x; + v16u8 src0, src1, src2, dst0; + v8i16 vec0, vec1, vec2; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; + v4i32 vec_ubvr, vec_ugvg; + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, + vec_br, vec_yg); + vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); + vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); + + for (x = 0; x < width; x += 8) { + READYUV422(src_y, src_u, src_v, src0, src1, src2); + src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1); + YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, + vec0, vec2, vec1); + vec0 = __msa_srai_h(vec0, 3); + vec1 = __msa_srai_h(vec1, 3); + vec2 = __msa_srai_h(vec2, 2); + vec1 = __msa_slli_h(vec1, 11); + vec2 = __msa_slli_h(vec2, 5); + vec0 |= vec1; + dst0 = (v16u8)(vec2 | vec0); + ST_UB(dst0, dst_rgb565); + src_y += 8; + src_u += 4; + src_v += 4; + dst_rgb565 += 16; + } +} + +// TODO(fbarchard): Consider AND instead of shift to isolate 4 upper bits of G. 
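Editorial note: the three 16-bit output rows in this stretch (I422ToRGB565Row_MSA above, and the ARGB4444 and ARGB1555 rows that follow) all reduce each clamped 8-bit channel with shifts and then OR the pieces into one halfword, with the 4444 and 1555 variants forcing alpha on via const_0xF000 and const_0x8000. Scalar equivalents of the three packings (helper names are illustrative, not libyuv API):

#include <stdint.h>

static uint16_t PackRGB565(uint8_t r, uint8_t g, uint8_t b) {
  // 5 bits of red in the top bits, 6 bits of green, 5 bits of blue.
  return (uint16_t)(((r >> 3) << 11) | ((g >> 2) << 5) | (b >> 3));
}

static uint16_t PackARGB1555(uint8_t r, uint8_t g, uint8_t b) {
  // 1 bit of alpha (forced on) and 5 bits per color channel.
  return (uint16_t)(0x8000u | ((r >> 3) << 10) | ((g >> 3) << 5) | (b >> 3));
}

static uint16_t PackARGB4444(uint8_t r, uint8_t g, uint8_t b) {
  // 4 bits per channel with alpha hard-wired to 0xF.
  return (uint16_t)(0xF000u | ((r >> 4) << 8) | ((g >> 4) << 4) | (b >> 4));
}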
+void I422ToARGB4444Row_MSA(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb4444, + const struct YuvConstants* yuvconstants, + int width) { + int x; + v16u8 src0, src1, src2, dst0; + v8i16 vec0, vec1, vec2; + v8u16 reg0, reg1, reg2; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; + v4i32 vec_ubvr, vec_ugvg; + v8u16 const_0xF000 = (v8u16)__msa_fill_h(0xF000); + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, + vec_br, vec_yg); + vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); + vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); + + for (x = 0; x < width; x += 8) { + READYUV422(src_y, src_u, src_v, src0, src1, src2); + src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1); + YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, + vec0, vec1, vec2); + reg0 = (v8u16)__msa_srai_h(vec0, 4); + reg1 = (v8u16)__msa_srai_h(vec1, 4); + reg2 = (v8u16)__msa_srai_h(vec2, 4); + reg1 = (v8u16)__msa_slli_h((v8i16)reg1, 4); + reg2 = (v8u16)__msa_slli_h((v8i16)reg2, 8); + reg1 |= const_0xF000; + reg0 |= reg2; + dst0 = (v16u8)(reg1 | reg0); + ST_UB(dst0, dst_argb4444); + src_y += 8; + src_u += 4; + src_v += 4; + dst_argb4444 += 16; + } +} + +void I422ToARGB1555Row_MSA(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb1555, + const struct YuvConstants* yuvconstants, + int width) { + int x; + v16u8 src0, src1, src2, dst0; + v8i16 vec0, vec1, vec2; + v8u16 reg0, reg1, reg2; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; + v4i32 vec_ubvr, vec_ugvg; + v8u16 const_0x8000 = (v8u16)__msa_fill_h(0x8000); + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, + vec_br, vec_yg); + vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); + vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); + + for (x = 0; x < width; x += 8) { + READYUV422(src_y, src_u, src_v, src0, src1, src2); + src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1); + YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, + vec0, vec1, vec2); + reg0 = (v8u16)__msa_srai_h(vec0, 3); + reg1 = (v8u16)__msa_srai_h(vec1, 3); + reg2 = (v8u16)__msa_srai_h(vec2, 3); + reg1 = (v8u16)__msa_slli_h((v8i16)reg1, 5); + reg2 = (v8u16)__msa_slli_h((v8i16)reg2, 10); + reg1 |= const_0x8000; + reg0 |= reg2; + dst0 = (v16u8)(reg1 | reg0); + ST_UB(dst0, dst_argb1555); + src_y += 8; + src_u += 4; + src_v += 4; + dst_argb1555 += 16; + } +} + +void YUY2ToYRow_MSA(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { + int x; + v16u8 src0, src1, src2, src3, dst0, dst1; + + for (x = 0; x < width; x += 32) { + LD_UB4(src_yuy2, 16, src0, src1, src2, src3); + dst0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); + dst1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); + ST_UB2(dst0, dst1, dst_y, 16); + src_yuy2 += 64; + dst_y += 32; + } +} + +void YUY2ToUVRow_MSA(const uint8_t* src_yuy2, + int src_stride_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* src_yuy2_next = src_yuy2 + src_stride_yuy2; + int x; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + v16u8 vec0, vec1, dst0, dst1; + + for (x = 0; x < width; x += 32) { + LD_UB4(src_yuy2, 16, src0, src1, src2, src3); + LD_UB4(src_yuy2_next, 16, src4, src5, src6, src7); + src0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); + src1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); + src2 = (v16u8)__msa_pckod_b((v16i8)src5, (v16i8)src4); + src3 = (v16u8)__msa_pckod_b((v16i8)src7, 
(v16i8)src6); + vec0 = __msa_aver_u_b(src0, src2); + vec1 = __msa_aver_u_b(src1, src3); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + dst1 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0); + ST_UB(dst0, dst_u); + ST_UB(dst1, dst_v); + src_yuy2 += 64; + src_yuy2_next += 64; + dst_u += 16; + dst_v += 16; + } +} + +void YUY2ToUV422Row_MSA(const uint8_t* src_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + v16u8 src0, src1, src2, src3, dst0, dst1; + + for (x = 0; x < width; x += 32) { + LD_UB4(src_yuy2, 16, src0, src1, src2, src3); + src0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); + src1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); + dst0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); + dst1 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); + ST_UB(dst0, dst_u); + ST_UB(dst1, dst_v); + src_yuy2 += 64; + dst_u += 16; + dst_v += 16; + } +} + +void UYVYToYRow_MSA(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { + int x; + v16u8 src0, src1, src2, src3, dst0, dst1; + + for (x = 0; x < width; x += 32) { + LD_UB4(src_uyvy, 16, src0, src1, src2, src3); + dst0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); + dst1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); + ST_UB2(dst0, dst1, dst_y, 16); + src_uyvy += 64; + dst_y += 32; + } +} + +void UYVYToUVRow_MSA(const uint8_t* src_uyvy, + int src_stride_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* src_uyvy_next = src_uyvy + src_stride_uyvy; + int x; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + v16u8 vec0, vec1, dst0, dst1; + + for (x = 0; x < width; x += 32) { + LD_UB4(src_uyvy, 16, src0, src1, src2, src3); + LD_UB4(src_uyvy_next, 16, src4, src5, src6, src7); + src0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); + src1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); + src2 = (v16u8)__msa_pckev_b((v16i8)src5, (v16i8)src4); + src3 = (v16u8)__msa_pckev_b((v16i8)src7, (v16i8)src6); + vec0 = __msa_aver_u_b(src0, src2); + vec1 = __msa_aver_u_b(src1, src3); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + dst1 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0); + ST_UB(dst0, dst_u); + ST_UB(dst1, dst_v); + src_uyvy += 64; + src_uyvy_next += 64; + dst_u += 16; + dst_v += 16; + } +} + +void UYVYToUV422Row_MSA(const uint8_t* src_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + v16u8 src0, src1, src2, src3, dst0, dst1; + + for (x = 0; x < width; x += 32) { + LD_UB4(src_uyvy, 16, src0, src1, src2, src3); + src0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); + src1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); + dst0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); + dst1 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); + ST_UB(dst0, dst_u); + ST_UB(dst1, dst_v); + src_uyvy += 64; + dst_u += 16; + dst_v += 16; + } +} + +void ARGBToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) { + int x; + v16u8 src0, src1, src2, src3, vec0, vec1, vec2, vec3, dst0; + v8u16 reg0, reg1, reg2, reg3, reg4, reg5; + v16i8 zero = {0}; + v8u16 const_0x19 = (v8u16)__msa_ldi_h(0x19); + v8u16 const_0x81 = (v8u16)__msa_ldi_h(0x81); + v8u16 const_0x42 = (v8u16)__msa_ldi_h(0x42); + v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080); + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 0); + src1 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 16); + src2 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 32); + src3 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 48); + vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); + vec1 = 
(v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); + vec2 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); + vec3 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); + reg0 = (v8u16)__msa_ilvev_b(zero, (v16i8)vec0); + reg1 = (v8u16)__msa_ilvev_b(zero, (v16i8)vec1); + reg2 = (v8u16)__msa_ilvev_b(zero, (v16i8)vec2); + reg3 = (v8u16)__msa_ilvev_b(zero, (v16i8)vec3); + reg4 = (v8u16)__msa_ilvod_b(zero, (v16i8)vec0); + reg5 = (v8u16)__msa_ilvod_b(zero, (v16i8)vec1); + reg0 *= const_0x19; + reg1 *= const_0x19; + reg2 *= const_0x81; + reg3 *= const_0x81; + reg4 *= const_0x42; + reg5 *= const_0x42; + reg0 += reg2; + reg1 += reg3; + reg0 += reg4; + reg1 += reg5; + reg0 += const_0x1080; + reg1 += const_0x1080; + reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 8); + reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 8); + dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0); + ST_UB(dst0, dst_y); + src_argb0 += 64; + dst_y += 16; + } +} + +void ARGBToUVRow_MSA(const uint8_t* src_argb0, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + const uint8_t* src_argb0_next = src_argb0 + src_stride_argb; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9; + v8u16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9; + v16u8 dst0, dst1; + v8u16 const_0x70 = (v8u16)__msa_ldi_h(0x70); + v8u16 const_0x4A = (v8u16)__msa_ldi_h(0x4A); + v8u16 const_0x26 = (v8u16)__msa_ldi_h(0x26); + v8u16 const_0x5E = (v8u16)__msa_ldi_h(0x5E); + v8u16 const_0x12 = (v8u16)__msa_ldi_h(0x12); + v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080); + + for (x = 0; x < width; x += 32) { + src0 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 0); + src1 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 16); + src2 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 32); + src3 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 48); + src4 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 64); + src5 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 80); + src6 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 96); + src7 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 112); + vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); + vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); + vec2 = (v16u8)__msa_pckev_b((v16i8)src5, (v16i8)src4); + vec3 = (v16u8)__msa_pckev_b((v16i8)src7, (v16i8)src6); + vec4 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); + vec5 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); + vec6 = (v16u8)__msa_pckod_b((v16i8)src5, (v16i8)src4); + vec7 = (v16u8)__msa_pckod_b((v16i8)src7, (v16i8)src6); + vec8 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + vec9 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); + vec4 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec4); + vec5 = (v16u8)__msa_pckev_b((v16i8)vec7, (v16i8)vec6); + vec0 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0); + vec1 = (v16u8)__msa_pckod_b((v16i8)vec3, (v16i8)vec2); + reg0 = __msa_hadd_u_h(vec8, vec8); + reg1 = __msa_hadd_u_h(vec9, vec9); + reg2 = __msa_hadd_u_h(vec4, vec4); + reg3 = __msa_hadd_u_h(vec5, vec5); + reg4 = __msa_hadd_u_h(vec0, vec0); + reg5 = __msa_hadd_u_h(vec1, vec1); + src0 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 0); + src1 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 16); + src2 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 32); + src3 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 48); + src4 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 64); + src5 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 80); + src6 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 96); + src7 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 112); + vec0 = 
(v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); + vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); + vec2 = (v16u8)__msa_pckev_b((v16i8)src5, (v16i8)src4); + vec3 = (v16u8)__msa_pckev_b((v16i8)src7, (v16i8)src6); + vec4 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); + vec5 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); + vec6 = (v16u8)__msa_pckod_b((v16i8)src5, (v16i8)src4); + vec7 = (v16u8)__msa_pckod_b((v16i8)src7, (v16i8)src6); + vec8 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + vec9 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); + vec4 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec4); + vec5 = (v16u8)__msa_pckev_b((v16i8)vec7, (v16i8)vec6); + vec0 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0); + vec1 = (v16u8)__msa_pckod_b((v16i8)vec3, (v16i8)vec2); + reg0 += __msa_hadd_u_h(vec8, vec8); + reg1 += __msa_hadd_u_h(vec9, vec9); + reg2 += __msa_hadd_u_h(vec4, vec4); + reg3 += __msa_hadd_u_h(vec5, vec5); + reg4 += __msa_hadd_u_h(vec0, vec0); + reg5 += __msa_hadd_u_h(vec1, vec1); + reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 2); + reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 2); + reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 2); + reg3 = (v8u16)__msa_srai_h((v8i16)reg3, 2); + reg4 = (v8u16)__msa_srai_h((v8i16)reg4, 2); + reg5 = (v8u16)__msa_srai_h((v8i16)reg5, 2); + reg6 = reg0 * const_0x70; + reg7 = reg1 * const_0x70; + reg8 = reg2 * const_0x4A; + reg9 = reg3 * const_0x4A; + reg6 += const_0x8080; + reg7 += const_0x8080; + reg8 += reg4 * const_0x26; + reg9 += reg5 * const_0x26; + reg0 *= const_0x12; + reg1 *= const_0x12; + reg2 *= const_0x5E; + reg3 *= const_0x5E; + reg4 *= const_0x70; + reg5 *= const_0x70; + reg2 += reg0; + reg3 += reg1; + reg4 += const_0x8080; + reg5 += const_0x8080; + reg6 -= reg8; + reg7 -= reg9; + reg4 -= reg2; + reg5 -= reg3; + reg6 = (v8u16)__msa_srai_h((v8i16)reg6, 8); + reg7 = (v8u16)__msa_srai_h((v8i16)reg7, 8); + reg4 = (v8u16)__msa_srai_h((v8i16)reg4, 8); + reg5 = (v8u16)__msa_srai_h((v8i16)reg5, 8); + dst0 = (v16u8)__msa_pckev_b((v16i8)reg7, (v16i8)reg6); + dst1 = (v16u8)__msa_pckev_b((v16i8)reg5, (v16i8)reg4); + ST_UB(dst0, dst_u); + ST_UB(dst1, dst_v); + src_argb0 += 128; + src_argb0_next += 128; + dst_u += 16; + dst_v += 16; + } +} + +void ARGBToRGB24Row_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { + int x; + v16u8 src0, src1, src2, src3, dst0, dst1, dst2; + v16i8 shuffler0 = {0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 16, 17, 18, 20}; + v16i8 shuffler1 = {5, 6, 8, 9, 10, 12, 13, 14, + 16, 17, 18, 20, 21, 22, 24, 25}; + v16i8 shuffler2 = {10, 12, 13, 14, 16, 17, 18, 20, + 21, 22, 24, 25, 26, 28, 29, 30}; + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)src_argb, 32); + src3 = (v16u8)__msa_ld_b((v16i8*)src_argb, 48); + dst0 = (v16u8)__msa_vshf_b(shuffler0, (v16i8)src1, (v16i8)src0); + dst1 = (v16u8)__msa_vshf_b(shuffler1, (v16i8)src2, (v16i8)src1); + dst2 = (v16u8)__msa_vshf_b(shuffler2, (v16i8)src3, (v16i8)src2); + ST_UB2(dst0, dst1, dst_rgb, 16); + ST_UB(dst2, (dst_rgb + 32)); + src_argb += 64; + dst_rgb += 48; + } +} + +void ARGBToRAWRow_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { + int x; + v16u8 src0, src1, src2, src3, dst0, dst1, dst2; + v16i8 shuffler0 = {2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, 18, 17, 16, 22}; + v16i8 shuffler1 = {5, 4, 10, 9, 8, 14, 13, 12, + 18, 17, 16, 22, 21, 20, 26, 25}; + v16i8 shuffler2 = {8, 14, 13, 12, 18, 17, 16, 22, + 21, 20, 26, 25, 24, 30, 29, 28}; + + for (x = 0; 
x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)src_argb, 32); + src3 = (v16u8)__msa_ld_b((v16i8*)src_argb, 48); + dst0 = (v16u8)__msa_vshf_b(shuffler0, (v16i8)src1, (v16i8)src0); + dst1 = (v16u8)__msa_vshf_b(shuffler1, (v16i8)src2, (v16i8)src1); + dst2 = (v16u8)__msa_vshf_b(shuffler2, (v16i8)src3, (v16i8)src2); + ST_UB2(dst0, dst1, dst_rgb, 16); + ST_UB(dst2, (dst_rgb + 32)); + src_argb += 64; + dst_rgb += 48; + } +} + +void ARGBToRGB565Row_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { + int x; + v16u8 src0, src1, dst0; + v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v16i8 zero = {0}; + + for (x = 0; x < width; x += 8) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16); + vec0 = (v16u8)__msa_srai_b((v16i8)src0, 3); + vec1 = (v16u8)__msa_slli_b((v16i8)src0, 3); + vec2 = (v16u8)__msa_srai_b((v16i8)src0, 5); + vec4 = (v16u8)__msa_srai_b((v16i8)src1, 3); + vec5 = (v16u8)__msa_slli_b((v16i8)src1, 3); + vec6 = (v16u8)__msa_srai_b((v16i8)src1, 5); + vec1 = (v16u8)__msa_sldi_b(zero, (v16i8)vec1, 1); + vec2 = (v16u8)__msa_sldi_b(zero, (v16i8)vec2, 1); + vec5 = (v16u8)__msa_sldi_b(zero, (v16i8)vec5, 1); + vec6 = (v16u8)__msa_sldi_b(zero, (v16i8)vec6, 1); + vec3 = (v16u8)__msa_sldi_b(zero, (v16i8)src0, 2); + vec7 = (v16u8)__msa_sldi_b(zero, (v16i8)src1, 2); + vec0 = __msa_binsli_b(vec0, vec1, 2); + vec1 = __msa_binsli_b(vec2, vec3, 4); + vec4 = __msa_binsli_b(vec4, vec5, 2); + vec5 = __msa_binsli_b(vec6, vec7, 4); + vec0 = (v16u8)__msa_ilvev_b((v16i8)vec1, (v16i8)vec0); + vec4 = (v16u8)__msa_ilvev_b((v16i8)vec5, (v16i8)vec4); + dst0 = (v16u8)__msa_pckev_h((v8i16)vec4, (v8i16)vec0); + ST_UB(dst0, dst_rgb); + src_argb += 32; + dst_rgb += 16; + } +} + +void ARGBToARGB1555Row_MSA(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width) { + int x; + v16u8 src0, src1, dst0; + v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9; + v16i8 zero = {0}; + + for (x = 0; x < width; x += 8) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16); + vec0 = (v16u8)__msa_srai_b((v16i8)src0, 3); + vec1 = (v16u8)__msa_slli_b((v16i8)src0, 2); + vec2 = (v16u8)__msa_srai_b((v16i8)vec0, 3); + vec1 = (v16u8)__msa_sldi_b(zero, (v16i8)vec1, 1); + vec2 = (v16u8)__msa_sldi_b(zero, (v16i8)vec2, 1); + vec3 = (v16u8)__msa_srai_b((v16i8)src0, 1); + vec5 = (v16u8)__msa_srai_b((v16i8)src1, 3); + vec6 = (v16u8)__msa_slli_b((v16i8)src1, 2); + vec7 = (v16u8)__msa_srai_b((v16i8)vec5, 3); + vec6 = (v16u8)__msa_sldi_b(zero, (v16i8)vec6, 1); + vec7 = (v16u8)__msa_sldi_b(zero, (v16i8)vec7, 1); + vec8 = (v16u8)__msa_srai_b((v16i8)src1, 1); + vec3 = (v16u8)__msa_sldi_b(zero, (v16i8)vec3, 2); + vec8 = (v16u8)__msa_sldi_b(zero, (v16i8)vec8, 2); + vec4 = (v16u8)__msa_sldi_b(zero, (v16i8)src0, 3); + vec9 = (v16u8)__msa_sldi_b(zero, (v16i8)src1, 3); + vec0 = __msa_binsli_b(vec0, vec1, 2); + vec5 = __msa_binsli_b(vec5, vec6, 2); + vec1 = __msa_binsli_b(vec2, vec3, 5); + vec6 = __msa_binsli_b(vec7, vec8, 5); + vec1 = __msa_binsli_b(vec1, vec4, 0); + vec6 = __msa_binsli_b(vec6, vec9, 0); + vec0 = (v16u8)__msa_ilvev_b((v16i8)vec1, (v16i8)vec0); + vec1 = (v16u8)__msa_ilvev_b((v16i8)vec6, (v16i8)vec5); + dst0 = (v16u8)__msa_pckev_h((v8i16)vec1, (v8i16)vec0); + ST_UB(dst0, dst_rgb); + src_argb += 32; + dst_rgb += 16; + } +} + +void ARGBToARGB4444Row_MSA(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width) { + int x; 
+ v16u8 src0, src1; + v16u8 vec0, vec1; + v16u8 dst0; + v16i8 zero = {0}; + + for (x = 0; x < width; x += 8) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16); + vec0 = (v16u8)__msa_srai_b((v16i8)src0, 4); + vec1 = (v16u8)__msa_srai_b((v16i8)src1, 4); + src0 = (v16u8)__msa_sldi_b(zero, (v16i8)src0, 1); + src1 = (v16u8)__msa_sldi_b(zero, (v16i8)src1, 1); + vec0 = __msa_binsli_b(vec0, src0, 3); + vec1 = __msa_binsli_b(vec1, src1, 3); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + ST_UB(dst0, dst_rgb); + src_argb += 32; + dst_rgb += 16; + } +} + +void ARGBToUV444Row_MSA(const uint8_t* src_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int32_t width) { + int32_t x; + v16u8 src0, src1, src2, src3, reg0, reg1, reg2, reg3, dst0, dst1; + v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8u16 vec8, vec9, vec10, vec11; + v8u16 const_112 = (v8u16)__msa_ldi_h(112); + v8u16 const_74 = (v8u16)__msa_ldi_h(74); + v8u16 const_38 = (v8u16)__msa_ldi_h(38); + v8u16 const_94 = (v8u16)__msa_ldi_h(94); + v8u16 const_18 = (v8u16)__msa_ldi_h(18); + v8u16 const_32896 = (v8u16)__msa_fill_h(32896); + v16i8 zero = {0}; + + for (x = width; x > 0; x -= 16) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)src_argb, 32); + src3 = (v16u8)__msa_ld_b((v16i8*)src_argb, 48); + reg0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); + reg1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); + reg2 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); + reg3 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); + src0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0); + src1 = (v16u8)__msa_pckev_b((v16i8)reg3, (v16i8)reg2); + src2 = (v16u8)__msa_pckod_b((v16i8)reg1, (v16i8)reg0); + vec0 = (v8u16)__msa_ilvr_b(zero, (v16i8)src0); + vec1 = (v8u16)__msa_ilvl_b(zero, (v16i8)src0); + vec2 = (v8u16)__msa_ilvr_b(zero, (v16i8)src1); + vec3 = (v8u16)__msa_ilvl_b(zero, (v16i8)src1); + vec4 = (v8u16)__msa_ilvr_b(zero, (v16i8)src2); + vec5 = (v8u16)__msa_ilvl_b(zero, (v16i8)src2); + vec10 = vec0 * const_18; + vec11 = vec1 * const_18; + vec8 = vec2 * const_94; + vec9 = vec3 * const_94; + vec6 = vec4 * const_112; + vec7 = vec5 * const_112; + vec0 *= const_112; + vec1 *= const_112; + vec2 *= const_74; + vec3 *= const_74; + vec4 *= const_38; + vec5 *= const_38; + vec8 += vec10; + vec9 += vec11; + vec6 += const_32896; + vec7 += const_32896; + vec0 += const_32896; + vec1 += const_32896; + vec2 += vec4; + vec3 += vec5; + vec0 -= vec2; + vec1 -= vec3; + vec6 -= vec8; + vec7 -= vec9; + vec0 = (v8u16)__msa_srai_h((v8i16)vec0, 8); + vec1 = (v8u16)__msa_srai_h((v8i16)vec1, 8); + vec6 = (v8u16)__msa_srai_h((v8i16)vec6, 8); + vec7 = (v8u16)__msa_srai_h((v8i16)vec7, 8); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + dst1 = (v16u8)__msa_pckev_b((v16i8)vec7, (v16i8)vec6); + ST_UB(dst0, dst_u); + ST_UB(dst1, dst_v); + src_argb += 64; + dst_u += 16; + dst_v += 16; + } +} + +void ARGBMultiplyRow_MSA(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + int x; + v16u8 src0, src1, dst0; + v8u16 vec0, vec1, vec2, vec3; + v4u32 reg0, reg1, reg2, reg3; + v8i16 zero = {0}; + + for (x = 0; x < width; x += 4) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)src_argb1, 0); + vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0); + vec1 = (v8u16)__msa_ilvl_b((v16i8)src0, (v16i8)src0); + vec2 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src1); + vec3 
= (v8u16)__msa_ilvl_b((v16i8)zero, (v16i8)src1); + reg0 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec0); + reg1 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec0); + reg2 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec1); + reg3 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec1); + reg0 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec2); + reg1 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec2); + reg2 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec3); + reg3 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec3); + reg0 = (v4u32)__msa_srai_w((v4i32)reg0, 16); + reg1 = (v4u32)__msa_srai_w((v4i32)reg1, 16); + reg2 = (v4u32)__msa_srai_w((v4i32)reg2, 16); + reg3 = (v4u32)__msa_srai_w((v4i32)reg3, 16); + vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0); + vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + ST_UB(dst0, dst_argb); + src_argb0 += 16; + src_argb1 += 16; + dst_argb += 16; + } +} + +void ARGBAddRow_MSA(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + int x; + v16u8 src0, src1, src2, src3, dst0, dst1; + + for (x = 0; x < width; x += 8) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)src_argb1, 0); + src3 = (v16u8)__msa_ld_b((v16i8*)src_argb1, 16); + dst0 = __msa_adds_u_b(src0, src2); + dst1 = __msa_adds_u_b(src1, src3); + ST_UB2(dst0, dst1, dst_argb, 16); + src_argb0 += 32; + src_argb1 += 32; + dst_argb += 32; + } +} + +void ARGBSubtractRow_MSA(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + int x; + v16u8 src0, src1, src2, src3, dst0, dst1; + + for (x = 0; x < width; x += 8) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)src_argb1, 0); + src3 = (v16u8)__msa_ld_b((v16i8*)src_argb1, 16); + dst0 = __msa_subs_u_b(src0, src2); + dst1 = __msa_subs_u_b(src1, src3); + ST_UB2(dst0, dst1, dst_argb, 16); + src_argb0 += 32; + src_argb1 += 32; + dst_argb += 32; + } +} + +void ARGBAttenuateRow_MSA(const uint8_t* src_argb, + uint8_t* dst_argb, + int width) { + int x; + v16u8 src0, src1, dst0, dst1; + v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9; + v4u32 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; + v8i16 zero = {0}; + v16u8 mask = {0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255}; + + for (x = 0; x < width; x += 8) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16); + vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0); + vec1 = (v8u16)__msa_ilvl_b((v16i8)src0, (v16i8)src0); + vec2 = (v8u16)__msa_ilvr_b((v16i8)src1, (v16i8)src1); + vec3 = (v8u16)__msa_ilvl_b((v16i8)src1, (v16i8)src1); + vec4 = (v8u16)__msa_fill_h(vec0[3]); + vec5 = (v8u16)__msa_fill_h(vec0[7]); + vec6 = (v8u16)__msa_fill_h(vec1[3]); + vec7 = (v8u16)__msa_fill_h(vec1[7]); + vec4 = (v8u16)__msa_pckev_d((v2i64)vec5, (v2i64)vec4); + vec5 = (v8u16)__msa_pckev_d((v2i64)vec7, (v2i64)vec6); + vec6 = (v8u16)__msa_fill_h(vec2[3]); + vec7 = (v8u16)__msa_fill_h(vec2[7]); + vec8 = (v8u16)__msa_fill_h(vec3[3]); + vec9 = (v8u16)__msa_fill_h(vec3[7]); + vec6 = (v8u16)__msa_pckev_d((v2i64)vec7, (v2i64)vec6); + vec7 = (v8u16)__msa_pckev_d((v2i64)vec9, (v2i64)vec8); + reg0 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec4); + reg1 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec4); + reg2 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec5); + reg3 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec5); + reg4 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec6); + reg5 = 
(v4u32)__msa_ilvl_h(zero, (v8i16)vec6); + reg6 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec7); + reg7 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec7); + reg0 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec0); + reg1 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec0); + reg2 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec1); + reg3 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec1); + reg4 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec2); + reg5 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec2); + reg6 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec3); + reg7 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec3); + reg0 = (v4u32)__msa_srai_w((v4i32)reg0, 24); + reg1 = (v4u32)__msa_srai_w((v4i32)reg1, 24); + reg2 = (v4u32)__msa_srai_w((v4i32)reg2, 24); + reg3 = (v4u32)__msa_srai_w((v4i32)reg3, 24); + reg4 = (v4u32)__msa_srai_w((v4i32)reg4, 24); + reg5 = (v4u32)__msa_srai_w((v4i32)reg5, 24); + reg6 = (v4u32)__msa_srai_w((v4i32)reg6, 24); + reg7 = (v4u32)__msa_srai_w((v4i32)reg7, 24); + vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0); + vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); + vec2 = (v8u16)__msa_pckev_h((v8i16)reg5, (v8i16)reg4); + vec3 = (v8u16)__msa_pckev_h((v8i16)reg7, (v8i16)reg6); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); + dst0 = __msa_bmnz_v(dst0, src0, mask); + dst1 = __msa_bmnz_v(dst1, src1, mask); + ST_UB2(dst0, dst1, dst_argb, 16); + src_argb += 32; + dst_argb += 32; + } +} + +void ARGBToRGB565DitherRow_MSA(const uint8_t* src_argb, + uint8_t* dst_rgb, + uint32_t dither4, + int width) { + int x; + v16u8 src0, src1, dst0, vec0, vec1; + v8i16 vec_d0; + v8i16 reg0, reg1, reg2; + v16i8 zero = {0}; + v8i16 max = __msa_ldi_h(0xFF); + + vec_d0 = (v8i16)__msa_fill_w(dither4); + vec_d0 = (v8i16)__msa_ilvr_b(zero, (v16i8)vec_d0); + + for (x = 0; x < width; x += 8) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16); + vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); + vec1 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); + reg0 = (v8i16)__msa_ilvev_b(zero, (v16i8)vec0); + reg1 = (v8i16)__msa_ilvev_b(zero, (v16i8)vec1); + reg2 = (v8i16)__msa_ilvod_b(zero, (v16i8)vec0); + reg0 += vec_d0; + reg1 += vec_d0; + reg2 += vec_d0; + reg0 = __msa_maxi_s_h((v8i16)reg0, 0); + reg1 = __msa_maxi_s_h((v8i16)reg1, 0); + reg2 = __msa_maxi_s_h((v8i16)reg2, 0); + reg0 = __msa_min_s_h((v8i16)max, (v8i16)reg0); + reg1 = __msa_min_s_h((v8i16)max, (v8i16)reg1); + reg2 = __msa_min_s_h((v8i16)max, (v8i16)reg2); + reg0 = __msa_srai_h(reg0, 3); + reg2 = __msa_srai_h(reg2, 3); + reg1 = __msa_srai_h(reg1, 2); + reg2 = __msa_slli_h(reg2, 11); + reg1 = __msa_slli_h(reg1, 5); + reg0 |= reg1; + dst0 = (v16u8)(reg0 | reg2); + ST_UB(dst0, dst_rgb); + src_argb += 32; + dst_rgb += 16; + } +} + +void ARGBShuffleRow_MSA(const uint8_t* src_argb, + uint8_t* dst_argb, + const uint8_t* shuffler, + int width) { + int x; + v16u8 src0, src1, dst0, dst1; + v16i8 vec0; + v16i8 shuffler_vec = {0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12}; + int32_t val = LW((int32_t*)shuffler); + + vec0 = (v16i8)__msa_fill_w(val); + shuffler_vec += vec0; + + for (x = 0; x < width; x += 8) { + src0 = (v16u8)__msa_ld_b((v16u8*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((v16u8*)src_argb, 16); + dst0 = (v16u8)__msa_vshf_b(shuffler_vec, (v16i8)src0, (v16i8)src0); + dst1 = (v16u8)__msa_vshf_b(shuffler_vec, (v16i8)src1, (v16i8)src1); + ST_UB2(dst0, dst1, dst_argb, 16); + src_argb += 32; + dst_argb += 32; + } +} + +void ARGBShadeRow_MSA(const uint8_t* src_argb, + uint8_t* dst_argb, + int width, 
+ uint32_t value) { + int x; + v16u8 src0, dst0; + v8u16 vec0, vec1; + v4u32 reg0, reg1, reg2, reg3, rgba_scale; + v8i16 zero = {0}; + + rgba_scale[0] = value; + rgba_scale = (v4u32)__msa_ilvr_b((v16i8)rgba_scale, (v16i8)rgba_scale); + rgba_scale = (v4u32)__msa_ilvr_h(zero, (v8i16)rgba_scale); + + for (x = 0; x < width; x += 4) { + src0 = (v16u8)__msa_ld_b((v16u8*)src_argb, 0); + vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0); + vec1 = (v8u16)__msa_ilvl_b((v16i8)src0, (v16i8)src0); + reg0 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec0); + reg1 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec0); + reg2 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec1); + reg3 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec1); + reg0 *= rgba_scale; + reg1 *= rgba_scale; + reg2 *= rgba_scale; + reg3 *= rgba_scale; + reg0 = (v4u32)__msa_srai_w((v4i32)reg0, 24); + reg1 = (v4u32)__msa_srai_w((v4i32)reg1, 24); + reg2 = (v4u32)__msa_srai_w((v4i32)reg2, 24); + reg3 = (v4u32)__msa_srai_w((v4i32)reg3, 24); + vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0); + vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + ST_UB(dst0, dst_argb); + src_argb += 16; + dst_argb += 16; + } +} + +void ARGBGrayRow_MSA(const uint8_t* src_argb, uint8_t* dst_argb, int width) { + int x; + v16u8 src0, src1, vec0, vec1, dst0, dst1; + v8u16 reg0; + v16u8 const_0x26 = (v16u8)__msa_ldi_h(0x26); + v16u8 const_0x4B0F = (v16u8)__msa_fill_h(0x4B0F); + + for (x = 0; x < width; x += 8) { + src0 = (v16u8)__msa_ld_b((v16u8*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((v16u8*)src_argb, 16); + vec0 = (v16u8)__msa_pckev_h((v8i16)src1, (v8i16)src0); + vec1 = (v16u8)__msa_pckod_h((v8i16)src1, (v8i16)src0); + reg0 = __msa_dotp_u_h(vec0, const_0x4B0F); + reg0 = __msa_dpadd_u_h(reg0, vec1, const_0x26); + reg0 = (v8u16)__msa_srari_h((v8i16)reg0, 7); + vec0 = (v16u8)__msa_ilvev_b((v16i8)reg0, (v16i8)reg0); + vec1 = (v16u8)__msa_ilvod_b((v16i8)vec1, (v16i8)vec0); + dst0 = (v16u8)__msa_ilvr_b((v16i8)vec1, (v16i8)vec0); + dst1 = (v16u8)__msa_ilvl_b((v16i8)vec1, (v16i8)vec0); + ST_UB2(dst0, dst1, dst_argb, 16); + src_argb += 32; + dst_argb += 32; + } +} + +void ARGBSepiaRow_MSA(uint8_t* dst_argb, int width) { + int x; + v16u8 src0, src1, dst0, dst1, vec0, vec1, vec2, vec3, vec4, vec5; + v8u16 reg0, reg1, reg2; + v16u8 const_0x4411 = (v16u8)__msa_fill_h(0x4411); + v16u8 const_0x23 = (v16u8)__msa_ldi_h(0x23); + v16u8 const_0x5816 = (v16u8)__msa_fill_h(0x5816); + v16u8 const_0x2D = (v16u8)__msa_ldi_h(0x2D); + v16u8 const_0x6218 = (v16u8)__msa_fill_h(0x6218); + v16u8 const_0x32 = (v16u8)__msa_ldi_h(0x32); + v8u16 const_0xFF = (v8u16)__msa_ldi_h(0xFF); + + for (x = 0; x < width; x += 8) { + src0 = (v16u8)__msa_ld_b((v16u8*)dst_argb, 0); + src1 = (v16u8)__msa_ld_b((v16u8*)dst_argb, 16); + vec0 = (v16u8)__msa_pckev_h((v8i16)src1, (v8i16)src0); + vec1 = (v16u8)__msa_pckod_h((v8i16)src1, (v8i16)src0); + vec3 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec1); + reg0 = (v8u16)__msa_dotp_u_h(vec0, const_0x4411); + reg1 = (v8u16)__msa_dotp_u_h(vec0, const_0x5816); + reg2 = (v8u16)__msa_dotp_u_h(vec0, const_0x6218); + reg0 = (v8u16)__msa_dpadd_u_h(reg0, vec1, const_0x23); + reg1 = (v8u16)__msa_dpadd_u_h(reg1, vec1, const_0x2D); + reg2 = (v8u16)__msa_dpadd_u_h(reg2, vec1, const_0x32); + reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 7); + reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 7); + reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 7); + reg1 = (v8u16)__msa_min_u_h((v8u16)reg1, const_0xFF); + reg2 = (v8u16)__msa_min_u_h((v8u16)reg2, const_0xFF); + vec0 = 
(v16u8)__msa_pckev_b((v16i8)reg0, (v16i8)reg0); + vec1 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg1); + vec2 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg2); + vec4 = (v16u8)__msa_ilvr_b((v16i8)vec2, (v16i8)vec0); + vec5 = (v16u8)__msa_ilvr_b((v16i8)vec3, (v16i8)vec1); + dst0 = (v16u8)__msa_ilvr_b((v16i8)vec5, (v16i8)vec4); + dst1 = (v16u8)__msa_ilvl_b((v16i8)vec5, (v16i8)vec4); + ST_UB2(dst0, dst1, dst_argb, 16); + dst_argb += 32; + } +} + +void ARGB4444ToARGBRow_MSA(const uint8_t* src_argb4444, + uint8_t* dst_argb, + int width) { + int x; + v16u8 src0, src1; + v8u16 vec0, vec1, vec2, vec3; + v16u8 dst0, dst1, dst2, dst3; + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((v16u8*)src_argb4444, 0); + src1 = (v16u8)__msa_ld_b((v16u8*)src_argb4444, 16); + vec0 = (v8u16)__msa_andi_b(src0, 0x0F); + vec1 = (v8u16)__msa_andi_b(src1, 0x0F); + vec2 = (v8u16)__msa_andi_b(src0, 0xF0); + vec3 = (v8u16)__msa_andi_b(src1, 0xF0); + vec0 |= (v8u16)__msa_slli_b((v16i8)vec0, 4); + vec1 |= (v8u16)__msa_slli_b((v16i8)vec1, 4); + vec2 |= (v8u16)__msa_srli_b((v16i8)vec2, 4); + vec3 |= (v8u16)__msa_srli_b((v16i8)vec3, 4); + dst0 = (v16u8)__msa_ilvr_b((v16i8)vec2, (v16i8)vec0); + dst1 = (v16u8)__msa_ilvl_b((v16i8)vec2, (v16i8)vec0); + dst2 = (v16u8)__msa_ilvr_b((v16i8)vec3, (v16i8)vec1); + dst3 = (v16u8)__msa_ilvl_b((v16i8)vec3, (v16i8)vec1); + ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); + src_argb4444 += 32; + dst_argb += 64; + } +} + +void ARGB1555ToARGBRow_MSA(const uint8_t* src_argb1555, + uint8_t* dst_argb, + int width) { + int x; + v8u16 src0, src1; + v8u16 vec0, vec1, vec2, vec3, vec4, vec5; + v16u8 reg0, reg1, reg2, reg3, reg4, reg5, reg6; + v16u8 dst0, dst1, dst2, dst3; + v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F); + + for (x = 0; x < width; x += 16) { + src0 = (v8u16)__msa_ld_h((v8u16*)src_argb1555, 0); + src1 = (v8u16)__msa_ld_h((v8u16*)src_argb1555, 16); + vec0 = src0 & const_0x1F; + vec1 = src1 & const_0x1F; + src0 = (v8u16)__msa_srli_h((v8i16)src0, 5); + src1 = (v8u16)__msa_srli_h((v8i16)src1, 5); + vec2 = src0 & const_0x1F; + vec3 = src1 & const_0x1F; + src0 = (v8u16)__msa_srli_h((v8i16)src0, 5); + src1 = (v8u16)__msa_srli_h((v8i16)src1, 5); + vec4 = src0 & const_0x1F; + vec5 = src1 & const_0x1F; + src0 = (v8u16)__msa_srli_h((v8i16)src0, 5); + src1 = (v8u16)__msa_srli_h((v8i16)src1, 5); + reg0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + reg1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); + reg2 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec4); + reg3 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); + reg4 = (v16u8)__msa_slli_b((v16i8)reg0, 3); + reg5 = (v16u8)__msa_slli_b((v16i8)reg1, 3); + reg6 = (v16u8)__msa_slli_b((v16i8)reg2, 3); + reg4 |= (v16u8)__msa_srai_b((v16i8)reg0, 2); + reg5 |= (v16u8)__msa_srai_b((v16i8)reg1, 2); + reg6 |= (v16u8)__msa_srai_b((v16i8)reg2, 2); + reg3 = -reg3; + reg0 = (v16u8)__msa_ilvr_b((v16i8)reg6, (v16i8)reg4); + reg1 = (v16u8)__msa_ilvl_b((v16i8)reg6, (v16i8)reg4); + reg2 = (v16u8)__msa_ilvr_b((v16i8)reg3, (v16i8)reg5); + reg3 = (v16u8)__msa_ilvl_b((v16i8)reg3, (v16i8)reg5); + dst0 = (v16u8)__msa_ilvr_b((v16i8)reg2, (v16i8)reg0); + dst1 = (v16u8)__msa_ilvl_b((v16i8)reg2, (v16i8)reg0); + dst2 = (v16u8)__msa_ilvr_b((v16i8)reg3, (v16i8)reg1); + dst3 = (v16u8)__msa_ilvl_b((v16i8)reg3, (v16i8)reg1); + ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); + src_argb1555 += 32; + dst_argb += 64; + } +} + +void RGB565ToARGBRow_MSA(const uint8_t* src_rgb565, + uint8_t* dst_argb, + int width) { + int x; + v8u16 src0, src1, vec0, vec1, vec2, vec3, 
vec4, vec5; + v8u16 reg0, reg1, reg2, reg3, reg4, reg5; + v16u8 res0, res1, res2, res3, dst0, dst1, dst2, dst3; + v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); + v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F); + v8u16 const_0x7E0 = (v8u16)__msa_fill_h(0x7E0); + v8u16 const_0xF800 = (v8u16)__msa_fill_h(0xF800); + + for (x = 0; x < width; x += 16) { + src0 = (v8u16)__msa_ld_h((v8u16*)src_rgb565, 0); + src1 = (v8u16)__msa_ld_h((v8u16*)src_rgb565, 16); + vec0 = src0 & const_0x1F; + vec1 = src0 & const_0x7E0; + vec2 = src0 & const_0xF800; + vec3 = src1 & const_0x1F; + vec4 = src1 & const_0x7E0; + vec5 = src1 & const_0xF800; + reg0 = (v8u16)__msa_slli_h((v8i16)vec0, 3); + reg1 = (v8u16)__msa_srli_h((v8i16)vec1, 3); + reg2 = (v8u16)__msa_srli_h((v8i16)vec2, 8); + reg3 = (v8u16)__msa_slli_h((v8i16)vec3, 3); + reg4 = (v8u16)__msa_srli_h((v8i16)vec4, 3); + reg5 = (v8u16)__msa_srli_h((v8i16)vec5, 8); + reg0 |= (v8u16)__msa_srli_h((v8i16)vec0, 2); + reg1 |= (v8u16)__msa_srli_h((v8i16)vec1, 9); + reg2 |= (v8u16)__msa_srli_h((v8i16)vec2, 13); + reg3 |= (v8u16)__msa_srli_h((v8i16)vec3, 2); + reg4 |= (v8u16)__msa_srli_h((v8i16)vec4, 9); + reg5 |= (v8u16)__msa_srli_h((v8i16)vec5, 13); + res0 = (v16u8)__msa_ilvev_b((v16i8)reg2, (v16i8)reg0); + res1 = (v16u8)__msa_ilvev_b((v16i8)alpha, (v16i8)reg1); + res2 = (v16u8)__msa_ilvev_b((v16i8)reg5, (v16i8)reg3); + res3 = (v16u8)__msa_ilvev_b((v16i8)alpha, (v16i8)reg4); + dst0 = (v16u8)__msa_ilvr_b((v16i8)res1, (v16i8)res0); + dst1 = (v16u8)__msa_ilvl_b((v16i8)res1, (v16i8)res0); + dst2 = (v16u8)__msa_ilvr_b((v16i8)res3, (v16i8)res2); + dst3 = (v16u8)__msa_ilvl_b((v16i8)res3, (v16i8)res2); + ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); + src_rgb565 += 32; + dst_argb += 64; + } +} + +void RGB24ToARGBRow_MSA(const uint8_t* src_rgb24, + uint8_t* dst_argb, + int width) { + int x; + v16u8 src0, src1, src2; + v16u8 vec0, vec1, vec2; + v16u8 dst0, dst1, dst2, dst3; + v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); + v16i8 shuffler = {0, 1, 2, 16, 3, 4, 5, 17, 6, 7, 8, 18, 9, 10, 11, 19}; + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_rgb24, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)src_rgb24, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)src_rgb24, 32); + vec0 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src0, 12); + vec1 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src1, 8); + vec2 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src2, 4); + dst0 = (v16u8)__msa_vshf_b(shuffler, (v16i8)alpha, (v16i8)src0); + dst1 = (v16u8)__msa_vshf_b(shuffler, (v16i8)alpha, (v16i8)vec0); + dst2 = (v16u8)__msa_vshf_b(shuffler, (v16i8)alpha, (v16i8)vec1); + dst3 = (v16u8)__msa_vshf_b(shuffler, (v16i8)alpha, (v16i8)vec2); + ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); + src_rgb24 += 48; + dst_argb += 64; + } +} + +void RAWToARGBRow_MSA(const uint8_t* src_raw, uint8_t* dst_argb, int width) { + int x; + v16u8 src0, src1, src2; + v16u8 vec0, vec1, vec2; + v16u8 dst0, dst1, dst2, dst3; + v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); + v16i8 mask = {2, 1, 0, 16, 5, 4, 3, 17, 8, 7, 6, 18, 11, 10, 9, 19}; + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_raw, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)src_raw, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)src_raw, 32); + vec0 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src0, 12); + vec1 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src1, 8); + vec2 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src2, 4); + dst0 = (v16u8)__msa_vshf_b(mask, (v16i8)alpha, (v16i8)src0); + dst1 = (v16u8)__msa_vshf_b(mask, (v16i8)alpha, (v16i8)vec0); + dst2 = 
(v16u8)__msa_vshf_b(mask, (v16i8)alpha, (v16i8)vec1); + dst3 = (v16u8)__msa_vshf_b(mask, (v16i8)alpha, (v16i8)vec2); + ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); + src_raw += 48; + dst_argb += 64; + } +} + +void ARGB1555ToYRow_MSA(const uint8_t* src_argb1555, + uint8_t* dst_y, + int width) { + int x; + v8u16 src0, src1, vec0, vec1, vec2, vec3, vec4, vec5; + v8u16 reg0, reg1, reg2, reg3, reg4, reg5; + v16u8 dst0; + v8u16 const_0x19 = (v8u16)__msa_ldi_h(0x19); + v8u16 const_0x81 = (v8u16)__msa_ldi_h(0x81); + v8u16 const_0x42 = (v8u16)__msa_ldi_h(0x42); + v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F); + v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080); + + for (x = 0; x < width; x += 16) { + src0 = (v8u16)__msa_ld_b((v8i16*)src_argb1555, 0); + src1 = (v8u16)__msa_ld_b((v8i16*)src_argb1555, 16); + vec0 = src0 & const_0x1F; + vec1 = src1 & const_0x1F; + src0 = (v8u16)__msa_srai_h((v8i16)src0, 5); + src1 = (v8u16)__msa_srai_h((v8i16)src1, 5); + vec2 = src0 & const_0x1F; + vec3 = src1 & const_0x1F; + src0 = (v8u16)__msa_srai_h((v8i16)src0, 5); + src1 = (v8u16)__msa_srai_h((v8i16)src1, 5); + vec4 = src0 & const_0x1F; + vec5 = src1 & const_0x1F; + reg0 = (v8u16)__msa_slli_h((v8i16)vec0, 3); + reg1 = (v8u16)__msa_slli_h((v8i16)vec1, 3); + reg0 |= (v8u16)__msa_srai_h((v8i16)vec0, 2); + reg1 |= (v8u16)__msa_srai_h((v8i16)vec1, 2); + reg2 = (v8u16)__msa_slli_h((v8i16)vec2, 3); + reg3 = (v8u16)__msa_slli_h((v8i16)vec3, 3); + reg2 |= (v8u16)__msa_srai_h((v8i16)vec2, 2); + reg3 |= (v8u16)__msa_srai_h((v8i16)vec3, 2); + reg4 = (v8u16)__msa_slli_h((v8i16)vec4, 3); + reg5 = (v8u16)__msa_slli_h((v8i16)vec5, 3); + reg4 |= (v8u16)__msa_srai_h((v8i16)vec4, 2); + reg5 |= (v8u16)__msa_srai_h((v8i16)vec5, 2); + reg0 *= const_0x19; + reg1 *= const_0x19; + reg2 *= const_0x81; + reg3 *= const_0x81; + reg4 *= const_0x42; + reg5 *= const_0x42; + reg0 += reg2; + reg1 += reg3; + reg0 += reg4; + reg1 += reg5; + reg0 += const_0x1080; + reg1 += const_0x1080; + reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 8); + reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 8); + dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0); + ST_UB(dst0, dst_y); + src_argb1555 += 32; + dst_y += 16; + } +} + +void RGB565ToYRow_MSA(const uint8_t* src_rgb565, uint8_t* dst_y, int width) { + int x; + v8u16 src0, src1, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8u16 reg0, reg1, reg2, reg3, reg4, reg5; + v4u32 res0, res1, res2, res3; + v16u8 dst0; + v4u32 const_0x810019 = (v4u32)__msa_fill_w(0x810019); + v4u32 const_0x010042 = (v4u32)__msa_fill_w(0x010042); + v8i16 const_0x1080 = __msa_fill_h(0x1080); + v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F); + v8u16 const_0x7E0 = (v8u16)__msa_fill_h(0x7E0); + v8u16 const_0xF800 = (v8u16)__msa_fill_h(0xF800); + + for (x = 0; x < width; x += 16) { + src0 = (v8u16)__msa_ld_b((v8i16*)src_rgb565, 0); + src1 = (v8u16)__msa_ld_b((v8i16*)src_rgb565, 16); + vec0 = src0 & const_0x1F; + vec1 = src0 & const_0x7E0; + vec2 = src0 & const_0xF800; + vec3 = src1 & const_0x1F; + vec4 = src1 & const_0x7E0; + vec5 = src1 & const_0xF800; + reg0 = (v8u16)__msa_slli_h((v8i16)vec0, 3); + reg1 = (v8u16)__msa_srli_h((v8i16)vec1, 3); + reg2 = (v8u16)__msa_srli_h((v8i16)vec2, 8); + reg3 = (v8u16)__msa_slli_h((v8i16)vec3, 3); + reg4 = (v8u16)__msa_srli_h((v8i16)vec4, 3); + reg5 = (v8u16)__msa_srli_h((v8i16)vec5, 8); + reg0 |= (v8u16)__msa_srli_h((v8i16)vec0, 2); + reg1 |= (v8u16)__msa_srli_h((v8i16)vec1, 9); + reg2 |= (v8u16)__msa_srli_h((v8i16)vec2, 13); + reg3 |= (v8u16)__msa_srli_h((v8i16)vec3, 2); + reg4 |= (v8u16)__msa_srli_h((v8i16)vec4, 9); 
+ reg5 |= (v8u16)__msa_srli_h((v8i16)vec5, 13); + vec0 = (v8u16)__msa_ilvr_h((v8i16)reg1, (v8i16)reg0); + vec1 = (v8u16)__msa_ilvl_h((v8i16)reg1, (v8i16)reg0); + vec2 = (v8u16)__msa_ilvr_h((v8i16)reg4, (v8i16)reg3); + vec3 = (v8u16)__msa_ilvl_h((v8i16)reg4, (v8i16)reg3); + vec4 = (v8u16)__msa_ilvr_h(const_0x1080, (v8i16)reg2); + vec5 = (v8u16)__msa_ilvl_h(const_0x1080, (v8i16)reg2); + vec6 = (v8u16)__msa_ilvr_h(const_0x1080, (v8i16)reg5); + vec7 = (v8u16)__msa_ilvl_h(const_0x1080, (v8i16)reg5); + res0 = __msa_dotp_u_w(vec0, (v8u16)const_0x810019); + res1 = __msa_dotp_u_w(vec1, (v8u16)const_0x810019); + res2 = __msa_dotp_u_w(vec2, (v8u16)const_0x810019); + res3 = __msa_dotp_u_w(vec3, (v8u16)const_0x810019); + res0 = __msa_dpadd_u_w(res0, vec4, (v8u16)const_0x010042); + res1 = __msa_dpadd_u_w(res1, vec5, (v8u16)const_0x010042); + res2 = __msa_dpadd_u_w(res2, vec6, (v8u16)const_0x010042); + res3 = __msa_dpadd_u_w(res3, vec7, (v8u16)const_0x010042); + res0 = (v4u32)__msa_srai_w((v4i32)res0, 8); + res1 = (v4u32)__msa_srai_w((v4i32)res1, 8); + res2 = (v4u32)__msa_srai_w((v4i32)res2, 8); + res3 = (v4u32)__msa_srai_w((v4i32)res3, 8); + vec0 = (v8u16)__msa_pckev_h((v8i16)res1, (v8i16)res0); + vec1 = (v8u16)__msa_pckev_h((v8i16)res3, (v8i16)res2); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + ST_UB(dst0, dst_y); + src_rgb565 += 32; + dst_y += 16; + } +} + +void RGB24ToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) { + int x; + v16u8 src0, src1, src2, reg0, reg1, reg2, reg3, dst0; + v8u16 vec0, vec1, vec2, vec3; + v8u16 const_0x8119 = (v8u16)__msa_fill_h(0x8119); + v8u16 const_0x42 = (v8u16)__msa_fill_h(0x42); + v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080); + v16i8 mask0 = {0, 1, 2, 3, 3, 4, 5, 6, 6, 7, 8, 9, 9, 10, 11, 12}; + v16i8 mask1 = {12, 13, 14, 15, 15, 16, 17, 18, + 18, 19, 20, 21, 21, 22, 23, 24}; + v16i8 mask2 = {8, 9, 10, 11, 11, 12, 13, 14, 14, 15, 16, 17, 17, 18, 19, 20}; + v16i8 mask3 = {4, 5, 6, 7, 7, 8, 9, 10, 10, 11, 12, 13, 13, 14, 15, 16}; + v16i8 zero = {0}; + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 32); + reg0 = (v16u8)__msa_vshf_b(mask0, zero, (v16i8)src0); + reg1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0); + reg2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src2, (v16i8)src1); + reg3 = (v16u8)__msa_vshf_b(mask3, zero, (v16i8)src2); + vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0); + vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); + vec2 = (v8u16)__msa_pckod_h((v8i16)reg1, (v8i16)reg0); + vec3 = (v8u16)__msa_pckod_h((v8i16)reg3, (v8i16)reg2); + vec0 = __msa_dotp_u_h((v16u8)vec0, (v16u8)const_0x8119); + vec1 = __msa_dotp_u_h((v16u8)vec1, (v16u8)const_0x8119); + vec0 = __msa_dpadd_u_h(vec0, (v16u8)vec2, (v16u8)const_0x42); + vec1 = __msa_dpadd_u_h(vec1, (v16u8)vec3, (v16u8)const_0x42); + vec0 += const_0x1080; + vec1 += const_0x1080; + vec0 = (v8u16)__msa_srai_h((v8i16)vec0, 8); + vec1 = (v8u16)__msa_srai_h((v8i16)vec1, 8); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + ST_UB(dst0, dst_y); + src_argb0 += 48; + dst_y += 16; + } +} + +void RAWToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) { + int x; + v16u8 src0, src1, src2, reg0, reg1, reg2, reg3, dst0; + v8u16 vec0, vec1, vec2, vec3; + v8u16 const_0x8142 = (v8u16)__msa_fill_h(0x8142); + v8u16 const_0x19 = (v8u16)__msa_fill_h(0x19); + v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080); + v16i8 mask0 = {0, 1, 2, 3, 3, 
4, 5, 6, 6, 7, 8, 9, 9, 10, 11, 12}; + v16i8 mask1 = {12, 13, 14, 15, 15, 16, 17, 18, + 18, 19, 20, 21, 21, 22, 23, 24}; + v16i8 mask2 = {8, 9, 10, 11, 11, 12, 13, 14, 14, 15, 16, 17, 17, 18, 19, 20}; + v16i8 mask3 = {4, 5, 6, 7, 7, 8, 9, 10, 10, 11, 12, 13, 13, 14, 15, 16}; + v16i8 zero = {0}; + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 32); + reg0 = (v16u8)__msa_vshf_b(mask0, zero, (v16i8)src0); + reg1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0); + reg2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src2, (v16i8)src1); + reg3 = (v16u8)__msa_vshf_b(mask3, zero, (v16i8)src2); + vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0); + vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); + vec2 = (v8u16)__msa_pckod_h((v8i16)reg1, (v8i16)reg0); + vec3 = (v8u16)__msa_pckod_h((v8i16)reg3, (v8i16)reg2); + vec0 = __msa_dotp_u_h((v16u8)vec0, (v16u8)const_0x8142); + vec1 = __msa_dotp_u_h((v16u8)vec1, (v16u8)const_0x8142); + vec0 = __msa_dpadd_u_h(vec0, (v16u8)vec2, (v16u8)const_0x19); + vec1 = __msa_dpadd_u_h(vec1, (v16u8)vec3, (v16u8)const_0x19); + vec0 += const_0x1080; + vec1 += const_0x1080; + vec0 = (v8u16)__msa_srai_h((v8i16)vec0, 8); + vec1 = (v8u16)__msa_srai_h((v8i16)vec1, 8); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + ST_UB(dst0, dst_y); + src_argb0 += 48; + dst_y += 16; + } +} + +void ARGB1555ToUVRow_MSA(const uint8_t* src_argb1555, + int src_stride_argb1555, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + const uint16_t* s = (const uint16_t*)src_argb1555; + const uint16_t* t = (const uint16_t*)(src_argb1555 + src_stride_argb1555); + int64_t res0, res1; + v8u16 src0, src1, src2, src3, reg0, reg1, reg2, reg3; + v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6; + v16u8 dst0; + v8u16 const_0x70 = (v8u16)__msa_ldi_h(0x70); + v8u16 const_0x4A = (v8u16)__msa_ldi_h(0x4A); + v8u16 const_0x26 = (v8u16)__msa_ldi_h(0x26); + v8u16 const_0x5E = (v8u16)__msa_ldi_h(0x5E); + v8u16 const_0x12 = (v8u16)__msa_ldi_h(0x12); + v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080); + v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F); + + for (x = 0; x < width; x += 16) { + src0 = (v8u16)__msa_ld_b((v8i16*)s, 0); + src1 = (v8u16)__msa_ld_b((v8i16*)s, 16); + src2 = (v8u16)__msa_ld_b((v8i16*)t, 0); + src3 = (v8u16)__msa_ld_b((v8i16*)t, 16); + vec0 = src0 & const_0x1F; + vec1 = src1 & const_0x1F; + vec0 += src2 & const_0x1F; + vec1 += src3 & const_0x1F; + vec0 = (v8u16)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + src0 = (v8u16)__msa_srai_h((v8i16)src0, 5); + src1 = (v8u16)__msa_srai_h((v8i16)src1, 5); + src2 = (v8u16)__msa_srai_h((v8i16)src2, 5); + src3 = (v8u16)__msa_srai_h((v8i16)src3, 5); + vec2 = src0 & const_0x1F; + vec3 = src1 & const_0x1F; + vec2 += src2 & const_0x1F; + vec3 += src3 & const_0x1F; + vec2 = (v8u16)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); + src0 = (v8u16)__msa_srai_h((v8i16)src0, 5); + src1 = (v8u16)__msa_srai_h((v8i16)src1, 5); + src2 = (v8u16)__msa_srai_h((v8i16)src2, 5); + src3 = (v8u16)__msa_srai_h((v8i16)src3, 5); + vec4 = src0 & const_0x1F; + vec5 = src1 & const_0x1F; + vec4 += src2 & const_0x1F; + vec5 += src3 & const_0x1F; + vec4 = (v8u16)__msa_pckev_b((v16i8)vec5, (v16i8)vec4); + vec0 = __msa_hadd_u_h((v16u8)vec0, (v16u8)vec0); + vec2 = __msa_hadd_u_h((v16u8)vec2, (v16u8)vec2); + vec4 = __msa_hadd_u_h((v16u8)vec4, (v16u8)vec4); + vec6 = (v8u16)__msa_slli_h((v8i16)vec0, 1); + vec6 |= (v8u16)__msa_srai_h((v8i16)vec0, 6); + vec0 = 
(v8u16)__msa_slli_h((v8i16)vec2, 1); + vec0 |= (v8u16)__msa_srai_h((v8i16)vec2, 6); + vec2 = (v8u16)__msa_slli_h((v8i16)vec4, 1); + vec2 |= (v8u16)__msa_srai_h((v8i16)vec4, 6); + reg0 = vec6 * const_0x70; + reg1 = vec0 * const_0x4A; + reg2 = vec2 * const_0x70; + reg3 = vec0 * const_0x5E; + reg0 += const_0x8080; + reg1 += vec2 * const_0x26; + reg2 += const_0x8080; + reg3 += vec6 * const_0x12; + reg0 -= reg1; + reg2 -= reg3; + reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 8); + reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 8); + dst0 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg0); + res0 = __msa_copy_u_d((v2i64)dst0, 0); + res1 = __msa_copy_u_d((v2i64)dst0, 1); + SD(res0, dst_u); + SD(res1, dst_v); + s += 16; + t += 16; + dst_u += 8; + dst_v += 8; + } +} + +void RGB565ToUVRow_MSA(const uint8_t* src_rgb565, + int src_stride_rgb565, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + const uint16_t* s = (const uint16_t*)src_rgb565; + const uint16_t* t = (const uint16_t*)(src_rgb565 + src_stride_rgb565); + int64_t res0, res1; + v8u16 src0, src1, src2, src3, reg0, reg1, reg2, reg3; + v8u16 vec0, vec1, vec2, vec3, vec4, vec5; + v16u8 dst0; + v8u16 const_0x70 = (v8u16)__msa_ldi_h(0x70); + v8u16 const_0x4A = (v8u16)__msa_ldi_h(0x4A); + v8u16 const_0x26 = (v8u16)__msa_ldi_h(0x26); + v8u16 const_0x5E = (v8u16)__msa_ldi_h(0x5E); + v8u16 const_0x12 = (v8u16)__msa_ldi_h(0x12); + v8u16 const_32896 = (v8u16)__msa_fill_h(0x8080); + v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F); + v8u16 const_0x3F = (v8u16)__msa_fill_h(0x3F); + + for (x = 0; x < width; x += 16) { + src0 = (v8u16)__msa_ld_b((v8i16*)s, 0); + src1 = (v8u16)__msa_ld_b((v8i16*)s, 16); + src2 = (v8u16)__msa_ld_b((v8i16*)t, 0); + src3 = (v8u16)__msa_ld_b((v8i16*)t, 16); + vec0 = src0 & const_0x1F; + vec1 = src1 & const_0x1F; + vec0 += src2 & const_0x1F; + vec1 += src3 & const_0x1F; + vec0 = (v8u16)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + src0 = (v8u16)__msa_srai_h((v8i16)src0, 5); + src1 = (v8u16)__msa_srai_h((v8i16)src1, 5); + src2 = (v8u16)__msa_srai_h((v8i16)src2, 5); + src3 = (v8u16)__msa_srai_h((v8i16)src3, 5); + vec2 = src0 & const_0x3F; + vec3 = src1 & const_0x3F; + vec2 += src2 & const_0x3F; + vec3 += src3 & const_0x3F; + vec1 = (v8u16)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); + src0 = (v8u16)__msa_srai_h((v8i16)src0, 6); + src1 = (v8u16)__msa_srai_h((v8i16)src1, 6); + src2 = (v8u16)__msa_srai_h((v8i16)src2, 6); + src3 = (v8u16)__msa_srai_h((v8i16)src3, 6); + vec4 = src0 & const_0x1F; + vec5 = src1 & const_0x1F; + vec4 += src2 & const_0x1F; + vec5 += src3 & const_0x1F; + vec2 = (v8u16)__msa_pckev_b((v16i8)vec5, (v16i8)vec4); + vec0 = __msa_hadd_u_h((v16u8)vec0, (v16u8)vec0); + vec1 = __msa_hadd_u_h((v16u8)vec1, (v16u8)vec1); + vec2 = __msa_hadd_u_h((v16u8)vec2, (v16u8)vec2); + vec3 = (v8u16)__msa_slli_h((v8i16)vec0, 1); + vec3 |= (v8u16)__msa_srai_h((v8i16)vec0, 6); + vec4 = (v8u16)__msa_slli_h((v8i16)vec2, 1); + vec4 |= (v8u16)__msa_srai_h((v8i16)vec2, 6); + reg0 = vec3 * const_0x70; + reg1 = vec1 * const_0x4A; + reg2 = vec4 * const_0x70; + reg3 = vec1 * const_0x5E; + reg0 += const_32896; + reg1 += vec4 * const_0x26; + reg2 += const_32896; + reg3 += vec3 * const_0x12; + reg0 -= reg1; + reg2 -= reg3; + reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 8); + reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 8); + dst0 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg0); + res0 = __msa_copy_u_d((v2i64)dst0, 0); + res1 = __msa_copy_u_d((v2i64)dst0, 1); + SD(res0, dst_u); + SD(res1, dst_v); + s += 16; + t += 16; + dst_u += 8; + dst_v += 8; + } +} + +void 
RGB24ToUVRow_MSA(const uint8_t* src_rgb0, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + const uint8_t* s = src_rgb0; + const uint8_t* t = src_rgb0 + src_stride_rgb; + int64_t res0, res1; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + v16u8 inp0, inp1, inp2, inp3, inp4, inp5; + v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8i16 reg0, reg1, reg2, reg3; + v16u8 dst0; + v8u16 const_0x70 = (v8u16)__msa_fill_h(0x70); + v8u16 const_0x4A = (v8u16)__msa_fill_h(0x4A); + v8u16 const_0x26 = (v8u16)__msa_fill_h(0x26); + v8u16 const_0x5E = (v8u16)__msa_fill_h(0x5E); + v8u16 const_0x12 = (v8u16)__msa_fill_h(0x12); + v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080); + v16i8 mask = {0, 1, 2, 16, 3, 4, 5, 17, 6, 7, 8, 18, 9, 10, 11, 19}; + v16i8 zero = {0}; + + for (x = 0; x < width; x += 16) { + inp0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + inp1 = (v16u8)__msa_ld_b((v16i8*)s, 16); + inp2 = (v16u8)__msa_ld_b((v16i8*)s, 32); + inp3 = (v16u8)__msa_ld_b((v16i8*)t, 0); + inp4 = (v16u8)__msa_ld_b((v16i8*)t, 16); + inp5 = (v16u8)__msa_ld_b((v16i8*)t, 32); + src1 = (v16u8)__msa_sldi_b((v16i8)inp1, (v16i8)inp0, 12); + src5 = (v16u8)__msa_sldi_b((v16i8)inp4, (v16i8)inp3, 12); + src2 = (v16u8)__msa_sldi_b((v16i8)inp2, (v16i8)inp1, 8); + src6 = (v16u8)__msa_sldi_b((v16i8)inp5, (v16i8)inp4, 8); + src3 = (v16u8)__msa_sldi_b((v16i8)inp2, (v16i8)inp2, 4); + src7 = (v16u8)__msa_sldi_b((v16i8)inp5, (v16i8)inp5, 4); + src0 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)inp0); + src1 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src1); + src2 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src2); + src3 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src3); + src4 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)inp3); + src5 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src5); + src6 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src6); + src7 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src7); + vec0 = (v8u16)__msa_ilvr_b((v16i8)src4, (v16i8)src0); + vec1 = (v8u16)__msa_ilvl_b((v16i8)src4, (v16i8)src0); + vec2 = (v8u16)__msa_ilvr_b((v16i8)src5, (v16i8)src1); + vec3 = (v8u16)__msa_ilvl_b((v16i8)src5, (v16i8)src1); + vec4 = (v8u16)__msa_ilvr_b((v16i8)src6, (v16i8)src2); + vec5 = (v8u16)__msa_ilvl_b((v16i8)src6, (v16i8)src2); + vec6 = (v8u16)__msa_ilvr_b((v16i8)src7, (v16i8)src3); + vec7 = (v8u16)__msa_ilvl_b((v16i8)src7, (v16i8)src3); + vec0 = (v8u16)__msa_hadd_u_h((v16u8)vec0, (v16u8)vec0); + vec1 = (v8u16)__msa_hadd_u_h((v16u8)vec1, (v16u8)vec1); + vec2 = (v8u16)__msa_hadd_u_h((v16u8)vec2, (v16u8)vec2); + vec3 = (v8u16)__msa_hadd_u_h((v16u8)vec3, (v16u8)vec3); + vec4 = (v8u16)__msa_hadd_u_h((v16u8)vec4, (v16u8)vec4); + vec5 = (v8u16)__msa_hadd_u_h((v16u8)vec5, (v16u8)vec5); + vec6 = (v8u16)__msa_hadd_u_h((v16u8)vec6, (v16u8)vec6); + vec7 = (v8u16)__msa_hadd_u_h((v16u8)vec7, (v16u8)vec7); + reg0 = (v8i16)__msa_pckev_d((v2i64)vec1, (v2i64)vec0); + reg1 = (v8i16)__msa_pckev_d((v2i64)vec3, (v2i64)vec2); + reg2 = (v8i16)__msa_pckev_d((v2i64)vec5, (v2i64)vec4); + reg3 = (v8i16)__msa_pckev_d((v2i64)vec7, (v2i64)vec6); + reg0 += (v8i16)__msa_pckod_d((v2i64)vec1, (v2i64)vec0); + reg1 += (v8i16)__msa_pckod_d((v2i64)vec3, (v2i64)vec2); + reg2 += (v8i16)__msa_pckod_d((v2i64)vec5, (v2i64)vec4); + reg3 += (v8i16)__msa_pckod_d((v2i64)vec7, (v2i64)vec6); + reg0 = __msa_srai_h((v8i16)reg0, 2); + reg1 = __msa_srai_h((v8i16)reg1, 2); + reg2 = __msa_srai_h((v8i16)reg2, 2); + reg3 = __msa_srai_h((v8i16)reg3, 2); + vec4 = (v8u16)__msa_pckev_h(reg1, reg0); + vec5 = 
(v8u16)__msa_pckev_h(reg3, reg2); + vec6 = (v8u16)__msa_pckod_h(reg1, reg0); + vec7 = (v8u16)__msa_pckod_h(reg3, reg2); + vec0 = (v8u16)__msa_pckev_h((v8i16)vec5, (v8i16)vec4); + vec1 = (v8u16)__msa_pckev_h((v8i16)vec7, (v8i16)vec6); + vec2 = (v8u16)__msa_pckod_h((v8i16)vec5, (v8i16)vec4); + vec3 = vec0 * const_0x70; + vec4 = vec1 * const_0x4A; + vec5 = vec2 * const_0x26; + vec2 *= const_0x70; + vec1 *= const_0x5E; + vec0 *= const_0x12; + reg0 = __msa_subv_h((v8i16)vec3, (v8i16)vec4); + reg1 = __msa_subv_h((v8i16)const_0x8080, (v8i16)vec5); + reg2 = __msa_subv_h((v8i16)vec2, (v8i16)vec1); + reg3 = __msa_subv_h((v8i16)const_0x8080, (v8i16)vec0); + reg0 += reg1; + reg2 += reg3; + reg0 = __msa_srai_h(reg0, 8); + reg2 = __msa_srai_h(reg2, 8); + dst0 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg0); + res0 = __msa_copy_u_d((v2i64)dst0, 0); + res1 = __msa_copy_u_d((v2i64)dst0, 1); + SD(res0, dst_u); + SD(res1, dst_v); + t += 48; + s += 48; + dst_u += 8; + dst_v += 8; + } +} + +void RAWToUVRow_MSA(const uint8_t* src_rgb0, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + const uint8_t* s = src_rgb0; + const uint8_t* t = src_rgb0 + src_stride_rgb; + int64_t res0, res1; + v16u8 inp0, inp1, inp2, inp3, inp4, inp5; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8i16 reg0, reg1, reg2, reg3; + v16u8 dst0; + v8u16 const_0x70 = (v8u16)__msa_fill_h(0x70); + v8u16 const_0x4A = (v8u16)__msa_fill_h(0x4A); + v8u16 const_0x26 = (v8u16)__msa_fill_h(0x26); + v8u16 const_0x5E = (v8u16)__msa_fill_h(0x5E); + v8u16 const_0x12 = (v8u16)__msa_fill_h(0x12); + v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080); + v16i8 mask = {0, 1, 2, 16, 3, 4, 5, 17, 6, 7, 8, 18, 9, 10, 11, 19}; + v16i8 zero = {0}; + + for (x = 0; x < width; x += 16) { + inp0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + inp1 = (v16u8)__msa_ld_b((v16i8*)s, 16); + inp2 = (v16u8)__msa_ld_b((v16i8*)s, 32); + inp3 = (v16u8)__msa_ld_b((v16i8*)t, 0); + inp4 = (v16u8)__msa_ld_b((v16i8*)t, 16); + inp5 = (v16u8)__msa_ld_b((v16i8*)t, 32); + src1 = (v16u8)__msa_sldi_b((v16i8)inp1, (v16i8)inp0, 12); + src5 = (v16u8)__msa_sldi_b((v16i8)inp4, (v16i8)inp3, 12); + src2 = (v16u8)__msa_sldi_b((v16i8)inp2, (v16i8)inp1, 8); + src6 = (v16u8)__msa_sldi_b((v16i8)inp5, (v16i8)inp4, 8); + src3 = (v16u8)__msa_sldi_b((v16i8)inp2, (v16i8)inp2, 4); + src7 = (v16u8)__msa_sldi_b((v16i8)inp5, (v16i8)inp5, 4); + src0 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)inp0); + src1 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src1); + src2 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src2); + src3 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src3); + src4 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)inp3); + src5 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src5); + src6 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src6); + src7 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src7); + vec0 = (v8u16)__msa_ilvr_b((v16i8)src4, (v16i8)src0); + vec1 = (v8u16)__msa_ilvl_b((v16i8)src4, (v16i8)src0); + vec2 = (v8u16)__msa_ilvr_b((v16i8)src5, (v16i8)src1); + vec3 = (v8u16)__msa_ilvl_b((v16i8)src5, (v16i8)src1); + vec4 = (v8u16)__msa_ilvr_b((v16i8)src6, (v16i8)src2); + vec5 = (v8u16)__msa_ilvl_b((v16i8)src6, (v16i8)src2); + vec6 = (v8u16)__msa_ilvr_b((v16i8)src7, (v16i8)src3); + vec7 = (v8u16)__msa_ilvl_b((v16i8)src7, (v16i8)src3); + vec0 = (v8u16)__msa_hadd_u_h((v16u8)vec0, (v16u8)vec0); + vec1 = (v8u16)__msa_hadd_u_h((v16u8)vec1, (v16u8)vec1); + vec2 = (v8u16)__msa_hadd_u_h((v16u8)vec2, 
(v16u8)vec2); + vec3 = (v8u16)__msa_hadd_u_h((v16u8)vec3, (v16u8)vec3); + vec4 = (v8u16)__msa_hadd_u_h((v16u8)vec4, (v16u8)vec4); + vec5 = (v8u16)__msa_hadd_u_h((v16u8)vec5, (v16u8)vec5); + vec6 = (v8u16)__msa_hadd_u_h((v16u8)vec6, (v16u8)vec6); + vec7 = (v8u16)__msa_hadd_u_h((v16u8)vec7, (v16u8)vec7); + reg0 = (v8i16)__msa_pckev_d((v2i64)vec1, (v2i64)vec0); + reg1 = (v8i16)__msa_pckev_d((v2i64)vec3, (v2i64)vec2); + reg2 = (v8i16)__msa_pckev_d((v2i64)vec5, (v2i64)vec4); + reg3 = (v8i16)__msa_pckev_d((v2i64)vec7, (v2i64)vec6); + reg0 += (v8i16)__msa_pckod_d((v2i64)vec1, (v2i64)vec0); + reg1 += (v8i16)__msa_pckod_d((v2i64)vec3, (v2i64)vec2); + reg2 += (v8i16)__msa_pckod_d((v2i64)vec5, (v2i64)vec4); + reg3 += (v8i16)__msa_pckod_d((v2i64)vec7, (v2i64)vec6); + reg0 = __msa_srai_h(reg0, 2); + reg1 = __msa_srai_h(reg1, 2); + reg2 = __msa_srai_h(reg2, 2); + reg3 = __msa_srai_h(reg3, 2); + vec4 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0); + vec5 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); + vec6 = (v8u16)__msa_pckod_h((v8i16)reg1, (v8i16)reg0); + vec7 = (v8u16)__msa_pckod_h((v8i16)reg3, (v8i16)reg2); + vec0 = (v8u16)__msa_pckod_h((v8i16)vec5, (v8i16)vec4); + vec1 = (v8u16)__msa_pckev_h((v8i16)vec7, (v8i16)vec6); + vec2 = (v8u16)__msa_pckev_h((v8i16)vec5, (v8i16)vec4); + vec3 = vec0 * const_0x70; + vec4 = vec1 * const_0x4A; + vec5 = vec2 * const_0x26; + vec2 *= const_0x70; + vec1 *= const_0x5E; + vec0 *= const_0x12; + reg0 = __msa_subv_h((v8i16)vec3, (v8i16)vec4); + reg1 = __msa_subv_h((v8i16)const_0x8080, (v8i16)vec5); + reg2 = __msa_subv_h((v8i16)vec2, (v8i16)vec1); + reg3 = __msa_subv_h((v8i16)const_0x8080, (v8i16)vec0); + reg0 += reg1; + reg2 += reg3; + reg0 = __msa_srai_h(reg0, 8); + reg2 = __msa_srai_h(reg2, 8); + dst0 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg0); + res0 = __msa_copy_u_d((v2i64)dst0, 0); + res1 = __msa_copy_u_d((v2i64)dst0, 1); + SD(res0, dst_u); + SD(res1, dst_v); + t += 48; + s += 48; + dst_u += 8; + dst_v += 8; + } +} + +void NV12ToARGBRow_MSA(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + int x; + uint64_t val0, val1; + v16u8 src0, src1, res0, res1, dst0, dst1; + v8i16 vec0, vec1, vec2; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; + v4i32 vec_ubvr, vec_ugvg; + v16u8 zero = {0}; + v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, + vec_br, vec_yg); + vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); + vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); + + for (x = 0; x < width; x += 8) { + val0 = LD(src_y); + val1 = LD(src_uv); + src0 = (v16u8)__msa_insert_d((v2i64)zero, 0, val0); + src1 = (v16u8)__msa_insert_d((v2i64)zero, 0, val1); + YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, + vec0, vec1, vec2); + res0 = (v16u8)__msa_ilvev_b((v16i8)vec2, (v16i8)vec0); + res1 = (v16u8)__msa_ilvev_b((v16i8)alpha, (v16i8)vec1); + dst0 = (v16u8)__msa_ilvr_b((v16i8)res1, (v16i8)res0); + dst1 = (v16u8)__msa_ilvl_b((v16i8)res1, (v16i8)res0); + ST_UB2(dst0, dst1, dst_argb, 16); + src_y += 8; + src_uv += 8; + dst_argb += 32; + } +} + +void NV12ToRGB565Row_MSA(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_rgb565, + const struct YuvConstants* yuvconstants, + int width) { + int x; + uint64_t val0, val1; + v16u8 src0, src1, dst0; + v8i16 vec0, vec1, vec2; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; + v4i32 vec_ubvr, vec_ugvg; + 
v16u8 zero = {0}; + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, + vec_br, vec_yg); + vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); + vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); + + for (x = 0; x < width; x += 8) { + val0 = LD(src_y); + val1 = LD(src_uv); + src0 = (v16u8)__msa_insert_d((v2i64)zero, 0, val0); + src1 = (v16u8)__msa_insert_d((v2i64)zero, 0, val1); + YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, + vec0, vec1, vec2); + vec0 = vec0 >> 3; + vec1 = (vec1 >> 2) << 5; + vec2 = (vec2 >> 3) << 11; + dst0 = (v16u8)(vec0 | vec1 | vec2); + ST_UB(dst0, dst_rgb565); + src_y += 8; + src_uv += 8; + dst_rgb565 += 16; + } +} + +void NV21ToARGBRow_MSA(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + int x; + uint64_t val0, val1; + v16u8 src0, src1, res0, res1, dst0, dst1; + v8i16 vec0, vec1, vec2; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; + v4i32 vec_ubvr, vec_ugvg; + v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); + v16u8 zero = {0}; + v16i8 shuffler = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}; + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, + vec_br, vec_yg); + vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); + vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); + + for (x = 0; x < width; x += 8) { + val0 = LD(src_y); + val1 = LD(src_vu); + src0 = (v16u8)__msa_insert_d((v2i64)zero, 0, val0); + src1 = (v16u8)__msa_insert_d((v2i64)zero, 0, val1); + src1 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src1, (v16i8)src1); + YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, + vec0, vec1, vec2); + res0 = (v16u8)__msa_ilvev_b((v16i8)vec2, (v16i8)vec0); + res1 = (v16u8)__msa_ilvev_b((v16i8)alpha, (v16i8)vec1); + dst0 = (v16u8)__msa_ilvr_b((v16i8)res1, (v16i8)res0); + dst1 = (v16u8)__msa_ilvl_b((v16i8)res1, (v16i8)res0); + ST_UB2(dst0, dst1, dst_argb, 16); + src_y += 8; + src_vu += 8; + dst_argb += 32; + } +} + +void SobelRow_MSA(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, + int width) { + int x; + v16u8 src0, src1, vec0, dst0, dst1, dst2, dst3; + v16i8 mask0 = {0, 0, 0, 16, 1, 1, 1, 16, 2, 2, 2, 16, 3, 3, 3, 16}; + v16i8 const_0x4 = __msa_ldi_b(0x4); + v16i8 mask1 = mask0 + const_0x4; + v16i8 mask2 = mask1 + const_0x4; + v16i8 mask3 = mask2 + const_0x4; + v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_sobelx, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)src_sobely, 0); + vec0 = __msa_adds_u_b(src0, src1); + dst0 = (v16u8)__msa_vshf_b(mask0, (v16i8)alpha, (v16i8)vec0); + dst1 = (v16u8)__msa_vshf_b(mask1, (v16i8)alpha, (v16i8)vec0); + dst2 = (v16u8)__msa_vshf_b(mask2, (v16i8)alpha, (v16i8)vec0); + dst3 = (v16u8)__msa_vshf_b(mask3, (v16i8)alpha, (v16i8)vec0); + ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); + src_sobelx += 16; + src_sobely += 16; + dst_argb += 64; + } +} + +void SobelToPlaneRow_MSA(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_y, + int width) { + int x; + v16u8 src0, src1, src2, src3, dst0, dst1; + + for (x = 0; x < width; x += 32) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_sobelx, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)src_sobelx, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)src_sobely, 0); + src3 = (v16u8)__msa_ld_b((v16i8*)src_sobely, 16); + dst0 = __msa_adds_u_b(src0, src2); + dst1 = __msa_adds_u_b(src1, src3); + ST_UB2(dst0, 
dst1, dst_y, 16); + src_sobelx += 32; + src_sobely += 32; + dst_y += 32; + } +} + +void SobelXYRow_MSA(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, + int width) { + int x; + v16u8 src0, src1, vec0, vec1, vec2; + v16u8 reg0, reg1, dst0, dst1, dst2, dst3; + v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_sobelx, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)src_sobely, 0); + vec0 = __msa_adds_u_b(src0, src1); + vec1 = (v16u8)__msa_ilvr_b((v16i8)src0, (v16i8)src1); + vec2 = (v16u8)__msa_ilvl_b((v16i8)src0, (v16i8)src1); + reg0 = (v16u8)__msa_ilvr_b((v16i8)alpha, (v16i8)vec0); + reg1 = (v16u8)__msa_ilvl_b((v16i8)alpha, (v16i8)vec0); + dst0 = (v16u8)__msa_ilvr_b((v16i8)reg0, (v16i8)vec1); + dst1 = (v16u8)__msa_ilvl_b((v16i8)reg0, (v16i8)vec1); + dst2 = (v16u8)__msa_ilvr_b((v16i8)reg1, (v16i8)vec2); + dst3 = (v16u8)__msa_ilvl_b((v16i8)reg1, (v16i8)vec2); + ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); + src_sobelx += 16; + src_sobely += 16; + dst_argb += 64; + } +} + +void ARGBToYJRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) { + int x; + v16u8 src0, src1, src2, src3, dst0; + v16u8 const_0x4B0F = (v16u8)__msa_fill_h(0x4B0F); + v16u8 const_0x26 = (v16u8)__msa_fill_h(0x26); + v8u16 const_0x40 = (v8u16)__msa_fill_h(0x40); + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 32); + src3 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 48); + ARGBTOY(src0, src1, src2, src3, const_0x4B0F, const_0x26, const_0x40, 7, + dst0); + ST_UB(dst0, dst_y); + src_argb0 += 64; + dst_y += 16; + } +} + +void BGRAToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) { + int x; + v16u8 src0, src1, src2, src3, dst0; + v16u8 const_0x4200 = (v16u8)__msa_fill_h(0x4200); + v16u8 const_0x1981 = (v16u8)__msa_fill_h(0x1981); + v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080); + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 32); + src3 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 48); + ARGBTOY(src0, src1, src2, src3, const_0x4200, const_0x1981, const_0x1080, 8, + dst0); + ST_UB(dst0, dst_y); + src_argb0 += 64; + dst_y += 16; + } +} + +void ABGRToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) { + int x; + v16u8 src0, src1, src2, src3, dst0; + v16u8 const_0x8142 = (v16u8)__msa_fill_h(0x8142); + v16u8 const_0x19 = (v16u8)__msa_fill_h(0x19); + v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080); + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 32); + src3 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 48); + ARGBTOY(src0, src1, src2, src3, const_0x8142, const_0x19, const_0x1080, 8, + dst0); + ST_UB(dst0, dst_y); + src_argb0 += 64; + dst_y += 16; + } +} + +void RGBAToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) { + int x; + v16u8 src0, src1, src2, src3, dst0; + v16u8 const_0x1900 = (v16u8)__msa_fill_h(0x1900); + v16u8 const_0x4281 = (v16u8)__msa_fill_h(0x4281); + v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080); + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 32); + src3 = 
(v16u8)__msa_ld_b((v16i8*)src_argb0, 48); + ARGBTOY(src0, src1, src2, src3, const_0x1900, const_0x4281, const_0x1080, 8, + dst0); + ST_UB(dst0, dst_y); + src_argb0 += 64; + dst_y += 16; + } +} + +void ARGBToUVJRow_MSA(const uint8_t* src_rgb0, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + const uint8_t* s = src_rgb0; + const uint8_t* t = src_rgb0 + src_stride_rgb; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + v16u8 vec0, vec1, vec2, vec3; + v16u8 dst0, dst1; + v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}; + v16i8 shuffler1 = {2, 3, 6, 7, 10, 11, 14, 15, + 18, 19, 22, 23, 26, 27, 30, 31}; + v16i8 shuffler2 = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31}; + v16i8 shuffler3 = {1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30}; + v16u8 const_0x7F = (v16u8)__msa_fill_h(0x7F); + v16u8 const_0x6B14 = (v16u8)__msa_fill_h(0x6B14); + v16u8 const_0x2B54 = (v16u8)__msa_fill_h(0x2B54); + v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080); + + for (x = 0; x < width; x += 32) { + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)s, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)s, 32); + src3 = (v16u8)__msa_ld_b((v16i8*)s, 48); + src4 = (v16u8)__msa_ld_b((v16i8*)t, 0); + src5 = (v16u8)__msa_ld_b((v16i8*)t, 16); + src6 = (v16u8)__msa_ld_b((v16i8*)t, 32); + src7 = (v16u8)__msa_ld_b((v16i8*)t, 48); + src0 = __msa_aver_u_b(src0, src4); + src1 = __msa_aver_u_b(src1, src5); + src2 = __msa_aver_u_b(src2, src6); + src3 = __msa_aver_u_b(src3, src7); + src4 = (v16u8)__msa_pckev_w((v4i32)src1, (v4i32)src0); + src5 = (v16u8)__msa_pckev_w((v4i32)src3, (v4i32)src2); + src6 = (v16u8)__msa_pckod_w((v4i32)src1, (v4i32)src0); + src7 = (v16u8)__msa_pckod_w((v4i32)src3, (v4i32)src2); + vec0 = __msa_aver_u_b(src4, src6); + vec1 = __msa_aver_u_b(src5, src7); + src0 = (v16u8)__msa_ld_b((v16i8*)s, 64); + src1 = (v16u8)__msa_ld_b((v16i8*)s, 80); + src2 = (v16u8)__msa_ld_b((v16i8*)s, 96); + src3 = (v16u8)__msa_ld_b((v16i8*)s, 112); + src4 = (v16u8)__msa_ld_b((v16i8*)t, 64); + src5 = (v16u8)__msa_ld_b((v16i8*)t, 80); + src6 = (v16u8)__msa_ld_b((v16i8*)t, 96); + src7 = (v16u8)__msa_ld_b((v16i8*)t, 112); + src0 = __msa_aver_u_b(src0, src4); + src1 = __msa_aver_u_b(src1, src5); + src2 = __msa_aver_u_b(src2, src6); + src3 = __msa_aver_u_b(src3, src7); + src4 = (v16u8)__msa_pckev_w((v4i32)src1, (v4i32)src0); + src5 = (v16u8)__msa_pckev_w((v4i32)src3, (v4i32)src2); + src6 = (v16u8)__msa_pckod_w((v4i32)src1, (v4i32)src0); + src7 = (v16u8)__msa_pckod_w((v4i32)src3, (v4i32)src2); + vec2 = __msa_aver_u_b(src4, src6); + vec3 = __msa_aver_u_b(src5, src7); + ARGBTOUV(vec0, vec1, vec2, vec3, const_0x6B14, const_0x7F, const_0x2B54, + const_0x8080, shuffler1, shuffler0, shuffler2, shuffler3, dst0, + dst1); + ST_UB(dst0, dst_v); + ST_UB(dst1, dst_u); + s += 128; + t += 128; + dst_v += 16; + dst_u += 16; + } +} + +void BGRAToUVRow_MSA(const uint8_t* src_rgb0, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + const uint8_t* s = src_rgb0; + const uint8_t* t = src_rgb0 + src_stride_rgb; + v16u8 dst0, dst1, vec0, vec1, vec2, vec3; + v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}; + v16i8 shuffler1 = {2, 3, 6, 7, 10, 11, 14, 15, + 18, 19, 22, 23, 26, 27, 30, 31}; + v16i8 shuffler2 = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31}; + v16i8 shuffler3 = {2, 1, 6, 5, 10, 9, 14, 13, 18, 17, 22, 21, 26, 25, 30, 29}; + v16u8 const_0x125E = (v16u8)__msa_fill_h(0x125E); 
+ v16u8 const_0x7000 = (v16u8)__msa_fill_h(0x7000); + v16u8 const_0x264A = (v16u8)__msa_fill_h(0x264A); + v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080); + + for (x = 0; x < width; x += 32) { + READ_ARGB(s, t, vec0, vec1, vec2, vec3); + ARGBTOUV(vec0, vec1, vec2, vec3, const_0x125E, const_0x7000, const_0x264A, + const_0x8080, shuffler0, shuffler1, shuffler2, shuffler3, dst0, + dst1); + ST_UB(dst0, dst_v); + ST_UB(dst1, dst_u); + s += 128; + t += 128; + dst_v += 16; + dst_u += 16; + } +} + +void ABGRToUVRow_MSA(const uint8_t* src_rgb0, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + const uint8_t* s = src_rgb0; + const uint8_t* t = src_rgb0 + src_stride_rgb; + v16u8 src0, src1, src2, src3; + v16u8 dst0, dst1; + v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}; + v16i8 shuffler1 = {2, 3, 6, 7, 10, 11, 14, 15, + 18, 19, 22, 23, 26, 27, 30, 31}; + v16i8 shuffler2 = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31}; + v16i8 shuffler3 = {1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30}; + v16u8 const_0x4A26 = (v16u8)__msa_fill_h(0x4A26); + v16u8 const_0x0070 = (v16u8)__msa_fill_h(0x0070); + v16u8 const_0x125E = (v16u8)__msa_fill_h(0x125E); + v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080); + + for (x = 0; x < width; x += 32) { + READ_ARGB(s, t, src0, src1, src2, src3); + ARGBTOUV(src0, src1, src2, src3, const_0x4A26, const_0x0070, const_0x125E, + const_0x8080, shuffler1, shuffler0, shuffler2, shuffler3, dst0, + dst1); + ST_UB(dst0, dst_u); + ST_UB(dst1, dst_v); + s += 128; + t += 128; + dst_u += 16; + dst_v += 16; + } +} + +void RGBAToUVRow_MSA(const uint8_t* src_rgb0, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + const uint8_t* s = src_rgb0; + const uint8_t* t = src_rgb0 + src_stride_rgb; + v16u8 dst0, dst1, vec0, vec1, vec2, vec3; + v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}; + v16i8 shuffler1 = {2, 3, 6, 7, 10, 11, 14, 15, + 18, 19, 22, 23, 26, 27, 30, 31}; + v16i8 shuffler2 = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31}; + v16i8 shuffler3 = {2, 1, 6, 5, 10, 9, 14, 13, 18, 17, 22, 21, 26, 25, 30, 29}; + v16u8 const_0x125E = (v16u8)__msa_fill_h(0x264A); + v16u8 const_0x7000 = (v16u8)__msa_fill_h(0x7000); + v16u8 const_0x264A = (v16u8)__msa_fill_h(0x125E); + v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080); + + for (x = 0; x < width; x += 32) { + READ_ARGB(s, t, vec0, vec1, vec2, vec3); + ARGBTOUV(vec0, vec1, vec2, vec3, const_0x125E, const_0x7000, const_0x264A, + const_0x8080, shuffler0, shuffler1, shuffler2, shuffler3, dst0, + dst1); + ST_UB(dst0, dst_u); + ST_UB(dst1, dst_v); + s += 128; + t += 128; + dst_u += 16; + dst_v += 16; + } +} + +void I444ToARGBRow_MSA(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + int x; + v16u8 src0, src1, src2, dst0, dst1; + v8u16 vec0, vec1, vec2; + v4i32 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; + v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); + v8i16 zero = {0}; + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, + vec_br, vec_yg); + + for (x = 0; x < width; x += 8) { + READI444(src_y, src_u, src_v, src0, src1, src2); + vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0); + reg0 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec0); + reg1 = (v4i32)__msa_ilvl_h((v8i16)zero, 
(v8i16)vec0); + reg0 *= vec_yg; + reg1 *= vec_yg; + reg0 = __msa_srai_w(reg0, 16); + reg1 = __msa_srai_w(reg1, 16); + reg4 = reg0 + vec_br; + reg5 = reg1 + vec_br; + reg2 = reg0 + vec_bg; + reg3 = reg1 + vec_bg; + reg0 += vec_bb; + reg1 += vec_bb; + vec0 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src1); + vec1 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src2); + reg6 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec0); + reg7 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec0); + reg8 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec1); + reg9 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec1); + reg0 -= reg6 * vec_ub; + reg1 -= reg7 * vec_ub; + reg2 -= reg6 * vec_ug; + reg3 -= reg7 * vec_ug; + reg4 -= reg8 * vec_vr; + reg5 -= reg9 * vec_vr; + reg2 -= reg8 * vec_vg; + reg3 -= reg9 * vec_vg; + reg0 = __msa_srai_w(reg0, 6); + reg1 = __msa_srai_w(reg1, 6); + reg2 = __msa_srai_w(reg2, 6); + reg3 = __msa_srai_w(reg3, 6); + reg4 = __msa_srai_w(reg4, 6); + reg5 = __msa_srai_w(reg5, 6); + CLIP_0TO255(reg0, reg1, reg2, reg3, reg4, reg5); + vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0); + vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); + vec2 = (v8u16)__msa_pckev_h((v8i16)reg5, (v8i16)reg4); + vec0 = (v8u16)__msa_ilvev_b((v16i8)vec1, (v16i8)vec0); + vec1 = (v8u16)__msa_ilvev_b((v16i8)alpha, (v16i8)vec2); + dst0 = (v16u8)__msa_ilvr_h((v8i16)vec1, (v8i16)vec0); + dst1 = (v16u8)__msa_ilvl_h((v8i16)vec1, (v8i16)vec0); + ST_UB2(dst0, dst1, dst_argb, 16); + src_y += 8; + src_u += 8; + src_v += 8; + dst_argb += 32; + } +} + +void I400ToARGBRow_MSA(const uint8_t* src_y, uint8_t* dst_argb, int width) { + int x; + v16u8 src0, res0, res1, res2, res3, res4, dst0, dst1, dst2, dst3; + v8i16 vec0, vec1; + v4i32 reg0, reg1, reg2, reg3; + v4i32 vec_yg = __msa_fill_w(0x4A35); + v8i16 vec_ygb = __msa_fill_h(0xFB78); + v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); + v8i16 max = __msa_ldi_h(0xFF); + v8i16 zero = {0}; + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_y, 0); + vec0 = (v8i16)__msa_ilvr_b((v16i8)src0, (v16i8)src0); + vec1 = (v8i16)__msa_ilvl_b((v16i8)src0, (v16i8)src0); + reg0 = (v4i32)__msa_ilvr_h(zero, vec0); + reg1 = (v4i32)__msa_ilvl_h(zero, vec0); + reg2 = (v4i32)__msa_ilvr_h(zero, vec1); + reg3 = (v4i32)__msa_ilvl_h(zero, vec1); + reg0 *= vec_yg; + reg1 *= vec_yg; + reg2 *= vec_yg; + reg3 *= vec_yg; + reg0 = __msa_srai_w(reg0, 16); + reg1 = __msa_srai_w(reg1, 16); + reg2 = __msa_srai_w(reg2, 16); + reg3 = __msa_srai_w(reg3, 16); + vec0 = (v8i16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0); + vec1 = (v8i16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); + vec0 += vec_ygb; + vec1 += vec_ygb; + vec0 = __msa_srai_h(vec0, 6); + vec1 = __msa_srai_h(vec1, 6); + vec0 = __msa_maxi_s_h(vec0, 0); + vec1 = __msa_maxi_s_h(vec1, 0); + vec0 = __msa_min_s_h(max, vec0); + vec1 = __msa_min_s_h(max, vec1); + res0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + res1 = (v16u8)__msa_ilvr_b((v16i8)res0, (v16i8)res0); + res2 = (v16u8)__msa_ilvl_b((v16i8)res0, (v16i8)res0); + res3 = (v16u8)__msa_ilvr_b((v16i8)alpha, (v16i8)res0); + res4 = (v16u8)__msa_ilvl_b((v16i8)alpha, (v16i8)res0); + dst0 = (v16u8)__msa_ilvr_b((v16i8)res3, (v16i8)res1); + dst1 = (v16u8)__msa_ilvl_b((v16i8)res3, (v16i8)res1); + dst2 = (v16u8)__msa_ilvr_b((v16i8)res4, (v16i8)res2); + dst3 = (v16u8)__msa_ilvl_b((v16i8)res4, (v16i8)res2); + ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); + src_y += 16; + dst_argb += 64; + } +} + +void J400ToARGBRow_MSA(const uint8_t* src_y, uint8_t* dst_argb, int width) { + int x; + v16u8 src0, vec0, vec1, vec2, 
vec3, dst0, dst1, dst2, dst3; + v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_y, 0); + vec0 = (v16u8)__msa_ilvr_b((v16i8)src0, (v16i8)src0); + vec1 = (v16u8)__msa_ilvl_b((v16i8)src0, (v16i8)src0); + vec2 = (v16u8)__msa_ilvr_b((v16i8)alpha, (v16i8)src0); + vec3 = (v16u8)__msa_ilvl_b((v16i8)alpha, (v16i8)src0); + dst0 = (v16u8)__msa_ilvr_b((v16i8)vec2, (v16i8)vec0); + dst1 = (v16u8)__msa_ilvl_b((v16i8)vec2, (v16i8)vec0); + dst2 = (v16u8)__msa_ilvr_b((v16i8)vec3, (v16i8)vec1); + dst3 = (v16u8)__msa_ilvl_b((v16i8)vec3, (v16i8)vec1); + ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); + src_y += 16; + dst_argb += 64; + } +} + +void YUY2ToARGBRow_MSA(const uint8_t* src_yuy2, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + int x; + v16u8 src0, src1, src2; + v8i16 vec0, vec1, vec2; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; + v4i32 vec_ubvr, vec_ugvg; + v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, + vec_br, vec_yg); + vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); + vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); + + for (x = 0; x < width; x += 8) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_yuy2, 0); + src1 = (v16u8)__msa_pckev_b((v16i8)src0, (v16i8)src0); + src2 = (v16u8)__msa_pckod_b((v16i8)src0, (v16i8)src0); + YUVTORGB(src1, src2, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, + vec0, vec1, vec2); + STOREARGB(vec0, vec1, vec2, alpha, dst_argb); + src_yuy2 += 16; + dst_argb += 32; + } +} + +void UYVYToARGBRow_MSA(const uint8_t* src_uyvy, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + int x; + v16u8 src0, src1, src2; + v8i16 vec0, vec1, vec2; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; + v4i32 vec_ubvr, vec_ugvg; + v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, + vec_br, vec_yg); + vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); + vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); + + for (x = 0; x < width; x += 8) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_uyvy, 0); + src1 = (v16u8)__msa_pckod_b((v16i8)src0, (v16i8)src0); + src2 = (v16u8)__msa_pckev_b((v16i8)src0, (v16i8)src0); + YUVTORGB(src1, src2, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, + vec0, vec1, vec2); + STOREARGB(vec0, vec1, vec2, alpha, dst_argb); + src_uyvy += 16; + dst_argb += 32; + } +} + +void InterpolateRow_MSA(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride, + int width, + int32_t source_y_fraction) { + int32_t y1_fraction = source_y_fraction; + int32_t y0_fraction = 256 - y1_fraction; + uint16_t y_fractions; + const uint8_t* s = src_ptr; + const uint8_t* t = src_ptr + src_stride; + int x; + v16u8 src0, src1, src2, src3, dst0, dst1; + v8u16 vec0, vec1, vec2, vec3, y_frac; + + if (0 == y1_fraction) { + memcpy(dst_ptr, src_ptr, width); + return; + } + + if (128 == y1_fraction) { + for (x = 0; x < width; x += 32) { + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)s, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)t, 0); + src3 = (v16u8)__msa_ld_b((v16i8*)t, 16); + dst0 = __msa_aver_u_b(src0, src2); + dst1 = __msa_aver_u_b(src1, src3); + ST_UB2(dst0, dst1, dst_ptr, 16); + s += 32; + t += 32; + dst_ptr += 32; + } + return; + } + + y_fractions = (uint16_t)(y0_fraction + (y1_fraction << 8)); + y_frac = 
(v8u16)__msa_fill_h(y_fractions); + + for (x = 0; x < width; x += 32) { + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)s, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)t, 0); + src3 = (v16u8)__msa_ld_b((v16i8*)t, 16); + vec0 = (v8u16)__msa_ilvr_b((v16i8)src2, (v16i8)src0); + vec1 = (v8u16)__msa_ilvl_b((v16i8)src2, (v16i8)src0); + vec2 = (v8u16)__msa_ilvr_b((v16i8)src3, (v16i8)src1); + vec3 = (v8u16)__msa_ilvl_b((v16i8)src3, (v16i8)src1); + vec0 = (v8u16)__msa_dotp_u_h((v16u8)vec0, (v16u8)y_frac); + vec1 = (v8u16)__msa_dotp_u_h((v16u8)vec1, (v16u8)y_frac); + vec2 = (v8u16)__msa_dotp_u_h((v16u8)vec2, (v16u8)y_frac); + vec3 = (v8u16)__msa_dotp_u_h((v16u8)vec3, (v16u8)y_frac); + vec0 = (v8u16)__msa_srari_h((v8i16)vec0, 8); + vec1 = (v8u16)__msa_srari_h((v8i16)vec1, 8); + vec2 = (v8u16)__msa_srari_h((v8i16)vec2, 8); + vec3 = (v8u16)__msa_srari_h((v8i16)vec3, 8); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); + ST_UB2(dst0, dst1, dst_ptr, 16); + s += 32; + t += 32; + dst_ptr += 32; + } +} + +void ARGBSetRow_MSA(uint8_t* dst_argb, uint32_t v32, int width) { + int x; + v4i32 dst0 = __builtin_msa_fill_w(v32); + + for (x = 0; x < width; x += 4) { + ST_UB(dst0, dst_argb); + dst_argb += 16; + } +} + +void RAWToRGB24Row_MSA(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { + int x; + v16u8 src0, src1, src2, src3, src4, dst0, dst1, dst2; + v16i8 shuffler0 = {2, 1, 0, 5, 4, 3, 8, 7, 6, 11, 10, 9, 14, 13, 12, 17}; + v16i8 shuffler1 = {8, 7, 12, 11, 10, 15, 14, 13, + 18, 17, 16, 21, 20, 19, 24, 23}; + v16i8 shuffler2 = {14, 19, 18, 17, 22, 21, 20, 25, + 24, 23, 28, 27, 26, 31, 30, 29}; + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_raw, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)src_raw, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)src_raw, 32); + src3 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src0, 8); + src4 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src1, 8); + dst0 = (v16u8)__msa_vshf_b(shuffler0, (v16i8)src1, (v16i8)src0); + dst1 = (v16u8)__msa_vshf_b(shuffler1, (v16i8)src4, (v16i8)src3); + dst2 = (v16u8)__msa_vshf_b(shuffler2, (v16i8)src2, (v16i8)src1); + ST_UB2(dst0, dst1, dst_rgb24, 16); + ST_UB(dst2, (dst_rgb24 + 32)); + src_raw += 48; + dst_rgb24 += 48; + } +} + +void MergeUVRow_MSA(const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uv, + int width) { + int x; + v16u8 src0, src1, dst0, dst1; + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_u, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)src_v, 0); + dst0 = (v16u8)__msa_ilvr_b((v16i8)src1, (v16i8)src0); + dst1 = (v16u8)__msa_ilvl_b((v16i8)src1, (v16i8)src0); + ST_UB2(dst0, dst1, dst_uv, 16); + src_u += 16; + src_v += 16; + dst_uv += 32; + } +} + +void ARGBExtractAlphaRow_MSA(const uint8_t* src_argb, + uint8_t* dst_a, + int width) { + int i; + v16u8 src0, src1, src2, src3, vec0, vec1, dst0; + + for (i = 0; i < width; i += 16) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)src_argb, 32); + src3 = (v16u8)__msa_ld_b((v16i8*)src_argb, 48); + vec0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); + vec1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); + dst0 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0); + ST_UB(dst0, dst_a); + src_argb += 64; + dst_a += 16; + } +} + +void ARGBBlendRow_MSA(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + int x; + v16u8 src0, src1, src2, 
src3, dst0, dst1; + v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8u16 vec8, vec9, vec10, vec11, vec12, vec13; + v8u16 const_256 = (v8u16)__msa_ldi_h(256); + v16u8 const_255 = (v16u8)__msa_ldi_b(255); + v16u8 mask = {0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255}; + v16i8 zero = {0}; + + for (x = 0; x < width; x += 8) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)src_argb1, 0); + src3 = (v16u8)__msa_ld_b((v16i8*)src_argb1, 16); + vec0 = (v8u16)__msa_ilvr_b(zero, (v16i8)src0); + vec1 = (v8u16)__msa_ilvl_b(zero, (v16i8)src0); + vec2 = (v8u16)__msa_ilvr_b(zero, (v16i8)src1); + vec3 = (v8u16)__msa_ilvl_b(zero, (v16i8)src1); + vec4 = (v8u16)__msa_ilvr_b(zero, (v16i8)src2); + vec5 = (v8u16)__msa_ilvl_b(zero, (v16i8)src2); + vec6 = (v8u16)__msa_ilvr_b(zero, (v16i8)src3); + vec7 = (v8u16)__msa_ilvl_b(zero, (v16i8)src3); + vec8 = (v8u16)__msa_fill_h(vec0[3]); + vec9 = (v8u16)__msa_fill_h(vec0[7]); + vec10 = (v8u16)__msa_fill_h(vec1[3]); + vec11 = (v8u16)__msa_fill_h(vec1[7]); + vec8 = (v8u16)__msa_pckev_d((v2i64)vec9, (v2i64)vec8); + vec9 = (v8u16)__msa_pckev_d((v2i64)vec11, (v2i64)vec10); + vec10 = (v8u16)__msa_fill_h(vec2[3]); + vec11 = (v8u16)__msa_fill_h(vec2[7]); + vec12 = (v8u16)__msa_fill_h(vec3[3]); + vec13 = (v8u16)__msa_fill_h(vec3[7]); + vec10 = (v8u16)__msa_pckev_d((v2i64)vec11, (v2i64)vec10); + vec11 = (v8u16)__msa_pckev_d((v2i64)vec13, (v2i64)vec12); + vec8 = const_256 - vec8; + vec9 = const_256 - vec9; + vec10 = const_256 - vec10; + vec11 = const_256 - vec11; + vec8 *= vec4; + vec9 *= vec5; + vec10 *= vec6; + vec11 *= vec7; + vec8 = (v8u16)__msa_srai_h((v8i16)vec8, 8); + vec9 = (v8u16)__msa_srai_h((v8i16)vec9, 8); + vec10 = (v8u16)__msa_srai_h((v8i16)vec10, 8); + vec11 = (v8u16)__msa_srai_h((v8i16)vec11, 8); + vec0 += vec8; + vec1 += vec9; + vec2 += vec10; + vec3 += vec11; + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); + dst0 = __msa_bmnz_v(dst0, const_255, mask); + dst1 = __msa_bmnz_v(dst1, const_255, mask); + ST_UB2(dst0, dst1, dst_argb, 16); + src_argb0 += 32; + src_argb1 += 32; + dst_argb += 32; + } +} + +void ARGBQuantizeRow_MSA(uint8_t* dst_argb, + int scale, + int interval_size, + int interval_offset, + int width) { + int x; + v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3; + v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + v4i32 tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15; + v4i32 vec_scale = __msa_fill_w(scale); + v16u8 vec_int_sz = (v16u8)__msa_fill_b(interval_size); + v16u8 vec_int_ofst = (v16u8)__msa_fill_b(interval_offset); + v16i8 mask = {0, 1, 2, 19, 4, 5, 6, 23, 8, 9, 10, 27, 12, 13, 14, 31}; + v16i8 zero = {0}; + + for (x = 0; x < width; x += 8) { + src0 = (v16u8)__msa_ld_b((v16i8*)dst_argb, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)dst_argb, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)dst_argb, 32); + src3 = (v16u8)__msa_ld_b((v16i8*)dst_argb, 48); + vec0 = (v8i16)__msa_ilvr_b(zero, (v16i8)src0); + vec1 = (v8i16)__msa_ilvl_b(zero, (v16i8)src0); + vec2 = (v8i16)__msa_ilvr_b(zero, (v16i8)src1); + vec3 = (v8i16)__msa_ilvl_b(zero, (v16i8)src1); + vec4 = (v8i16)__msa_ilvr_b(zero, (v16i8)src2); + vec5 = (v8i16)__msa_ilvl_b(zero, (v16i8)src2); + vec6 = (v8i16)__msa_ilvr_b(zero, (v16i8)src3); + vec7 = (v8i16)__msa_ilvl_b(zero, (v16i8)src3); + tmp0 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec0); + tmp1 = 
(v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec0); + tmp2 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec1); + tmp3 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec1); + tmp4 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec2); + tmp5 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec2); + tmp6 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec3); + tmp7 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec3); + tmp8 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec4); + tmp9 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec4); + tmp10 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec5); + tmp11 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec5); + tmp12 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec6); + tmp13 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec6); + tmp14 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec7); + tmp15 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec7); + tmp0 *= vec_scale; + tmp1 *= vec_scale; + tmp2 *= vec_scale; + tmp3 *= vec_scale; + tmp4 *= vec_scale; + tmp5 *= vec_scale; + tmp6 *= vec_scale; + tmp7 *= vec_scale; + tmp8 *= vec_scale; + tmp9 *= vec_scale; + tmp10 *= vec_scale; + tmp11 *= vec_scale; + tmp12 *= vec_scale; + tmp13 *= vec_scale; + tmp14 *= vec_scale; + tmp15 *= vec_scale; + tmp0 >>= 16; + tmp1 >>= 16; + tmp2 >>= 16; + tmp3 >>= 16; + tmp4 >>= 16; + tmp5 >>= 16; + tmp6 >>= 16; + tmp7 >>= 16; + tmp8 >>= 16; + tmp9 >>= 16; + tmp10 >>= 16; + tmp11 >>= 16; + tmp12 >>= 16; + tmp13 >>= 16; + tmp14 >>= 16; + tmp15 >>= 16; + vec0 = (v8i16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0); + vec1 = (v8i16)__msa_pckev_h((v8i16)tmp3, (v8i16)tmp2); + vec2 = (v8i16)__msa_pckev_h((v8i16)tmp5, (v8i16)tmp4); + vec3 = (v8i16)__msa_pckev_h((v8i16)tmp7, (v8i16)tmp6); + vec4 = (v8i16)__msa_pckev_h((v8i16)tmp9, (v8i16)tmp8); + vec5 = (v8i16)__msa_pckev_h((v8i16)tmp11, (v8i16)tmp10); + vec6 = (v8i16)__msa_pckev_h((v8i16)tmp13, (v8i16)tmp12); + vec7 = (v8i16)__msa_pckev_h((v8i16)tmp15, (v8i16)tmp14); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); + dst2 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec4); + dst3 = (v16u8)__msa_pckev_b((v16i8)vec7, (v16i8)vec6); + dst0 *= vec_int_sz; + dst1 *= vec_int_sz; + dst2 *= vec_int_sz; + dst3 *= vec_int_sz; + dst0 += vec_int_ofst; + dst1 += vec_int_ofst; + dst2 += vec_int_ofst; + dst3 += vec_int_ofst; + dst0 = (v16u8)__msa_vshf_b(mask, (v16i8)src0, (v16i8)dst0); + dst1 = (v16u8)__msa_vshf_b(mask, (v16i8)src1, (v16i8)dst1); + dst2 = (v16u8)__msa_vshf_b(mask, (v16i8)src2, (v16i8)dst2); + dst3 = (v16u8)__msa_vshf_b(mask, (v16i8)src3, (v16i8)dst3); + ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); + dst_argb += 64; + } +} + +void ARGBColorMatrixRow_MSA(const uint8_t* src_argb, + uint8_t* dst_argb, + const int8_t* matrix_argb, + int width) { + int32_t x; + v16i8 src0; + v16u8 src1, src2, dst0, dst1; + v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9; + v8i16 vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17; + v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + v4i32 tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15; + v16i8 zero = {0}; + v8i16 max = __msa_ldi_h(255); + + src0 = __msa_ld_b((v16i8*)matrix_argb, 0); + vec0 = (v8i16)__msa_ilvr_b(zero, src0); + vec1 = (v8i16)__msa_ilvl_b(zero, src0); + + for (x = 0; x < width; x += 8) { + src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0); + src2 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16); + vec2 = (v8i16)__msa_ilvr_b(zero, (v16i8)src1); + vec3 = (v8i16)__msa_ilvl_b(zero, (v16i8)src1); + vec4 = (v8i16)__msa_ilvr_b(zero, (v16i8)src2); + vec5 = (v8i16)__msa_ilvl_b(zero, 
(v16i8)src2); + vec6 = (v8i16)__msa_pckod_d((v2i64)vec2, (v2i64)vec2); + vec7 = (v8i16)__msa_pckod_d((v2i64)vec3, (v2i64)vec3); + vec8 = (v8i16)__msa_pckod_d((v2i64)vec4, (v2i64)vec4); + vec9 = (v8i16)__msa_pckod_d((v2i64)vec5, (v2i64)vec5); + vec2 = (v8i16)__msa_pckev_d((v2i64)vec2, (v2i64)vec2); + vec3 = (v8i16)__msa_pckev_d((v2i64)vec3, (v2i64)vec3); + vec4 = (v8i16)__msa_pckev_d((v2i64)vec4, (v2i64)vec4); + vec5 = (v8i16)__msa_pckev_d((v2i64)vec5, (v2i64)vec5); + vec10 = vec2 * vec0; + vec11 = vec2 * vec1; + vec12 = vec6 * vec0; + vec13 = vec6 * vec1; + tmp0 = __msa_hadd_s_w(vec10, vec10); + tmp1 = __msa_hadd_s_w(vec11, vec11); + tmp2 = __msa_hadd_s_w(vec12, vec12); + tmp3 = __msa_hadd_s_w(vec13, vec13); + vec14 = vec3 * vec0; + vec15 = vec3 * vec1; + vec16 = vec7 * vec0; + vec17 = vec7 * vec1; + tmp4 = __msa_hadd_s_w(vec14, vec14); + tmp5 = __msa_hadd_s_w(vec15, vec15); + tmp6 = __msa_hadd_s_w(vec16, vec16); + tmp7 = __msa_hadd_s_w(vec17, vec17); + vec10 = __msa_pckev_h((v8i16)tmp1, (v8i16)tmp0); + vec11 = __msa_pckev_h((v8i16)tmp3, (v8i16)tmp2); + vec12 = __msa_pckev_h((v8i16)tmp5, (v8i16)tmp4); + vec13 = __msa_pckev_h((v8i16)tmp7, (v8i16)tmp6); + tmp0 = __msa_hadd_s_w(vec10, vec10); + tmp1 = __msa_hadd_s_w(vec11, vec11); + tmp2 = __msa_hadd_s_w(vec12, vec12); + tmp3 = __msa_hadd_s_w(vec13, vec13); + tmp0 = __msa_srai_w(tmp0, 6); + tmp1 = __msa_srai_w(tmp1, 6); + tmp2 = __msa_srai_w(tmp2, 6); + tmp3 = __msa_srai_w(tmp3, 6); + vec2 = vec4 * vec0; + vec6 = vec4 * vec1; + vec3 = vec8 * vec0; + vec7 = vec8 * vec1; + tmp8 = __msa_hadd_s_w(vec2, vec2); + tmp9 = __msa_hadd_s_w(vec6, vec6); + tmp10 = __msa_hadd_s_w(vec3, vec3); + tmp11 = __msa_hadd_s_w(vec7, vec7); + vec4 = vec5 * vec0; + vec8 = vec5 * vec1; + vec5 = vec9 * vec0; + vec9 = vec9 * vec1; + tmp12 = __msa_hadd_s_w(vec4, vec4); + tmp13 = __msa_hadd_s_w(vec8, vec8); + tmp14 = __msa_hadd_s_w(vec5, vec5); + tmp15 = __msa_hadd_s_w(vec9, vec9); + vec14 = __msa_pckev_h((v8i16)tmp9, (v8i16)tmp8); + vec15 = __msa_pckev_h((v8i16)tmp11, (v8i16)tmp10); + vec16 = __msa_pckev_h((v8i16)tmp13, (v8i16)tmp12); + vec17 = __msa_pckev_h((v8i16)tmp15, (v8i16)tmp14); + tmp4 = __msa_hadd_s_w(vec14, vec14); + tmp5 = __msa_hadd_s_w(vec15, vec15); + tmp6 = __msa_hadd_s_w(vec16, vec16); + tmp7 = __msa_hadd_s_w(vec17, vec17); + tmp4 = __msa_srai_w(tmp4, 6); + tmp5 = __msa_srai_w(tmp5, 6); + tmp6 = __msa_srai_w(tmp6, 6); + tmp7 = __msa_srai_w(tmp7, 6); + vec10 = __msa_pckev_h((v8i16)tmp1, (v8i16)tmp0); + vec11 = __msa_pckev_h((v8i16)tmp3, (v8i16)tmp2); + vec12 = __msa_pckev_h((v8i16)tmp5, (v8i16)tmp4); + vec13 = __msa_pckev_h((v8i16)tmp7, (v8i16)tmp6); + vec10 = __msa_maxi_s_h(vec10, 0); + vec11 = __msa_maxi_s_h(vec11, 0); + vec12 = __msa_maxi_s_h(vec12, 0); + vec13 = __msa_maxi_s_h(vec13, 0); + vec10 = __msa_min_s_h(vec10, max); + vec11 = __msa_min_s_h(vec11, max); + vec12 = __msa_min_s_h(vec12, max); + vec13 = __msa_min_s_h(vec13, max); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec11, (v16i8)vec10); + dst1 = (v16u8)__msa_pckev_b((v16i8)vec13, (v16i8)vec12); + ST_UB2(dst0, dst1, dst_argb, 16); + src_argb += 32; + dst_argb += 32; + } +} + +void SplitUVRow_MSA(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3; + + for (x = 0; x < width; x += 32) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_uv, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)src_uv, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)src_uv, 32); + src3 = (v16u8)__msa_ld_b((v16i8*)src_uv, 48); + dst0 = (v16u8)__msa_pckev_b((v16i8)src1, 
(v16i8)src0); + dst1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); + dst2 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); + dst3 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); + ST_UB2(dst0, dst1, dst_u, 16); + ST_UB2(dst2, dst3, dst_v, 16); + src_uv += 64; + dst_u += 32; + dst_v += 32; + } +} + +void SetRow_MSA(uint8_t* dst, uint8_t v8, int width) { + int x; + v16u8 dst0 = (v16u8)__msa_fill_b(v8); + + for (x = 0; x < width; x += 16) { + ST_UB(dst0, dst); + dst += 16; + } +} + +void MirrorUVRow_MSA(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + v16u8 src0, src1, src2, src3; + v16u8 dst0, dst1, dst2, dst3; + v16i8 mask0 = {30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0}; + v16i8 mask1 = {31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1}; + + src_uv += (2 * width); + + for (x = 0; x < width; x += 32) { + src_uv -= 64; + src2 = (v16u8)__msa_ld_b((v16i8*)src_uv, 0); + src3 = (v16u8)__msa_ld_b((v16i8*)src_uv, 16); + src0 = (v16u8)__msa_ld_b((v16i8*)src_uv, 32); + src1 = (v16u8)__msa_ld_b((v16i8*)src_uv, 48); + dst0 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0); + dst1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src3, (v16i8)src2); + dst2 = (v16u8)__msa_vshf_b(mask0, (v16i8)src1, (v16i8)src0); + dst3 = (v16u8)__msa_vshf_b(mask0, (v16i8)src3, (v16i8)src2); + ST_UB2(dst0, dst1, dst_v, 16); + ST_UB2(dst2, dst3, dst_u, 16); + dst_u += 32; + dst_v += 32; + } +} + +void SobelXRow_MSA(const uint8_t* src_y0, + const uint8_t* src_y1, + const uint8_t* src_y2, + uint8_t* dst_sobelx, + int32_t width) { + int x; + v16u8 src0, src1, src2, src3, src4, src5, dst0; + v8i16 vec0, vec1, vec2, vec3, vec4, vec5; + v16i8 mask0 = {0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9}; + v16i8 tmp = __msa_ldi_b(8); + v16i8 mask1 = mask0 + tmp; + v8i16 zero = {0}; + v8i16 max = __msa_ldi_h(255); + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_y0, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)src_y0, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)src_y1, 0); + src3 = (v16u8)__msa_ld_b((v16i8*)src_y1, 16); + src4 = (v16u8)__msa_ld_b((v16i8*)src_y2, 0); + src5 = (v16u8)__msa_ld_b((v16i8*)src_y2, 16); + vec0 = (v8i16)__msa_vshf_b(mask0, (v16i8)src1, (v16i8)src0); + vec1 = (v8i16)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0); + vec2 = (v8i16)__msa_vshf_b(mask0, (v16i8)src3, (v16i8)src2); + vec3 = (v8i16)__msa_vshf_b(mask1, (v16i8)src3, (v16i8)src2); + vec4 = (v8i16)__msa_vshf_b(mask0, (v16i8)src5, (v16i8)src4); + vec5 = (v8i16)__msa_vshf_b(mask1, (v16i8)src5, (v16i8)src4); + vec0 = (v8i16)__msa_hsub_u_h((v16u8)vec0, (v16u8)vec0); + vec1 = (v8i16)__msa_hsub_u_h((v16u8)vec1, (v16u8)vec1); + vec2 = (v8i16)__msa_hsub_u_h((v16u8)vec2, (v16u8)vec2); + vec3 = (v8i16)__msa_hsub_u_h((v16u8)vec3, (v16u8)vec3); + vec4 = (v8i16)__msa_hsub_u_h((v16u8)vec4, (v16u8)vec4); + vec5 = (v8i16)__msa_hsub_u_h((v16u8)vec5, (v16u8)vec5); + vec0 += vec2; + vec1 += vec3; + vec4 += vec2; + vec5 += vec3; + vec0 += vec4; + vec1 += vec5; + vec0 = __msa_add_a_h(zero, vec0); + vec1 = __msa_add_a_h(zero, vec1); + vec0 = __msa_maxi_s_h(vec0, 0); + vec1 = __msa_maxi_s_h(vec1, 0); + vec0 = __msa_min_s_h(max, vec0); + vec1 = __msa_min_s_h(max, vec1); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + ST_UB(dst0, dst_sobelx); + src_y0 += 16; + src_y1 += 16; + src_y2 += 16; + dst_sobelx += 16; + } +} + +void SobelYRow_MSA(const uint8_t* src_y0, + const uint8_t* src_y1, + uint8_t* dst_sobely, + int32_t width) { + int x; + v16u8 src0, src1, dst0; + v8i16 vec0, vec1, vec2, vec3, 
vec4, vec5, vec6; + v8i16 zero = {0}; + v8i16 max = __msa_ldi_h(255); + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_y0, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)src_y1, 0); + vec0 = (v8i16)__msa_ilvr_b((v16i8)zero, (v16i8)src0); + vec1 = (v8i16)__msa_ilvl_b((v16i8)zero, (v16i8)src0); + vec2 = (v8i16)__msa_ilvr_b((v16i8)zero, (v16i8)src1); + vec3 = (v8i16)__msa_ilvl_b((v16i8)zero, (v16i8)src1); + vec0 -= vec2; + vec1 -= vec3; + vec6[0] = src_y0[16] - src_y1[16]; + vec6[1] = src_y0[17] - src_y1[17]; + vec2 = (v8i16)__msa_sldi_b((v16i8)vec1, (v16i8)vec0, 2); + vec3 = (v8i16)__msa_sldi_b((v16i8)vec6, (v16i8)vec1, 2); + vec4 = (v8i16)__msa_sldi_b((v16i8)vec1, (v16i8)vec0, 4); + vec5 = (v8i16)__msa_sldi_b((v16i8)vec6, (v16i8)vec1, 4); + vec0 += vec2; + vec1 += vec3; + vec4 += vec2; + vec5 += vec3; + vec0 += vec4; + vec1 += vec5; + vec0 = __msa_add_a_h(zero, vec0); + vec1 = __msa_add_a_h(zero, vec1); + vec0 = __msa_maxi_s_h(vec0, 0); + vec1 = __msa_maxi_s_h(vec1, 0); + vec0 = __msa_min_s_h(max, vec0); + vec1 = __msa_min_s_h(max, vec1); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + ST_UB(dst0, dst_sobely); + src_y0 += 16; + src_y1 += 16; + dst_sobely += 16; + } +} + +void HalfFloatRow_MSA(const uint16_t* src, + uint16_t* dst, + float scale, + int width) { + int i; + v8u16 src0, src1, src2, src3, dst0, dst1, dst2, dst3; + v4u32 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v4f32 fvec0, fvec1, fvec2, fvec3, fvec4, fvec5, fvec6, fvec7; + v4f32 mult_vec; + v8i16 zero = {0}; + mult_vec[0] = 1.9259299444e-34f * scale; + mult_vec = (v4f32)__msa_splati_w((v4i32)mult_vec, 0); + + for (i = 0; i < width; i += 32) { + src0 = (v8u16)__msa_ld_h((v8i16*)src, 0); + src1 = (v8u16)__msa_ld_h((v8i16*)src, 16); + src2 = (v8u16)__msa_ld_h((v8i16*)src, 32); + src3 = (v8u16)__msa_ld_h((v8i16*)src, 48); + vec0 = (v4u32)__msa_ilvr_h(zero, (v8i16)src0); + vec1 = (v4u32)__msa_ilvl_h(zero, (v8i16)src0); + vec2 = (v4u32)__msa_ilvr_h(zero, (v8i16)src1); + vec3 = (v4u32)__msa_ilvl_h(zero, (v8i16)src1); + vec4 = (v4u32)__msa_ilvr_h(zero, (v8i16)src2); + vec5 = (v4u32)__msa_ilvl_h(zero, (v8i16)src2); + vec6 = (v4u32)__msa_ilvr_h(zero, (v8i16)src3); + vec7 = (v4u32)__msa_ilvl_h(zero, (v8i16)src3); + fvec0 = __msa_ffint_u_w(vec0); + fvec1 = __msa_ffint_u_w(vec1); + fvec2 = __msa_ffint_u_w(vec2); + fvec3 = __msa_ffint_u_w(vec3); + fvec4 = __msa_ffint_u_w(vec4); + fvec5 = __msa_ffint_u_w(vec5); + fvec6 = __msa_ffint_u_w(vec6); + fvec7 = __msa_ffint_u_w(vec7); + fvec0 *= mult_vec; + fvec1 *= mult_vec; + fvec2 *= mult_vec; + fvec3 *= mult_vec; + fvec4 *= mult_vec; + fvec5 *= mult_vec; + fvec6 *= mult_vec; + fvec7 *= mult_vec; + vec0 = ((v4u32)fvec0) >> 13; + vec1 = ((v4u32)fvec1) >> 13; + vec2 = ((v4u32)fvec2) >> 13; + vec3 = ((v4u32)fvec3) >> 13; + vec4 = ((v4u32)fvec4) >> 13; + vec5 = ((v4u32)fvec5) >> 13; + vec6 = ((v4u32)fvec6) >> 13; + vec7 = ((v4u32)fvec7) >> 13; + dst0 = (v8u16)__msa_pckev_h((v8i16)vec1, (v8i16)vec0); + dst1 = (v8u16)__msa_pckev_h((v8i16)vec3, (v8i16)vec2); + dst2 = (v8u16)__msa_pckev_h((v8i16)vec5, (v8i16)vec4); + dst3 = (v8u16)__msa_pckev_h((v8i16)vec7, (v8i16)vec6); + ST_UH2(dst0, dst1, dst, 8); + ST_UH2(dst2, dst3, dst + 16, 8); + src += 32; + dst += 32; + } +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) diff --git a/media/libyuv/libyuv/source/row_neon.cc b/media/libyuv/libyuv/source/row_neon.cc new file mode 100644 index 0000000000..8b6c195207 --- /dev/null +++ 
b/media/libyuv/libyuv/source/row_neon.cc @@ -0,0 +1,2693 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/row.h" + +#include <stdio.h> + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// This module is for GCC Neon +#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \ + !defined(__aarch64__) + +// Read 8 Y, 4 U and 4 V from 422 +#define READYUV422 \ + "vld1.8 {d0}, [%0]! \n" \ + "vld1.32 {d2[0]}, [%1]! \n" \ + "vld1.32 {d2[1]}, [%2]! \n" + +// Read 8 Y, 8 U and 8 V from 444 +#define READYUV444 \ + "vld1.8 {d0}, [%0]! \n" \ + "vld1.8 {d2}, [%1]! \n" \ + "vld1.8 {d3}, [%2]! \n" \ + "vpaddl.u8 q1, q1 \n" \ + "vrshrn.u16 d2, q1, #1 \n" + +// Read 8 Y, and set 4 U and 4 V to 128 +#define READYUV400 \ + "vld1.8 {d0}, [%0]! \n" \ + "vmov.u8 d2, #128 \n" + +// Read 8 Y and 4 UV from NV12 +#define READNV12 \ + "vld1.8 {d0}, [%0]! \n" \ + "vld1.8 {d2}, [%1]! \n" \ + "vmov.u8 d3, d2 \n" /* split odd/even uv apart */ \ + "vuzp.u8 d2, d3 \n" \ + "vtrn.u32 d2, d3 \n" + +// Read 8 Y and 4 VU from NV21 +#define READNV21 \ + "vld1.8 {d0}, [%0]! \n" \ + "vld1.8 {d2}, [%1]! \n" \ + "vmov.u8 d3, d2 \n" /* split odd/even uv apart */ \ + "vuzp.u8 d3, d2 \n" \ + "vtrn.u32 d2, d3 \n" + +// Read 8 YUY2 +#define READYUY2 \ + "vld2.8 {d0, d2}, [%0]! \n" \ + "vmov.u8 d3, d2 \n" \ + "vuzp.u8 d2, d3 \n" \ + "vtrn.u32 d2, d3 \n" + +// Read 8 UYVY +#define READUYVY \ + "vld2.8 {d2, d3}, [%0]! \n" \ + "vmov.u8 d0, d3 \n" \ + "vmov.u8 d3, d2 \n" \ + "vuzp.u8 d2, d3 \n" \ + "vtrn.u32 d2, d3 \n" + +#define YUVTORGB_SETUP \ + "vld1.8 {d24}, [%[kUVToRB]] \n" \ + "vld1.8 {d25}, [%[kUVToG]] \n" \ + "vld1.16 {d26[], d27[]}, [%[kUVBiasBGR]]! \n" \ + "vld1.16 {d8[], d9[]}, [%[kUVBiasBGR]]! \n" \ + "vld1.16 {d28[], d29[]}, [%[kUVBiasBGR]] \n" \ + "vld1.32 {d30[], d31[]}, [%[kYToRgb]] \n" + +#define YUVTORGB \ + "vmull.u8 q8, d2, d24 \n" /* u/v B/R component */ \ + "vmull.u8 q9, d2, d25 \n" /* u/v G component */ \ + "vmovl.u8 q0, d0 \n" /* Y */ \ + "vmovl.s16 q10, d1 \n" \ + "vmovl.s16 q0, d0 \n" \ + "vmul.s32 q10, q10, q15 \n" \ + "vmul.s32 q0, q0, q15 \n" \ + "vqshrun.s32 d0, q0, #16 \n" \ + "vqshrun.s32 d1, q10, #16 \n" /* Y */ \ + "vadd.s16 d18, d19 \n" \ + "vshll.u16 q1, d16, #16 \n" /* Replicate u * UB */ \ + "vshll.u16 q10, d17, #16 \n" /* Replicate v * VR */ \ + "vshll.u16 q3, d18, #16 \n" /* Replicate (v*VG + u*UG)*/ \ + "vaddw.u16 q1, q1, d16 \n" \ + "vaddw.u16 q10, q10, d17 \n" \ + "vaddw.u16 q3, q3, d18 \n" \ + "vqadd.s16 q8, q0, q13 \n" /* B */ \ + "vqadd.s16 q9, q0, q14 \n" /* R */ \ + "vqadd.s16 q0, q0, q4 \n" /* G */ \ + "vqadd.s16 q8, q8, q1 \n" /* B */ \ + "vqadd.s16 q9, q9, q10 \n" /* R */ \ + "vqsub.s16 q0, q0, q3 \n" /* G */ \ + "vqshrun.s16 d20, q8, #6 \n" /* B */ \ + "vqshrun.s16 d22, q9, #6 \n" /* R */ \ + "vqshrun.s16 d21, q0, #6 \n" /* G */ + +void I444ToARGBRow_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile( + YUVTORGB_SETUP + "vmov.u8 d23, #255 \n" + "1: \n" READYUV444 YUVTORGB + "subs %4, %4, #8 \n" + "vst4.8 {d20, d21, d22, d23}, [%3]! 
\n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_argb), // %3 + "+r"(width) // %4 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", + "q12", "q13", "q14", "q15"); +} + +void I422ToARGBRow_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile( + YUVTORGB_SETUP + "vmov.u8 d23, #255 \n" + "1: \n" READYUV422 YUVTORGB + "subs %4, %4, #8 \n" + "vst4.8 {d20, d21, d22, d23}, [%3]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_argb), // %3 + "+r"(width) // %4 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", + "q12", "q13", "q14", "q15"); +} + +void I422AlphaToARGBRow_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + const uint8_t* src_a, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile( + YUVTORGB_SETUP + "1: \n" READYUV422 YUVTORGB + "subs %5, %5, #8 \n" + "vld1.8 {d23}, [%3]! \n" + "vst4.8 {d20, d21, d22, d23}, [%4]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(src_a), // %3 + "+r"(dst_argb), // %4 + "+r"(width) // %5 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", + "q12", "q13", "q14", "q15"); +} + +void I422ToRGBARow_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgba, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile( + YUVTORGB_SETUP + "1: \n" READYUV422 YUVTORGB + "subs %4, %4, #8 \n" + "vmov.u8 d19, #255 \n" // YUVTORGB modified d19 + "vst4.8 {d19, d20, d21, d22}, [%3]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_rgba), // %3 + "+r"(width) // %4 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", + "q12", "q13", "q14", "q15"); +} + +void I422ToRGB24Row_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile( + YUVTORGB_SETUP + "1: \n" READYUV422 YUVTORGB + "subs %4, %4, #8 \n" + "vst3.8 {d20, d21, d22}, [%3]! 
\n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_rgb24), // %3 + "+r"(width) // %4 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", + "q12", "q13", "q14", "q15"); +} + +#define ARGBTORGB565 \ + "vshll.u8 q0, d22, #8 \n" /* R */ \ + "vshll.u8 q8, d21, #8 \n" /* G */ \ + "vshll.u8 q9, d20, #8 \n" /* B */ \ + "vsri.16 q0, q8, #5 \n" /* RG */ \ + "vsri.16 q0, q9, #11 \n" /* RGB */ + +void I422ToRGB565Row_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb565, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile( + YUVTORGB_SETUP + "1: \n" READYUV422 YUVTORGB + "subs %4, %4, #8 \n" ARGBTORGB565 + "vst1.8 {q0}, [%3]! \n" // store 8 pixels RGB565. + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_rgb565), // %3 + "+r"(width) // %4 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", + "q12", "q13", "q14", "q15"); +} + +#define ARGBTOARGB1555 \ + "vshll.u8 q0, d23, #8 \n" /* A */ \ + "vshll.u8 q8, d22, #8 \n" /* R */ \ + "vshll.u8 q9, d21, #8 \n" /* G */ \ + "vshll.u8 q10, d20, #8 \n" /* B */ \ + "vsri.16 q0, q8, #1 \n" /* AR */ \ + "vsri.16 q0, q9, #6 \n" /* ARG */ \ + "vsri.16 q0, q10, #11 \n" /* ARGB */ + +void I422ToARGB1555Row_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb1555, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile( + YUVTORGB_SETUP + "1: \n" READYUV422 YUVTORGB + "subs %4, %4, #8 \n" + "vmov.u8 d23, #255 \n" ARGBTOARGB1555 + "vst1.8 {q0}, [%3]! \n" // store 8 pixels + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_argb1555), // %3 + "+r"(width) // %4 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", + "q12", "q13", "q14", "q15"); +} + +#define ARGBTOARGB4444 \ + "vshr.u8 d20, d20, #4 \n" /* B */ \ + "vbic.32 d21, d21, d4 \n" /* G */ \ + "vshr.u8 d22, d22, #4 \n" /* R */ \ + "vbic.32 d23, d23, d4 \n" /* A */ \ + "vorr d0, d20, d21 \n" /* BG */ \ + "vorr d1, d22, d23 \n" /* RA */ \ + "vzip.u8 d0, d1 \n" /* BGRA */ + +void I422ToARGB4444Row_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb4444, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile( + YUVTORGB_SETUP + "vmov.u8 d4, #0x0f \n" // vbic bits to clear + "1: \n" + + READYUV422 YUVTORGB + "subs %4, %4, #8 \n" + "vmov.u8 d23, #255 \n" ARGBTOARGB4444 + "vst1.8 {q0}, [%3]! 
\n" // store 8 pixels + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_argb4444), // %3 + "+r"(width) // %4 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", + "q12", "q13", "q14", "q15"); +} + +void I400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) { + asm volatile( + YUVTORGB_SETUP + "vmov.u8 d23, #255 \n" + "1: \n" READYUV400 YUVTORGB + "subs %2, %2, #8 \n" + "vst4.8 {d20, d21, d22, d23}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : [kUVToRB] "r"(&kYuvI601Constants.kUVToRB), + [kUVToG] "r"(&kYuvI601Constants.kUVToG), + [kUVBiasBGR] "r"(&kYuvI601Constants.kUVBiasBGR), + [kYToRgb] "r"(&kYuvI601Constants.kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", + "q12", "q13", "q14", "q15"); +} + +void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) { + asm volatile( + "vmov.u8 d23, #255 \n" + "1: \n" + "vld1.8 {d20}, [%0]! \n" + "vmov d21, d20 \n" + "vmov d22, d20 \n" + "subs %2, %2, #8 \n" + "vst4.8 {d20, d21, d22, d23}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "d20", "d21", "d22", "d23"); +} + +void NV12ToARGBRow_NEON(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile(YUVTORGB_SETUP + "vmov.u8 d23, #255 \n" + "1: \n" READNV12 YUVTORGB + "subs %3, %3, #8 \n" + "vst4.8 {d20, d21, d22, d23}, [%2]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_uv), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", + "q10", "q11", "q12", "q13", "q14", "q15"); +} + +void NV21ToARGBRow_NEON(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile(YUVTORGB_SETUP + "vmov.u8 d23, #255 \n" + "1: \n" READNV21 YUVTORGB + "subs %3, %3, #8 \n" + "vst4.8 {d20, d21, d22, d23}, [%2]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_vu), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", + "q10", "q11", "q12", "q13", "q14", "q15"); +} + +void NV12ToRGB24Row_NEON(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile( + + YUVTORGB_SETUP + + "1: \n" + + READNV12 YUVTORGB + "subs %3, %3, #8 \n" + "vst3.8 {d20, d21, d22}, [%2]! 
\n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_uv), // %1 + "+r"(dst_rgb24), // %2 + "+r"(width) // %3 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", + "q12", "q13", "q14", "q15"); +} + +void NV21ToRGB24Row_NEON(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile( + + YUVTORGB_SETUP + + "1: \n" + + READNV21 YUVTORGB + "subs %3, %3, #8 \n" + "vst3.8 {d20, d21, d22}, [%2]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_vu), // %1 + "+r"(dst_rgb24), // %2 + "+r"(width) // %3 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", + "q12", "q13", "q14", "q15"); +} + +void NV12ToRGB565Row_NEON(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_rgb565, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile( + YUVTORGB_SETUP + "1: \n" READNV12 YUVTORGB + "subs %3, %3, #8 \n" ARGBTORGB565 + "vst1.8 {q0}, [%2]! \n" // store 8 pixels RGB565. + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_uv), // %1 + "+r"(dst_rgb565), // %2 + "+r"(width) // %3 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", + "q12", "q13", "q14", "q15"); +} + +void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile(YUVTORGB_SETUP + "vmov.u8 d23, #255 \n" + "1: \n" READYUY2 YUVTORGB + "subs %2, %2, #8 \n" + "vst4.8 {d20, d21, d22, d23}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", + "q10", "q11", "q12", "q13", "q14", "q15"); +} + +void UYVYToARGBRow_NEON(const uint8_t* src_uyvy, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile(YUVTORGB_SETUP + "vmov.u8 d23, #255 \n" + "1: \n" READUYVY YUVTORGB + "subs %2, %2, #8 \n" + "vst4.8 {d20, d21, d22, d23}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", + "q10", "q11", "q12", "q13", "q14", "q15"); +} + +// Reads 16 pairs of UV and write even values to dst_u and odd to dst_v. +void SplitUVRow_NEON(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "1: \n" + "vld2.8 {q0, q1}, [%0]! \n" // load 16 pairs of UV + "subs %3, %3, #16 \n" // 16 processed per loop + "vst1.8 {q0}, [%1]! \n" // store U + "vst1.8 {q1}, [%2]! 
\n" // store V + "bgt 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 // Output registers + : // Input registers + : "cc", "memory", "q0", "q1" // Clobber List + ); +} + +// Reads 16 U's and V's and writes out 16 pairs of UV. +void MergeUVRow_NEON(const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uv, + int width) { + asm volatile( + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load U + "vld1.8 {q1}, [%1]! \n" // load V + "subs %3, %3, #16 \n" // 16 processed per loop + "vst2.8 {q0, q1}, [%2]! \n" // store 16 pairs of UV + "bgt 1b \n" + : "+r"(src_u), // %0 + "+r"(src_v), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 // Output registers + : // Input registers + : "cc", "memory", "q0", "q1" // Clobber List + ); +} + +// Reads 16 packed RGB and write to planar dst_r, dst_g, dst_b. +void SplitRGBRow_NEON(const uint8_t* src_rgb, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + int width) { + asm volatile( + "1: \n" + "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB + "vld3.8 {d1, d3, d5}, [%0]! \n" // next 8 RGB + "subs %4, %4, #16 \n" // 16 processed per loop + "vst1.8 {q0}, [%1]! \n" // store R + "vst1.8 {q1}, [%2]! \n" // store G + "vst1.8 {q2}, [%3]! \n" // store B + "bgt 1b \n" + : "+r"(src_rgb), // %0 + "+r"(dst_r), // %1 + "+r"(dst_g), // %2 + "+r"(dst_b), // %3 + "+r"(width) // %4 + : // Input registers + : "cc", "memory", "d0", "d1", "d2" // Clobber List + ); +} + +// Reads 16 planar R's, G's and B's and writes out 16 packed RGB at a time +void MergeRGBRow_NEON(const uint8_t* src_r, + const uint8_t* src_g, + const uint8_t* src_b, + uint8_t* dst_rgb, + int width) { + asm volatile( + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load R + "vld1.8 {q1}, [%1]! \n" // load G + "vld1.8 {q2}, [%2]! \n" // load B + "subs %4, %4, #16 \n" // 16 processed per loop + "vst3.8 {d0, d2, d4}, [%3]! \n" // store 8 RGB + "vst3.8 {d1, d3, d5}, [%3]! \n" // next 8 RGB + "bgt 1b \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(dst_rgb), // %3 + "+r"(width) // %4 + : // Input registers + : "cc", "memory", "q0", "q1", "q2" // Clobber List + ); +} + +// Copy multiple of 32. vld4.8 allow unaligned and is fastest on a15. +void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "1: \n" + "vld1.8 {d0, d1, d2, d3}, [%0]! \n" // load 32 + "subs %2, %2, #32 \n" // 32 processed per loop + "vst1.8 {d0, d1, d2, d3}, [%1]! \n" // store 32 + "bgt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 // Output registers + : // Input registers + : "cc", "memory", "q0", "q1" // Clobber List + ); +} + +// SetRow writes 'width' bytes using an 8 bit value repeated. +void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) { + asm volatile( + "vdup.8 q0, %2 \n" // duplicate 16 bytes + "1: \n" + "subs %1, %1, #16 \n" // 16 bytes per loop + "vst1.8 {q0}, [%0]! \n" // store + "bgt 1b \n" + : "+r"(dst), // %0 + "+r"(width) // %1 + : "r"(v8) // %2 + : "cc", "memory", "q0"); +} + +// ARGBSetRow writes 'width' pixels using an 32 bit value repeated. +void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) { + asm volatile( + "vdup.u32 q0, %2 \n" // duplicate 4 ints + "1: \n" + "subs %1, %1, #4 \n" // 4 pixels per loop + "vst1.8 {q0}, [%0]! \n" // store + "bgt 1b \n" + : "+r"(dst), // %0 + "+r"(width) // %1 + : "r"(v32) // %2 + : "cc", "memory", "q0"); +} + +void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + // Start at end of source row. 
+ "mov r3, #-16 \n" + "add %0, %0, %2 \n" + "sub %0, #16 \n" + + "1: \n" + "vld1.8 {q0}, [%0], r3 \n" // src -= 16 + "subs %2, #16 \n" // 16 pixels per loop. + "vrev64.8 q0, q0 \n" + "vst1.8 {d1}, [%1]! \n" // dst += 16 + "vst1.8 {d0}, [%1]! \n" + "bgt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "r3", "q0"); +} + +void MirrorUVRow_NEON(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + // Start at end of source row. + "mov r12, #-16 \n" + "add %0, %0, %3, lsl #1 \n" + "sub %0, #16 \n" + + "1: \n" + "vld2.8 {d0, d1}, [%0], r12 \n" // src -= 16 + "subs %3, #8 \n" // 8 pixels per loop. + "vrev64.8 q0, q0 \n" + "vst1.8 {d0}, [%1]! \n" // dst += 8 + "vst1.8 {d1}, [%2]! \n" + "bgt 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "r12", "q0"); +} + +void ARGBMirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + // Start at end of source row. + "mov r3, #-16 \n" + "add %0, %0, %2, lsl #2 \n" + "sub %0, #16 \n" + + "1: \n" + "vld1.8 {q0}, [%0], r3 \n" // src -= 16 + "subs %2, #4 \n" // 4 pixels per loop. + "vrev64.32 q0, q0 \n" + "vst1.8 {d1}, [%1]! \n" // dst += 16 + "vst1.8 {d0}, [%1]! \n" + "bgt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "r3", "q0"); +} + +void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24, + uint8_t* dst_argb, + int width) { + asm volatile( + "vmov.u8 d4, #255 \n" // Alpha + "1: \n" + "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RGB24. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" + : "+r"(src_rgb24), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List + ); +} + +void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) { + asm volatile( + "vmov.u8 d4, #255 \n" // Alpha + "1: \n" + "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vswp.u8 d1, d3 \n" // swap R, B + "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" + : "+r"(src_raw), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List + ); +} + +void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { + asm volatile( + "1: \n" + "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vswp.u8 d1, d3 \n" // swap R, B + "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of + // RGB24. + "bgt 1b \n" + : "+r"(src_raw), // %0 + "+r"(dst_rgb24), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "d1", "d2", "d3" // Clobber List + ); +} + +#define RGB565TOARGB \ + "vshrn.u16 d6, q0, #5 \n" /* G xxGGGGGG */ \ + "vuzp.u8 d0, d1 \n" /* d0 xxxBBBBB RRRRRxxx */ \ + "vshl.u8 d6, d6, #2 \n" /* G GGGGGG00 upper 6 */ \ + "vshr.u8 d1, d1, #3 \n" /* R 000RRRRR lower 5 */ \ + "vshl.u8 q0, q0, #3 \n" /* B,R BBBBB000 upper 5 */ \ + "vshr.u8 q2, q0, #5 \n" /* B,R 00000BBB lower 3 */ \ + "vorr.u8 d0, d0, d4 \n" /* B */ \ + "vshr.u8 d4, d6, #6 \n" /* G 000000GG lower 2 */ \ + "vorr.u8 d2, d1, d5 \n" /* R */ \ + "vorr.u8 d1, d4, d6 \n" /* G */ + +void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565, + uint8_t* dst_argb, + int width) { + asm volatile( + "vmov.u8 d3, #255 \n" // Alpha + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. 
+ "subs %2, %2, #8 \n" // 8 processed per loop. + RGB565TOARGB + "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" + : "+r"(src_rgb565), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List + ); +} + +#define ARGB1555TOARGB \ + "vshrn.u16 d7, q0, #8 \n" /* A Arrrrrxx */ \ + "vshr.u8 d6, d7, #2 \n" /* R xxxRRRRR */ \ + "vshrn.u16 d5, q0, #5 \n" /* G xxxGGGGG */ \ + "vmovn.u16 d4, q0 \n" /* B xxxBBBBB */ \ + "vshr.u8 d7, d7, #7 \n" /* A 0000000A */ \ + "vneg.s8 d7, d7 \n" /* A AAAAAAAA upper 8 */ \ + "vshl.u8 d6, d6, #3 \n" /* R RRRRR000 upper 5 */ \ + "vshr.u8 q1, q3, #5 \n" /* R,A 00000RRR lower 3 */ \ + "vshl.u8 q0, q2, #3 \n" /* B,G BBBBB000 upper 5 */ \ + "vshr.u8 q2, q0, #5 \n" /* B,G 00000BBB lower 3 */ \ + "vorr.u8 q1, q1, q3 \n" /* R,A */ \ + "vorr.u8 q0, q0, q2 \n" /* B,G */ + +// RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha. +#define RGB555TOARGB \ + "vshrn.u16 d6, q0, #5 \n" /* G xxxGGGGG */ \ + "vuzp.u8 d0, d1 \n" /* d0 xxxBBBBB xRRRRRxx */ \ + "vshl.u8 d6, d6, #3 \n" /* G GGGGG000 upper 5 */ \ + "vshr.u8 d1, d1, #2 \n" /* R 00xRRRRR lower 5 */ \ + "vshl.u8 q0, q0, #3 \n" /* B,R BBBBB000 upper 5 */ \ + "vshr.u8 q2, q0, #5 \n" /* B,R 00000BBB lower 3 */ \ + "vorr.u8 d0, d0, d4 \n" /* B */ \ + "vshr.u8 d4, d6, #5 \n" /* G 00000GGG lower 3 */ \ + "vorr.u8 d2, d1, d5 \n" /* R */ \ + "vorr.u8 d1, d4, d6 \n" /* G */ + +void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555, + uint8_t* dst_argb, + int width) { + asm volatile( + "vmov.u8 d3, #255 \n" // Alpha + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + ARGB1555TOARGB + "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" + : "+r"(src_argb1555), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List + ); +} + +#define ARGB4444TOARGB \ + "vuzp.u8 d0, d1 \n" /* d0 BG, d1 RA */ \ + "vshl.u8 q2, q0, #4 \n" /* B,R BBBB0000 */ \ + "vshr.u8 q1, q0, #4 \n" /* G,A 0000GGGG */ \ + "vshr.u8 q0, q2, #4 \n" /* B,R 0000BBBB */ \ + "vorr.u8 q0, q0, q2 \n" /* B,R BBBBBBBB */ \ + "vshl.u8 q2, q1, #4 \n" /* G,A GGGG0000 */ \ + "vorr.u8 q1, q1, q2 \n" /* G,A GGGGGGGG */ \ + "vswp.u8 d1, d2 \n" /* B,R,G,A -> B,G,R,A */ + +void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444, + uint8_t* dst_argb, + int width) { + asm volatile( + "vmov.u8 d3, #255 \n" // Alpha + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + ARGB4444TOARGB + "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" + : "+r"(src_argb4444), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1", "q2" // Clobber List + ); +} + +void ARGBToRGB24Row_NEON(const uint8_t* src_argb, + uint8_t* dst_rgb24, + int width) { + asm volatile( + "1: \n" + "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of + // RGB24. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_rgb24), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List + ); +} + +void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) { + asm volatile( + "1: \n" + "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. 
+ "vswp.u8 d1, d3 \n" // swap R, B + "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RAW. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_raw), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List + ); +} + +void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { + asm volatile( + "1: \n" + "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2. + "subs %2, %2, #16 \n" // 16 processed per loop. + "vst1.8 {q0}, [%1]! \n" // store 16 pixels of Y. + "bgt 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1" // Clobber List + ); +} + +void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { + asm volatile( + "1: \n" + "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY. + "subs %2, %2, #16 \n" // 16 processed per loop. + "vst1.8 {q1}, [%1]! \n" // store 16 pixels of Y. + "bgt 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1" // Clobber List + ); +} + +void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2. + "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. + "vst1.8 {d1}, [%1]! \n" // store 8 U. + "vst1.8 {d3}, [%2]! \n" // store 8 V. + "bgt 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List + ); +} + +void UYVYToUV422Row_NEON(const uint8_t* src_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. + "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. + "vst1.8 {d0}, [%1]! \n" // store 8 U. + "vst1.8 {d2}, [%2]! \n" // store 8 V. + "bgt 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List + ); +} + +void YUY2ToUVRow_NEON(const uint8_t* src_yuy2, + int stride_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "add %1, %0, %1 \n" // stride + src_yuy2 + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2. + "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. + "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row YUY2. + "vrhadd.u8 d1, d1, d5 \n" // average rows of U + "vrhadd.u8 d3, d3, d7 \n" // average rows of V + "vst1.8 {d1}, [%2]! \n" // store 8 U. + "vst1.8 {d3}, [%3]! \n" // store 8 V. + "bgt 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(stride_yuy2), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", + "d7" // Clobber List + ); +} + +void UYVYToUVRow_NEON(const uint8_t* src_uyvy, + int stride_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "add %1, %0, %1 \n" // stride + src_uyvy + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. + "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. + "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row UYVY. + "vrhadd.u8 d0, d0, d4 \n" // average rows of U + "vrhadd.u8 d2, d2, d6 \n" // average rows of V + "vst1.8 {d0}, [%2]! \n" // store 8 U. + "vst1.8 {d2}, [%3]! \n" // store 8 V. 
+ "bgt 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(stride_uyvy), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", + "d7" // Clobber List + ); +} + +// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. +void ARGBShuffleRow_NEON(const uint8_t* src_argb, + uint8_t* dst_argb, + const uint8_t* shuffler, + int width) { + asm volatile( + "vld1.8 {q2}, [%3] \n" // shuffler + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load 4 pixels. + "subs %2, %2, #4 \n" // 4 processed per loop + "vtbl.8 d2, {d0, d1}, d4 \n" // look up 2 first pixels + "vtbl.8 d3, {d0, d1}, d5 \n" // look up 2 next pixels + "vst1.8 {q1}, [%1]! \n" // store 4. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(shuffler) // %3 + : "cc", "memory", "q0", "q1", "q2" // Clobber List + ); +} + +void I422ToYUY2Row_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_yuy2, + int width) { + asm volatile( + "1: \n" + "vld2.8 {d0, d2}, [%0]! \n" // load 16 Ys + "vld1.8 {d1}, [%1]! \n" // load 8 Us + "vld1.8 {d3}, [%2]! \n" // load 8 Vs + "subs %4, %4, #16 \n" // 16 pixels + "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 YUY2/16 pixels. + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_yuy2), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "d0", "d1", "d2", "d3"); +} + +void I422ToUYVYRow_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uyvy, + int width) { + asm volatile( + "1: \n" + "vld2.8 {d1, d3}, [%0]! \n" // load 16 Ys + "vld1.8 {d0}, [%1]! \n" // load 8 Us + "vld1.8 {d2}, [%2]! \n" // load 8 Vs + "subs %4, %4, #16 \n" // 16 pixels + "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 UYVY/16 pixels. + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_uyvy), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "d0", "d1", "d2", "d3"); +} + +void ARGBToRGB565Row_NEON(const uint8_t* src_argb, + uint8_t* dst_rgb565, + int width) { + asm volatile( + "1: \n" + "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. + ARGBTORGB565 + "vst1.8 {q0}, [%1]! \n" // store 8 pixels RGB565. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_rgb565), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q8", "q9", "q10", "q11"); +} + +void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb, + uint8_t* dst_rgb, + const uint32_t dither4, + int width) { + asm volatile( + "vdup.32 d2, %2 \n" // dither4 + "1: \n" + "vld4.8 {d20, d21, d22, d23}, [%1]! \n" // load 8 pixels of ARGB. + "subs %3, %3, #8 \n" // 8 processed per loop. + "vqadd.u8 d20, d20, d2 \n" + "vqadd.u8 d21, d21, d2 \n" + "vqadd.u8 d22, d22, d2 \n" // add for dither + ARGBTORGB565 + "vst1.8 {q0}, [%0]! \n" // store 8 RGB565. + "bgt 1b \n" + : "+r"(dst_rgb) // %0 + : "r"(src_argb), // %1 + "r"(dither4), // %2 + "r"(width) // %3 + : "cc", "memory", "q0", "q1", "q8", "q9", "q10", "q11"); +} + +void ARGBToARGB1555Row_NEON(const uint8_t* src_argb, + uint8_t* dst_argb1555, + int width) { + asm volatile( + "1: \n" + "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. + ARGBTOARGB1555 + "vst1.8 {q0}, [%1]! \n" // store 8 ARGB1555. 
+ "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb1555), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q8", "q9", "q10", "q11"); +} + +void ARGBToARGB4444Row_NEON(const uint8_t* src_argb, + uint8_t* dst_argb4444, + int width) { + asm volatile( + "vmov.u8 d4, #0x0f \n" // bits to clear with + // vbic. + "1: \n" + "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. + ARGBTOARGB4444 + "vst1.8 {q0}, [%1]! \n" // store 8 ARGB4444. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb4444), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q8", "q9", "q10", "q11"); +} + +void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { + asm volatile( + "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient + "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient + "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d27, #16 \n" // Add 16 constant + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q2, d0, d24 \n" // B + "vmlal.u8 q2, d1, d25 \n" // G + "vmlal.u8 q2, d2, d26 \n" // R + "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d27 \n" + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q12", "q13"); +} + +void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb, + uint8_t* dst_a, + int width) { + asm volatile( + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels + "subs %2, %2, #16 \n" // 16 processed per loop + "vst1.8 {q3}, [%1]! \n" // store 16 A's. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_a), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List + ); +} + +void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { + asm volatile( + "vmov.u8 d24, #15 \n" // B * 0.11400 coefficient + "vmov.u8 d25, #75 \n" // G * 0.58700 coefficient + "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q2, d0, d24 \n" // B + "vmlal.u8 q2, d1, d25 \n" // G + "vmlal.u8 q2, d2, d26 \n" // R + "vqrshrun.s16 d0, q2, #7 \n" // 15 bit to 8 bit Y + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q12", "q13"); +} + +// 8x1 pixels. +void ARGBToUV444Row_NEON(const uint8_t* src_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "vmov.u8 d24, #112 \n" // UB / VR 0.875 + // coefficient + "vmov.u8 d25, #74 \n" // UG -0.5781 coefficient + "vmov.u8 d26, #38 \n" // UR -0.2969 coefficient + "vmov.u8 d27, #18 \n" // VB -0.1406 coefficient + "vmov.u8 d28, #94 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. + "subs %3, %3, #8 \n" // 8 processed per loop. 
+ "vmull.u8 q2, d0, d24 \n" // B + "vmlsl.u8 q2, d1, d25 \n" // G + "vmlsl.u8 q2, d2, d26 \n" // R + "vadd.u16 q2, q2, q15 \n" // +128 -> unsigned + + "vmull.u8 q3, d2, d24 \n" // R + "vmlsl.u8 q3, d1, d28 \n" // G + "vmlsl.u8 q3, d0, d27 \n" // B + "vadd.u16 q3, q3, q15 \n" // +128 -> unsigned + + "vqshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit U + "vqshrn.u16 d1, q3, #8 \n" // 16 bit to 8 bit V + + "vst1.8 {d0}, [%1]! \n" // store 8 pixels U. + "vst1.8 {d1}, [%2]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q12", "q13", "q14", + "q15"); +} + +// clang-format off +// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. +#define RGBTOUV(QB, QG, QR) \ + "vmul.s16 q8, " #QB ", q10 \n" /* B */ \ + "vmls.s16 q8, " #QG ", q11 \n" /* G */ \ + "vmls.s16 q8, " #QR ", q12 \n" /* R */ \ + "vadd.u16 q8, q8, q15 \n" /* +128 -> unsigned */ \ + "vmul.s16 q9, " #QR ", q10 \n" /* R */ \ + "vmls.s16 q9, " #QG ", q14 \n" /* G */ \ + "vmls.s16 q9, " #QB ", q13 \n" /* B */ \ + "vadd.u16 q9, q9, q15 \n" /* +128 -> unsigned */ \ + "vqshrn.u16 d0, q8, #8 \n" /* 16 bit to 8 bit U */ \ + "vqshrn.u16 d1, q9, #8 \n" /* 16 bit to 8 bit V */ +// clang-format on + +// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr. +void ARGBToUVRow_NEON(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile ( + "add %1, %0, %1 \n" // src_stride + src_argb + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. + "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. + "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels. + "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels. + "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts. + + "vrshr.u16 q0, q0, #1 \n" // 2x average + "vrshr.u16 q1, q1, #1 \n" + "vrshr.u16 q2, q2, #1 \n" + + "subs %4, %4, #16 \n" // 32 processed per loop. + RGBTOUV(q0, q1, q2) + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(src_stride_argb), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +// TODO(fbarchard): Subsample match C code. 
+void ARGBToUVJRow_NEON(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile ( + "add %1, %0, %1 \n" // src_stride + src_argb + "vmov.s16 q10, #127 / 2 \n" // UB / VR 0.500 coefficient + "vmov.s16 q11, #84 / 2 \n" // UG -0.33126 coefficient + "vmov.s16 q12, #43 / 2 \n" // UR -0.16874 coefficient + "vmov.s16 q13, #20 / 2 \n" // VB -0.08131 coefficient + "vmov.s16 q14, #107 / 2 \n" // VG -0.41869 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. + "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. + "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels. + "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels. + "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts. + + "vrshr.u16 q0, q0, #1 \n" // 2x average + "vrshr.u16 q1, q1, #1 \n" + "vrshr.u16 q2, q2, #1 \n" + + "subs %4, %4, #16 \n" // 32 processed per loop. + RGBTOUV(q0, q1, q2) + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(src_stride_argb), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +void BGRAToUVRow_NEON(const uint8_t* src_bgra, + int src_stride_bgra, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile ( + "add %1, %0, %1 \n" // src_stride + src_bgra + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 BGRA pixels. + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 BGRA pixels. + "vpaddl.u8 q3, q3 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q2, q2 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // R 16 bytes -> 8 shorts. + "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more BGRA pixels. + "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 BGRA pixels. + "vpadal.u8 q3, q7 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q2, q6 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q1, q5 \n" // R 16 bytes -> 8 shorts. + + "vrshr.u16 q1, q1, #1 \n" // 2x average + "vrshr.u16 q2, q2, #1 \n" + "vrshr.u16 q3, q3, #1 \n" + + "subs %4, %4, #16 \n" // 32 processed per loop. + RGBTOUV(q3, q2, q1) + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. 
+ "bgt 1b \n" + : "+r"(src_bgra), // %0 + "+r"(src_stride_bgra), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +void ABGRToUVRow_NEON(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile ( + "add %1, %0, %1 \n" // src_stride + src_abgr + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ABGR pixels. + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ABGR pixels. + "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts. + "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ABGR pixels. + "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ABGR pixels. + "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts. + + "vrshr.u16 q0, q0, #1 \n" // 2x average + "vrshr.u16 q1, q1, #1 \n" + "vrshr.u16 q2, q2, #1 \n" + + "subs %4, %4, #16 \n" // 32 processed per loop. + RGBTOUV(q2, q1, q0) + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_abgr), // %0 + "+r"(src_stride_abgr), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +void RGBAToUVRow_NEON(const uint8_t* src_rgba, + int src_stride_rgba, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile ( + "add %1, %0, %1 \n" // src_stride + src_rgba + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 RGBA pixels. + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 RGBA pixels. + "vpaddl.u8 q0, q1 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q2 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q2, q3 \n" // R 16 bytes -> 8 shorts. + "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more RGBA pixels. + "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 RGBA pixels. + "vpadal.u8 q0, q5 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q6 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q2, q7 \n" // R 16 bytes -> 8 shorts. + + "vrshr.u16 q0, q0, #1 \n" // 2x average + "vrshr.u16 q1, q1, #1 \n" + "vrshr.u16 q2, q2, #1 \n" + + "subs %4, %4, #16 \n" // 32 processed per loop. + RGBTOUV(q0, q1, q2) + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. 
+ "bgt 1b \n" + : "+r"(src_rgba), // %0 + "+r"(src_stride_rgba), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +void RGB24ToUVRow_NEON(const uint8_t* src_rgb24, + int src_stride_rgb24, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile ( + "add %1, %0, %1 \n" // src_stride + src_rgb24 + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + "1: \n" + "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB24 pixels. + "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RGB24 pixels. + "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. + "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RGB24 pixels. + "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RGB24 pixels. + "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts. + + "vrshr.u16 q0, q0, #1 \n" // 2x average + "vrshr.u16 q1, q1, #1 \n" + "vrshr.u16 q2, q2, #1 \n" + + "subs %4, %4, #16 \n" // 32 processed per loop. + RGBTOUV(q0, q1, q2) + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_rgb24), // %0 + "+r"(src_stride_rgb24), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +void RAWToUVRow_NEON(const uint8_t* src_raw, + int src_stride_raw, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile ( + "add %1, %0, %1 \n" // src_stride + src_raw + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + "1: \n" + "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RAW pixels. + "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RAW pixels. + "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts. + "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RAW pixels. + "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RAW pixels. + "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts. + + "vrshr.u16 q0, q0, #1 \n" // 2x average + "vrshr.u16 q1, q1, #1 \n" + "vrshr.u16 q2, q2, #1 \n" + + "subs %4, %4, #16 \n" // 32 processed per loop. + RGBTOUV(q2, q1, q0) + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_raw), // %0 + "+r"(src_stride_raw), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. 
+void RGB565ToUVRow_NEON(const uint8_t* src_rgb565, + int src_stride_rgb565, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "add %1, %0, %1 \n" // src_stride + src_argb + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 + // coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. + RGB565TOARGB + "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. + "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. + "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + "vld1.8 {q0}, [%0]! \n" // next 8 RGB565 pixels. + RGB565TOARGB + "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. + "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. + "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + + "vld1.8 {q0}, [%1]! \n" // load 8 RGB565 pixels. + RGB565TOARGB + "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. + "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. + "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + "vld1.8 {q0}, [%1]! \n" // next 8 RGB565 pixels. + RGB565TOARGB + "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. + "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. + "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + + "vrshr.u16 q4, q4, #1 \n" // 2x average + "vrshr.u16 q5, q5, #1 \n" + "vrshr.u16 q6, q6, #1 \n" + + "subs %4, %4, #16 \n" // 16 processed per loop. + "vmul.s16 q8, q4, q10 \n" // B + "vmls.s16 q8, q5, q11 \n" // G + "vmls.s16 q8, q6, q12 \n" // R + "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned + "vmul.s16 q9, q6, q10 \n" // R + "vmls.s16 q9, q5, q14 \n" // G + "vmls.s16 q9, q4, q13 \n" // B + "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned + "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U + "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_rgb565), // %0 + "+r"(src_stride_rgb565), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", + "q9", "q10", "q11", "q12", "q13", "q14", "q15"); +} + +// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. +void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555, + int src_stride_argb1555, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "add %1, %0, %1 \n" // src_stride + src_argb + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 + // coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. + RGB555TOARGB + "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. + "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. + "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + "vld1.8 {q0}, [%0]! \n" // next 8 ARGB1555 pixels. + RGB555TOARGB + "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. + "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. + "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + + "vld1.8 {q0}, [%1]! \n" // load 8 ARGB1555 pixels. + RGB555TOARGB + "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. + "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. 
+ "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + "vld1.8 {q0}, [%1]! \n" // next 8 ARGB1555 pixels. + RGB555TOARGB + "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. + "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. + "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + + "vrshr.u16 q4, q4, #1 \n" // 2x average + "vrshr.u16 q5, q5, #1 \n" + "vrshr.u16 q6, q6, #1 \n" + + "subs %4, %4, #16 \n" // 16 processed per loop. + "vmul.s16 q8, q4, q10 \n" // B + "vmls.s16 q8, q5, q11 \n" // G + "vmls.s16 q8, q6, q12 \n" // R + "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned + "vmul.s16 q9, q6, q10 \n" // R + "vmls.s16 q9, q5, q14 \n" // G + "vmls.s16 q9, q4, q13 \n" // B + "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned + "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U + "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_argb1555), // %0 + "+r"(src_stride_argb1555), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", + "q9", "q10", "q11", "q12", "q13", "q14", "q15"); +} + +// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. +void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444, + int src_stride_argb4444, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "add %1, %0, %1 \n" // src_stride + src_argb + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 + // coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. + ARGB4444TOARGB + "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. + "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. + "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + "vld1.8 {q0}, [%0]! \n" // next 8 ARGB4444 pixels. + ARGB4444TOARGB + "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. + "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. + "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + + "vld1.8 {q0}, [%1]! \n" // load 8 ARGB4444 pixels. + ARGB4444TOARGB + "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. + "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. + "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + "vld1.8 {q0}, [%1]! \n" // next 8 ARGB4444 pixels. + ARGB4444TOARGB + "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. + "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. + "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + + "vrshr.u16 q4, q4, #1 \n" // 2x average + "vrshr.u16 q5, q5, #1 \n" + "vrshr.u16 q6, q6, #1 \n" + + "subs %4, %4, #16 \n" // 16 processed per loop. + "vmul.s16 q8, q4, q10 \n" // B + "vmls.s16 q8, q5, q11 \n" // G + "vmls.s16 q8, q6, q12 \n" // R + "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned + "vmul.s16 q9, q6, q10 \n" // R + "vmls.s16 q9, q5, q14 \n" // G + "vmls.s16 q9, q4, q13 \n" // B + "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned + "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U + "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. 
+ "bgt 1b \n" + : "+r"(src_argb4444), // %0 + "+r"(src_stride_argb4444), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", + "q9", "q10", "q11", "q12", "q13", "q14", "q15"); +} + +void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) { + asm volatile( + "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient + "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient + "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d27, #16 \n" // Add 16 constant + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + RGB565TOARGB + "vmull.u8 q2, d0, d24 \n" // B + "vmlal.u8 q2, d1, d25 \n" // G + "vmlal.u8 q2, d2, d26 \n" // R + "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d27 \n" + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_rgb565), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"); +} + +void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555, + uint8_t* dst_y, + int width) { + asm volatile( + "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient + "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient + "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d27, #16 \n" // Add 16 constant + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + ARGB1555TOARGB + "vmull.u8 q2, d0, d24 \n" // B + "vmlal.u8 q2, d1, d25 \n" // G + "vmlal.u8 q2, d2, d26 \n" // R + "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d27 \n" + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_argb1555), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"); +} + +void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444, + uint8_t* dst_y, + int width) { + asm volatile( + "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient + "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient + "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d27, #16 \n" // Add 16 constant + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + ARGB4444TOARGB + "vmull.u8 q2, d0, d24 \n" // B + "vmlal.u8 q2, d1, d25 \n" // G + "vmlal.u8 q2, d2, d26 \n" // R + "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d27 \n" + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_argb4444), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"); +} + +void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) { + asm volatile( + "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient + "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient + "vmov.u8 d7, #16 \n" // Add 16 constant + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of BGRA. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q8, d1, d4 \n" // R + "vmlal.u8 q8, d2, d5 \n" // G + "vmlal.u8 q8, d3, d6 \n" // B + "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d7 \n" + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. 
+ "bgt 1b \n" + : "+r"(src_bgra), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"); +} + +void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) { + asm volatile( + "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient + "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient + "vmov.u8 d7, #16 \n" // Add 16 constant + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ABGR. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q8, d0, d4 \n" // R + "vmlal.u8 q8, d1, d5 \n" // G + "vmlal.u8 q8, d2, d6 \n" // B + "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d7 \n" + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_abgr), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"); +} + +void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) { + asm volatile( + "vmov.u8 d4, #13 \n" // B * 0.1016 coefficient + "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient + "vmov.u8 d6, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d7, #16 \n" // Add 16 constant + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of RGBA. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q8, d1, d4 \n" // B + "vmlal.u8 q8, d2, d5 \n" // G + "vmlal.u8 q8, d3, d6 \n" // R + "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d7 \n" + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_rgba), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"); +} + +void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) { + asm volatile( + "vmov.u8 d4, #13 \n" // B * 0.1016 coefficient + "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient + "vmov.u8 d6, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d7, #16 \n" // Add 16 constant + "1: \n" + "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RGB24. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q8, d0, d4 \n" // B + "vmlal.u8 q8, d1, d5 \n" // G + "vmlal.u8 q8, d2, d6 \n" // R + "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d7 \n" + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_rgb24), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"); +} + +void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) { + asm volatile( + "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient + "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient + "vmov.u8 d7, #16 \n" // Add 16 constant + "1: \n" + "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RAW. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q8, d0, d4 \n" // B + "vmlal.u8 q8, d1, d5 \n" // G + "vmlal.u8 q8, d2, d6 \n" // R + "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d7 \n" + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. 
+ "bgt 1b \n" + : "+r"(src_raw), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"); +} + +// Bilinear filter 16x2 -> 16x1 +void InterpolateRow_NEON(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride, + int dst_width, + int source_y_fraction) { + int y1_fraction = source_y_fraction; + asm volatile( + "cmp %4, #0 \n" + "beq 100f \n" + "add %2, %1 \n" + "cmp %4, #128 \n" + "beq 50f \n" + + "vdup.8 d5, %4 \n" + "rsb %4, #256 \n" + "vdup.8 d4, %4 \n" + // General purpose row blend. + "1: \n" + "vld1.8 {q0}, [%1]! \n" + "vld1.8 {q1}, [%2]! \n" + "subs %3, %3, #16 \n" + "vmull.u8 q13, d0, d4 \n" + "vmull.u8 q14, d1, d4 \n" + "vmlal.u8 q13, d2, d5 \n" + "vmlal.u8 q14, d3, d5 \n" + "vrshrn.u16 d0, q13, #8 \n" + "vrshrn.u16 d1, q14, #8 \n" + "vst1.8 {q0}, [%0]! \n" + "bgt 1b \n" + "b 99f \n" + + // Blend 50 / 50. + "50: \n" + "vld1.8 {q0}, [%1]! \n" + "vld1.8 {q1}, [%2]! \n" + "subs %3, %3, #16 \n" + "vrhadd.u8 q0, q1 \n" + "vst1.8 {q0}, [%0]! \n" + "bgt 50b \n" + "b 99f \n" + + // Blend 100 / 0 - Copy row unchanged. + "100: \n" + "vld1.8 {q0}, [%1]! \n" + "subs %3, %3, #16 \n" + "vst1.8 {q0}, [%0]! \n" + "bgt 100b \n" + + "99: \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(src_stride), // %2 + "+r"(dst_width), // %3 + "+r"(y1_fraction) // %4 + : + : "cc", "memory", "q0", "q1", "d4", "d5", "q13", "q14"); +} + +// dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr +void ARGBBlendRow_NEON(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + asm volatile( + "subs %3, #8 \n" + "blt 89f \n" + // Blend 8 pixels. + "8: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB0. + "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 pixels of ARGB1. + "subs %3, %3, #8 \n" // 8 processed per loop. + "vmull.u8 q10, d4, d3 \n" // db * a + "vmull.u8 q11, d5, d3 \n" // dg * a + "vmull.u8 q12, d6, d3 \n" // dr * a + "vqrshrn.u16 d20, q10, #8 \n" // db >>= 8 + "vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8 + "vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8 + "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256 + "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256 + "vqadd.u8 q0, q0, q2 \n" // + sbg + "vqadd.u8 d2, d2, d6 \n" // + sr + "vmov.u8 d3, #255 \n" // a = 255 + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 pixels of ARGB. + "bge 8b \n" + + "89: \n" + "adds %3, #8-1 \n" + "blt 99f \n" + + // Blend 1 pixels. + "1: \n" + "vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [%0]! \n" // load 1 pixel ARGB0. + "vld4.8 {d4[0],d5[0],d6[0],d7[0]}, [%1]! \n" // load 1 pixel ARGB1. + "subs %3, %3, #1 \n" // 1 processed per loop. + "vmull.u8 q10, d4, d3 \n" // db * a + "vmull.u8 q11, d5, d3 \n" // dg * a + "vmull.u8 q12, d6, d3 \n" // dr * a + "vqrshrn.u16 d20, q10, #8 \n" // db >>= 8 + "vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8 + "vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8 + "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256 + "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256 + "vqadd.u8 q0, q0, q2 \n" // + sbg + "vqadd.u8 d2, d2, d6 \n" // + sr + "vmov.u8 d3, #255 \n" // a = 255 + "vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [%2]! \n" // store 1 pixel. + "bge 1b \n" + + "99: \n" + + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q10", "q11", "q12"); +} + +// Attenuate 8 pixels at a time. +void ARGBAttenuateRow_NEON(const uint8_t* src_argb, + uint8_t* dst_argb, + int width) { + asm volatile( + // Attenuate 8 pixels. 
+ "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q10, d0, d3 \n" // b * a + "vmull.u8 q11, d1, d3 \n" // g * a + "vmull.u8 q12, d2, d3 \n" // r * a + "vqrshrn.u16 d0, q10, #8 \n" // b >>= 8 + "vqrshrn.u16 d1, q11, #8 \n" // g >>= 8 + "vqrshrn.u16 d2, q12, #8 \n" // r >>= 8 + "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1", "q10", "q11", "q12"); +} + +// Quantize 8 ARGB pixels (32 bytes). +// dst = (dst * scale >> 16) * interval_size + interval_offset; +void ARGBQuantizeRow_NEON(uint8_t* dst_argb, + int scale, + int interval_size, + int interval_offset, + int width) { + asm volatile( + "vdup.u16 q8, %2 \n" + "vshr.u16 q8, q8, #1 \n" // scale >>= 1 + "vdup.u16 q9, %3 \n" // interval multiply. + "vdup.u16 q10, %4 \n" // interval add + + // 8 pixel loop. + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0] \n" // load 8 pixels of ARGB. + "subs %1, %1, #8 \n" // 8 processed per loop. + "vmovl.u8 q0, d0 \n" // b (0 .. 255) + "vmovl.u8 q1, d2 \n" + "vmovl.u8 q2, d4 \n" + "vqdmulh.s16 q0, q0, q8 \n" // b * scale + "vqdmulh.s16 q1, q1, q8 \n" // g + "vqdmulh.s16 q2, q2, q8 \n" // r + "vmul.u16 q0, q0, q9 \n" // b * interval_size + "vmul.u16 q1, q1, q9 \n" // g + "vmul.u16 q2, q2, q9 \n" // r + "vadd.u16 q0, q0, q10 \n" // b + interval_offset + "vadd.u16 q1, q1, q10 \n" // g + "vadd.u16 q2, q2, q10 \n" // r + "vqmovn.u16 d0, q0 \n" + "vqmovn.u16 d2, q1 \n" + "vqmovn.u16 d4, q2 \n" + "vst4.8 {d0, d2, d4, d6}, [%0]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" + : "+r"(dst_argb), // %0 + "+r"(width) // %1 + : "r"(scale), // %2 + "r"(interval_size), // %3 + "r"(interval_offset) // %4 + : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10"); +} + +// Shade 8 pixels at a time by specified value. +// NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scaler register from 0 to 8. +// Rounding in vqrdmulh does +1 to high if high bit of low s16 is set. +void ARGBShadeRow_NEON(const uint8_t* src_argb, + uint8_t* dst_argb, + int width, + uint32_t value) { + asm volatile( + "vdup.u32 q0, %3 \n" // duplicate scale value. + "vzip.u8 d0, d1 \n" // d0 aarrggbb. + "vshr.u16 q0, q0, #1 \n" // scale / 2. + + // 8 pixel loop. + "1: \n" + "vld4.8 {d20, d22, d24, d26}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmovl.u8 q10, d20 \n" // b (0 .. 255) + "vmovl.u8 q11, d22 \n" + "vmovl.u8 q12, d24 \n" + "vmovl.u8 q13, d26 \n" + "vqrdmulh.s16 q10, q10, d0[0] \n" // b * scale * 2 + "vqrdmulh.s16 q11, q11, d0[1] \n" // g + "vqrdmulh.s16 q12, q12, d0[2] \n" // r + "vqrdmulh.s16 q13, q13, d0[3] \n" // a + "vqmovn.u16 d20, q10 \n" + "vqmovn.u16 d22, q11 \n" + "vqmovn.u16 d24, q12 \n" + "vqmovn.u16 d26, q13 \n" + "vst4.8 {d20, d22, d24, d26}, [%1]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(value) // %3 + : "cc", "memory", "q0", "q10", "q11", "q12", "q13"); +} + +// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels +// Similar to ARGBToYJ but stores ARGB. +// C code is (15 * b + 75 * g + 38 * r + 64) >> 7; +void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) { + asm volatile( + "vmov.u8 d24, #15 \n" // B * 0.11400 coefficient + "vmov.u8 d25, #75 \n" // G * 0.58700 coefficient + "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! 
\n" // load 8 ARGB pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q2, d0, d24 \n" // B + "vmlal.u8 q2, d1, d25 \n" // G + "vmlal.u8 q2, d2, d26 \n" // R + "vqrshrun.s16 d0, q2, #7 \n" // 15 bit to 8 bit B + "vmov d1, d0 \n" // G + "vmov d2, d0 \n" // R + "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 ARGB pixels. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q12", "q13"); +} + +// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. +// b = (r * 35 + g * 68 + b * 17) >> 7 +// g = (r * 45 + g * 88 + b * 22) >> 7 +// r = (r * 50 + g * 98 + b * 24) >> 7 +void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width) { + asm volatile( + "vmov.u8 d20, #17 \n" // BB coefficient + "vmov.u8 d21, #68 \n" // BG coefficient + "vmov.u8 d22, #35 \n" // BR coefficient + "vmov.u8 d24, #22 \n" // GB coefficient + "vmov.u8 d25, #88 \n" // GG coefficient + "vmov.u8 d26, #45 \n" // GR coefficient + "vmov.u8 d28, #24 \n" // BB coefficient + "vmov.u8 d29, #98 \n" // BG coefficient + "vmov.u8 d30, #50 \n" // BR coefficient + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0] \n" // load 8 ARGB pixels. + "subs %1, %1, #8 \n" // 8 processed per loop. + "vmull.u8 q2, d0, d20 \n" // B to Sepia B + "vmlal.u8 q2, d1, d21 \n" // G + "vmlal.u8 q2, d2, d22 \n" // R + "vmull.u8 q3, d0, d24 \n" // B to Sepia G + "vmlal.u8 q3, d1, d25 \n" // G + "vmlal.u8 q3, d2, d26 \n" // R + "vmull.u8 q8, d0, d28 \n" // B to Sepia R + "vmlal.u8 q8, d1, d29 \n" // G + "vmlal.u8 q8, d2, d30 \n" // R + "vqshrn.u16 d0, q2, #7 \n" // 16 bit to 8 bit B + "vqshrn.u16 d1, q3, #7 \n" // 16 bit to 8 bit G + "vqshrn.u16 d2, q8, #7 \n" // 16 bit to 8 bit R + "vst4.8 {d0, d1, d2, d3}, [%0]! \n" // store 8 ARGB pixels. + "bgt 1b \n" + : "+r"(dst_argb), // %0 + "+r"(width) // %1 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q10", "q11", "q12", "q13", + "q14", "q15"); +} + +// Tranform 8 ARGB pixels (32 bytes) with color matrix. +// TODO(fbarchard): Was same as Sepia except matrix is provided. This function +// needs to saturate. Consider doing a non-saturating version. +void ARGBColorMatrixRow_NEON(const uint8_t* src_argb, + uint8_t* dst_argb, + const int8_t* matrix_argb, + int width) { + asm volatile( + "vld1.8 {q2}, [%3] \n" // load 3 ARGB vectors. + "vmovl.s8 q0, d4 \n" // B,G coefficients s16. + "vmovl.s8 q1, d5 \n" // R,A coefficients s16. + + "1: \n" + "vld4.8 {d16, d18, d20, d22}, [%0]! \n" // load 8 ARGB pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmovl.u8 q8, d16 \n" // b (0 .. 
255) 16 bit + "vmovl.u8 q9, d18 \n" // g + "vmovl.u8 q10, d20 \n" // r + "vmovl.u8 q11, d22 \n" // a + "vmul.s16 q12, q8, d0[0] \n" // B = B * Matrix B + "vmul.s16 q13, q8, d1[0] \n" // G = B * Matrix G + "vmul.s16 q14, q8, d2[0] \n" // R = B * Matrix R + "vmul.s16 q15, q8, d3[0] \n" // A = B * Matrix A + "vmul.s16 q4, q9, d0[1] \n" // B += G * Matrix B + "vmul.s16 q5, q9, d1[1] \n" // G += G * Matrix G + "vmul.s16 q6, q9, d2[1] \n" // R += G * Matrix R + "vmul.s16 q7, q9, d3[1] \n" // A += G * Matrix A + "vqadd.s16 q12, q12, q4 \n" // Accumulate B + "vqadd.s16 q13, q13, q5 \n" // Accumulate G + "vqadd.s16 q14, q14, q6 \n" // Accumulate R + "vqadd.s16 q15, q15, q7 \n" // Accumulate A + "vmul.s16 q4, q10, d0[2] \n" // B += R * Matrix B + "vmul.s16 q5, q10, d1[2] \n" // G += R * Matrix G + "vmul.s16 q6, q10, d2[2] \n" // R += R * Matrix R + "vmul.s16 q7, q10, d3[2] \n" // A += R * Matrix A + "vqadd.s16 q12, q12, q4 \n" // Accumulate B + "vqadd.s16 q13, q13, q5 \n" // Accumulate G + "vqadd.s16 q14, q14, q6 \n" // Accumulate R + "vqadd.s16 q15, q15, q7 \n" // Accumulate A + "vmul.s16 q4, q11, d0[3] \n" // B += A * Matrix B + "vmul.s16 q5, q11, d1[3] \n" // G += A * Matrix G + "vmul.s16 q6, q11, d2[3] \n" // R += A * Matrix R + "vmul.s16 q7, q11, d3[3] \n" // A += A * Matrix A + "vqadd.s16 q12, q12, q4 \n" // Accumulate B + "vqadd.s16 q13, q13, q5 \n" // Accumulate G + "vqadd.s16 q14, q14, q6 \n" // Accumulate R + "vqadd.s16 q15, q15, q7 \n" // Accumulate A + "vqshrun.s16 d16, q12, #6 \n" // 16 bit to 8 bit B + "vqshrun.s16 d18, q13, #6 \n" // 16 bit to 8 bit G + "vqshrun.s16 d20, q14, #6 \n" // 16 bit to 8 bit R + "vqshrun.s16 d22, q15, #6 \n" // 16 bit to 8 bit A + "vst4.8 {d16, d18, d20, d22}, [%1]! \n" // store 8 ARGB pixels. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(matrix_argb) // %3 + : "cc", "memory", "q0", "q1", "q2", "q4", "q5", "q6", "q7", "q8", "q9", + "q10", "q11", "q12", "q13", "q14", "q15"); +} + +// Multiply 2 rows of ARGB pixels together, 8 pixels at a time. +void ARGBMultiplyRow_NEON(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + asm volatile( + // 8 pixel loop. + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + "vld4.8 {d1, d3, d5, d7}, [%1]! \n" // load 8 more ARGB + "subs %3, %3, #8 \n" // 8 processed per loop. + "vmull.u8 q0, d0, d1 \n" // multiply B + "vmull.u8 q1, d2, d3 \n" // multiply G + "vmull.u8 q2, d4, d5 \n" // multiply R + "vmull.u8 q3, d6, d7 \n" // multiply A + "vrshrn.u16 d0, q0, #8 \n" // 16 bit to 8 bit B + "vrshrn.u16 d1, q1, #8 \n" // 16 bit to 8 bit G + "vrshrn.u16 d2, q2, #8 \n" // 16 bit to 8 bit R + "vrshrn.u16 d3, q3, #8 \n" // 16 bit to 8 bit A + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. + "bgt 1b \n" + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1", "q2", "q3"); +} + +// Add 2 rows of ARGB pixels together, 8 pixels at a time. +void ARGBAddRow_NEON(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + asm volatile( + // 8 pixel loop. + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. + "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB + "subs %3, %3, #8 \n" // 8 processed per loop. + "vqadd.u8 q0, q0, q2 \n" // add B, G + "vqadd.u8 q1, q1, q3 \n" // add R, A + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. 
+ "bgt 1b \n" + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1", "q2", "q3"); +} + +// Subtract 2 rows of ARGB pixels, 8 pixels at a time. +void ARGBSubtractRow_NEON(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + asm volatile( + // 8 pixel loop. + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. + "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB + "subs %3, %3, #8 \n" // 8 processed per loop. + "vqsub.u8 q0, q0, q2 \n" // subtract B, G + "vqsub.u8 q1, q1, q3 \n" // subtract R, A + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. + "bgt 1b \n" + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1", "q2", "q3"); +} + +// Adds Sobel X and Sobel Y and stores Sobel into ARGB. +// A = 255 +// R = Sobel +// G = Sobel +// B = Sobel +void SobelRow_NEON(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, + int width) { + asm volatile( + "vmov.u8 d3, #255 \n" // alpha + // 8 pixel loop. + "1: \n" + "vld1.8 {d0}, [%0]! \n" // load 8 sobelx. + "vld1.8 {d1}, [%1]! \n" // load 8 sobely. + "subs %3, %3, #8 \n" // 8 processed per loop. + "vqadd.u8 d0, d0, d1 \n" // add + "vmov.u8 d1, d0 \n" + "vmov.u8 d2, d0 \n" + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. + "bgt 1b \n" + : "+r"(src_sobelx), // %0 + "+r"(src_sobely), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1"); +} + +// Adds Sobel X and Sobel Y and stores Sobel into plane. +void SobelToPlaneRow_NEON(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_y, + int width) { + asm volatile( + // 16 pixel loop. + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load 16 sobelx. + "vld1.8 {q1}, [%1]! \n" // load 16 sobely. + "subs %3, %3, #16 \n" // 16 processed per loop. + "vqadd.u8 q0, q0, q1 \n" // add + "vst1.8 {q0}, [%2]! \n" // store 16 pixels. + "bgt 1b \n" + : "+r"(src_sobelx), // %0 + "+r"(src_sobely), // %1 + "+r"(dst_y), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1"); +} + +// Mixes Sobel X, Sobel Y and Sobel into ARGB. +// A = 255 +// R = Sobel X +// G = Sobel +// B = Sobel Y +void SobelXYRow_NEON(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, + int width) { + asm volatile( + "vmov.u8 d3, #255 \n" // alpha + // 8 pixel loop. + "1: \n" + "vld1.8 {d2}, [%0]! \n" // load 8 sobelx. + "vld1.8 {d0}, [%1]! \n" // load 8 sobely. + "subs %3, %3, #8 \n" // 8 processed per loop. + "vqadd.u8 d1, d0, d2 \n" // add + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. + "bgt 1b \n" + : "+r"(src_sobelx), // %0 + "+r"(src_sobely), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1"); +} + +// SobelX as a matrix is +// -1 0 1 +// -2 0 2 +// -1 0 1 +void SobelXRow_NEON(const uint8_t* src_y0, + const uint8_t* src_y1, + const uint8_t* src_y2, + uint8_t* dst_sobelx, + int width) { + asm volatile( + "1: \n" + "vld1.8 {d0}, [%0],%5 \n" // top + "vld1.8 {d1}, [%0],%6 \n" + "vsubl.u8 q0, d0, d1 \n" + "vld1.8 {d2}, [%1],%5 \n" // center * 2 + "vld1.8 {d3}, [%1],%6 \n" + "vsubl.u8 q1, d2, d3 \n" + "vadd.s16 q0, q0, q1 \n" + "vadd.s16 q0, q0, q1 \n" + "vld1.8 {d2}, [%2],%5 \n" // bottom + "vld1.8 {d3}, [%2],%6 \n" + "subs %4, %4, #8 \n" // 8 pixels + "vsubl.u8 q1, d2, d3 \n" + "vadd.s16 q0, q0, q1 \n" + "vabs.s16 q0, q0 \n" + "vqmovn.u16 d0, q0 \n" + "vst1.8 {d0}, [%3]! 
\n" // store 8 sobelx + "bgt 1b \n" + : "+r"(src_y0), // %0 + "+r"(src_y1), // %1 + "+r"(src_y2), // %2 + "+r"(dst_sobelx), // %3 + "+r"(width) // %4 + : "r"(2), // %5 + "r"(6) // %6 + : "cc", "memory", "q0", "q1" // Clobber List + ); +} + +// SobelY as a matrix is +// -1 -2 -1 +// 0 0 0 +// 1 2 1 +void SobelYRow_NEON(const uint8_t* src_y0, + const uint8_t* src_y1, + uint8_t* dst_sobely, + int width) { + asm volatile( + "1: \n" + "vld1.8 {d0}, [%0],%4 \n" // left + "vld1.8 {d1}, [%1],%4 \n" + "vsubl.u8 q0, d0, d1 \n" + "vld1.8 {d2}, [%0],%4 \n" // center * 2 + "vld1.8 {d3}, [%1],%4 \n" + "vsubl.u8 q1, d2, d3 \n" + "vadd.s16 q0, q0, q1 \n" + "vadd.s16 q0, q0, q1 \n" + "vld1.8 {d2}, [%0],%5 \n" // right + "vld1.8 {d3}, [%1],%5 \n" + "subs %3, %3, #8 \n" // 8 pixels + "vsubl.u8 q1, d2, d3 \n" + "vadd.s16 q0, q0, q1 \n" + "vabs.s16 q0, q0 \n" + "vqmovn.u16 d0, q0 \n" + "vst1.8 {d0}, [%2]! \n" // store 8 sobely + "bgt 1b \n" + : "+r"(src_y0), // %0 + "+r"(src_y1), // %1 + "+r"(dst_sobely), // %2 + "+r"(width) // %3 + : "r"(1), // %4 + "r"(6) // %5 + : "cc", "memory", "q0", "q1" // Clobber List + ); +} + +void HalfFloat1Row_NEON(const uint16_t* src, + uint16_t* dst, + float /*unused*/, + int width) { + asm volatile( + "vdup.32 q0, %3 \n" + + "1: \n" + "vld1.8 {q1}, [%0]! \n" // load 8 shorts + "subs %2, %2, #8 \n" // 8 pixels per loop + "vmovl.u16 q2, d2 \n" // 8 int's + "vmovl.u16 q3, d3 \n" + "vcvt.f32.u32 q2, q2 \n" // 8 floats + "vcvt.f32.u32 q3, q3 \n" + "vmul.f32 q2, q2, q0 \n" // adjust exponent + "vmul.f32 q3, q3, q0 \n" + "vqshrn.u32 d2, q2, #13 \n" // isolate halffloat + "vqshrn.u32 d3, q3, #13 \n" + "vst1.8 {q1}, [%1]! \n" + "bgt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "r"(1.9259299444e-34f) // %3 + : "cc", "memory", "q0", "q1", "q2", "q3"); +} + +// TODO(fbarchard): multiply by element. +void HalfFloatRow_NEON(const uint16_t* src, + uint16_t* dst, + float scale, + int width) { + asm volatile( + "vdup.32 q0, %3 \n" + + "1: \n" + "vld1.8 {q1}, [%0]! \n" // load 8 shorts + "subs %2, %2, #8 \n" // 8 pixels per loop + "vmovl.u16 q2, d2 \n" // 8 int's + "vmovl.u16 q3, d3 \n" + "vcvt.f32.u32 q2, q2 \n" // 8 floats + "vcvt.f32.u32 q3, q3 \n" + "vmul.f32 q2, q2, q0 \n" // adjust exponent + "vmul.f32 q3, q3, q0 \n" + "vqshrn.u32 d2, q2, #13 \n" // isolate halffloat + "vqshrn.u32 d3, q3, #13 \n" + "vst1.8 {q1}, [%1]! \n" + "bgt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "r"(scale * 1.9259299444e-34f) // %3 + : "cc", "memory", "q0", "q1", "q2", "q3"); +} + +void ByteToFloatRow_NEON(const uint8_t* src, + float* dst, + float scale, + int width) { + asm volatile( + "vdup.32 q0, %3 \n" + + "1: \n" + "vld1.8 {d2}, [%0]! \n" // load 8 bytes + "subs %2, %2, #8 \n" // 8 pixels per loop + "vmovl.u8 q1, d2 \n" // 8 shorts + "vmovl.u16 q2, d2 \n" // 8 ints + "vmovl.u16 q3, d3 \n" + "vcvt.f32.u32 q2, q2 \n" // 8 floats + "vcvt.f32.u32 q3, q3 \n" + "vmul.f32 q2, q2, d0[0] \n" // scale + "vmul.f32 q3, q3, d0[0] \n" + "vst1.8 {q2, q3}, [%1]! \n" // store 8 floats + "bgt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "r"(scale) // %3 + : "cc", "memory", "q0", "q1", "q2", "q3"); +} + +#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__).. 
+ +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/media/libyuv/libyuv/source/row_neon64.cc b/media/libyuv/libyuv/source/row_neon64.cc new file mode 100644 index 0000000000..24b4520bab --- /dev/null +++ b/media/libyuv/libyuv/source/row_neon64.cc @@ -0,0 +1,2884 @@ +/* + * Copyright 2014 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// This module is for GCC Neon armv8 64 bit. +#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) + +// Read 8 Y, 4 U and 4 V from 422 +#define READYUV422 \ + "ld1 {v0.8b}, [%0], #8 \n" \ + "ld1 {v1.s}[0], [%1], #4 \n" \ + "ld1 {v1.s}[1], [%2], #4 \n" + +// Read 8 Y, 8 U and 8 V from 444 +#define READYUV444 \ + "ld1 {v0.8b}, [%0], #8 \n" \ + "ld1 {v1.d}[0], [%1], #8 \n" \ + "ld1 {v1.d}[1], [%2], #8 \n" \ + "uaddlp v1.8h, v1.16b \n" \ + "rshrn v1.8b, v1.8h, #1 \n" + +// Read 8 Y, and set 4 U and 4 V to 128 +#define READYUV400 \ + "ld1 {v0.8b}, [%0], #8 \n" \ + "movi v1.8b , #128 \n" + +// Read 8 Y and 4 UV from NV12 +#define READNV12 \ + "ld1 {v0.8b}, [%0], #8 \n" \ + "ld1 {v2.8b}, [%1], #8 \n" \ + "uzp1 v1.8b, v2.8b, v2.8b \n" \ + "uzp2 v3.8b, v2.8b, v2.8b \n" \ + "ins v1.s[1], v3.s[0] \n" + +// Read 8 Y and 4 VU from NV21 +#define READNV21 \ + "ld1 {v0.8b}, [%0], #8 \n" \ + "ld1 {v2.8b}, [%1], #8 \n" \ + "uzp1 v3.8b, v2.8b, v2.8b \n" \ + "uzp2 v1.8b, v2.8b, v2.8b \n" \ + "ins v1.s[1], v3.s[0] \n" + +// Read 8 YUY2 +#define READYUY2 \ + "ld2 {v0.8b, v1.8b}, [%0], #16 \n" \ + "uzp2 v3.8b, v1.8b, v1.8b \n" \ + "uzp1 v1.8b, v1.8b, v1.8b \n" \ + "ins v1.s[1], v3.s[0] \n" + +// Read 8 UYVY +#define READUYVY \ + "ld2 {v2.8b, v3.8b}, [%0], #16 \n" \ + "orr v0.8b, v3.8b, v3.8b \n" \ + "uzp1 v1.8b, v2.8b, v2.8b \n" \ + "uzp2 v3.8b, v2.8b, v2.8b \n" \ + "ins v1.s[1], v3.s[0] \n" + +#define YUVTORGB_SETUP \ + "ld1r {v24.8h}, [%[kUVBiasBGR]], #2 \n" \ + "ld1r {v25.8h}, [%[kUVBiasBGR]], #2 \n" \ + "ld1r {v26.8h}, [%[kUVBiasBGR]] \n" \ + "ld1r {v31.4s}, [%[kYToRgb]] \n" \ + "ld2 {v27.8h, v28.8h}, [%[kUVToRB]] \n" \ + "ld2 {v29.8h, v30.8h}, [%[kUVToG]] \n" + +#define YUVTORGB(vR, vG, vB) \ + "uxtl v0.8h, v0.8b \n" /* Extract Y */ \ + "shll v2.8h, v1.8b, #8 \n" /* Replicate UV */ \ + "ushll2 v3.4s, v0.8h, #0 \n" /* Y */ \ + "ushll v0.4s, v0.4h, #0 \n" \ + "mul v3.4s, v3.4s, v31.4s \n" \ + "mul v0.4s, v0.4s, v31.4s \n" \ + "sqshrun v0.4h, v0.4s, #16 \n" \ + "sqshrun2 v0.8h, v3.4s, #16 \n" /* Y */ \ + "uaddw v1.8h, v2.8h, v1.8b \n" /* Replicate UV */ \ + "mov v2.d[0], v1.d[1] \n" /* Extract V */ \ + "uxtl v2.8h, v2.8b \n" \ + "uxtl v1.8h, v1.8b \n" /* Extract U */ \ + "mul v3.8h, v1.8h, v27.8h \n" \ + "mul v5.8h, v1.8h, v29.8h \n" \ + "mul v6.8h, v2.8h, v30.8h \n" \ + "mul v7.8h, v2.8h, v28.8h \n" \ + "sqadd v6.8h, v6.8h, v5.8h \n" \ + "sqadd " #vB \ + ".8h, v24.8h, v0.8h \n" /* B */ \ + "sqadd " #vG \ + ".8h, v25.8h, v0.8h \n" /* G */ \ + "sqadd " #vR \ + ".8h, v26.8h, v0.8h \n" /* R */ \ + "sqadd " #vB ".8h, " #vB \ + ".8h, v3.8h \n" /* B */ \ + "sqsub " #vG ".8h, " #vG \ + ".8h, v6.8h \n" /* G */ \ + "sqadd " #vR ".8h, " #vR \ + ".8h, v7.8h \n" /* R */ \ + "sqshrun " #vB ".8b, " #vB \ + ".8h, #6 \n" /* B */ \ + 
"sqshrun " #vG ".8b, " #vG \ + ".8h, #6 \n" /* G */ \ + "sqshrun " #vR ".8b, " #vR ".8h, #6 \n" /* R */ + +void I444ToARGBRow_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP + "movi v23.8b, #255 \n" /* A */ + "1: \n" + READYUV444 + YUVTORGB(v22, v21, v20) + "subs %w4, %w4, #8 \n" + "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_argb), // %3 + "+r"(width) // %4 + : [kUVToRB]"r"(&yuvconstants->kUVToRB), + [kUVToG]"r"(&yuvconstants->kUVToG), + [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), + [kYToRgb]"r"(&yuvconstants->kYToRgb) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" + ); +} + +void I422ToARGBRow_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP + "movi v23.8b, #255 \n" /* A */ + "1: \n" + READYUV422 + YUVTORGB(v22, v21, v20) + "subs %w4, %w4, #8 \n" + "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_argb), // %3 + "+r"(width) // %4 + : [kUVToRB]"r"(&yuvconstants->kUVToRB), + [kUVToG]"r"(&yuvconstants->kUVToG), + [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), + [kYToRgb]"r"(&yuvconstants->kYToRgb) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" + ); +} + +void I422AlphaToARGBRow_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + const uint8_t* src_a, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP + "1: \n" + READYUV422 + YUVTORGB(v22, v21, v20) + "ld1 {v23.8b}, [%3], #8 \n" + "subs %w5, %w5, #8 \n" + "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%4], #32 \n" + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(src_a), // %3 + "+r"(dst_argb), // %4 + "+r"(width) // %5 + : [kUVToRB]"r"(&yuvconstants->kUVToRB), + [kUVToG]"r"(&yuvconstants->kUVToG), + [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), + [kYToRgb]"r"(&yuvconstants->kYToRgb) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" + ); +} + +void I422ToRGBARow_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgba, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP + "movi v20.8b, #255 \n" /* A */ + "1: \n" + READYUV422 + YUVTORGB(v23, v22, v21) + "subs %w4, %w4, #8 \n" + "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_rgba), // %3 + "+r"(width) // %4 + : [kUVToRB]"r"(&yuvconstants->kUVToRB), + [kUVToG]"r"(&yuvconstants->kUVToG), + [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), + [kYToRgb]"r"(&yuvconstants->kYToRgb) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" + ); +} + +void I422ToRGB24Row_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb24, + const struct YuvConstants* 
yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP + "1: \n" + READYUV422 + YUVTORGB(v22, v21, v20) + "subs %w4, %w4, #8 \n" + "st3 {v20.8b,v21.8b,v22.8b}, [%3], #24 \n" + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_rgb24), // %3 + "+r"(width) // %4 + : [kUVToRB]"r"(&yuvconstants->kUVToRB), + [kUVToG]"r"(&yuvconstants->kUVToG), + [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), + [kYToRgb]"r"(&yuvconstants->kYToRgb) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" + ); +} + +#define ARGBTORGB565 \ + "shll v0.8h, v22.8b, #8 \n" /* R */ \ + "shll v21.8h, v21.8b, #8 \n" /* G */ \ + "shll v20.8h, v20.8b, #8 \n" /* B */ \ + "sri v0.8h, v21.8h, #5 \n" /* RG */ \ + "sri v0.8h, v20.8h, #11 \n" /* RGB */ + +void I422ToRGB565Row_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb565, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile( + YUVTORGB_SETUP + "1: \n" READYUV422 YUVTORGB( + v22, v21, + v20) "subs %w4, %w4, #8 \n" ARGBTORGB565 + "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels + // RGB565. + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_rgb565), // %3 + "+r"(width) // %4 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"); +} + +#define ARGBTOARGB1555 \ + "shll v0.8h, v23.8b, #8 \n" /* A */ \ + "shll v22.8h, v22.8b, #8 \n" /* R */ \ + "shll v21.8h, v21.8b, #8 \n" /* G */ \ + "shll v20.8h, v20.8b, #8 \n" /* B */ \ + "sri v0.8h, v22.8h, #1 \n" /* AR */ \ + "sri v0.8h, v21.8h, #6 \n" /* ARG */ \ + "sri v0.8h, v20.8h, #11 \n" /* ARGB */ + +void I422ToARGB1555Row_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb1555, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile( + YUVTORGB_SETUP + "movi v23.8b, #255 \n" + "1: \n" READYUV422 YUVTORGB( + v22, v21, + v20) "subs %w4, %w4, #8 \n" ARGBTOARGB1555 + "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels + // RGB565. + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_argb1555), // %3 + "+r"(width) // %4 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"); +} + +#define ARGBTOARGB4444 \ + /* Input v20.8b<=B, v21.8b<=G, v22.8b<=R, v23.8b<=A, v4.8b<=0x0f */ \ + "ushr v20.8b, v20.8b, #4 \n" /* B */ \ + "bic v21.8b, v21.8b, v4.8b \n" /* G */ \ + "ushr v22.8b, v22.8b, #4 \n" /* R */ \ + "bic v23.8b, v23.8b, v4.8b \n" /* A */ \ + "orr v0.8b, v20.8b, v21.8b \n" /* BG */ \ + "orr v1.8b, v22.8b, v23.8b \n" /* RA */ \ + "zip1 v0.16b, v0.16b, v1.16b \n" /* BGRA */ + +void I422ToARGB4444Row_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb4444, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP + "movi v4.16b, #0x0f \n" // bits to clear with vbic. 
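+      // ARGBTOARGB4444 below keeps the top 4 bits of each channel, packing each
+      // pixel as (A >> 4) << 12 | (R >> 4) << 8 | (G >> 4) << 4 | (B >> 4)
+      // (little endian), which the final st1 stores 8 pixels at a time.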
+ "1: \n" + READYUV422 + YUVTORGB(v22, v21, v20) + "subs %w4, %w4, #8 \n" + "movi v23.8b, #255 \n" + ARGBTOARGB4444 + "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels ARGB4444. + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_argb4444), // %3 + "+r"(width) // %4 + : [kUVToRB]"r"(&yuvconstants->kUVToRB), + [kUVToG]"r"(&yuvconstants->kUVToG), + [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), + [kYToRgb]"r"(&yuvconstants->kYToRgb) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" + ); +} + +void I400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) { + asm volatile ( + YUVTORGB_SETUP + "movi v23.8b, #255 \n" + "1: \n" + READYUV400 + YUVTORGB(v22, v21, v20) + "subs %w2, %w2, #8 \n" + "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : [kUVToRB]"r"(&kYuvI601Constants.kUVToRB), + [kUVToG]"r"(&kYuvI601Constants.kUVToG), + [kUVBiasBGR]"r"(&kYuvI601Constants.kUVBiasBGR), + [kYToRgb]"r"(&kYuvI601Constants.kYToRgb) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" + ); +} + +void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) { + asm volatile( + "movi v23.8b, #255 \n" + "1: \n" + "ld1 {v20.8b}, [%0], #8 \n" + "orr v21.8b, v20.8b, v20.8b \n" + "orr v22.8b, v20.8b, v20.8b \n" + "subs %w2, %w2, #8 \n" + "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v20", "v21", "v22", "v23"); +} + +void NV12ToARGBRow_NEON(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP + "movi v23.8b, #255 \n" + "1: \n" + READNV12 + YUVTORGB(v22, v21, v20) + "subs %w3, %w3, #8 \n" + "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n" + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_uv), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : [kUVToRB]"r"(&yuvconstants->kUVToRB), + [kUVToG]"r"(&yuvconstants->kUVToG), + [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), + [kYToRgb]"r"(&yuvconstants->kYToRgb) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" + ); +} + +void NV21ToARGBRow_NEON(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP + "movi v23.8b, #255 \n" + "1: \n" + READNV21 + YUVTORGB(v22, v21, v20) + "subs %w3, %w3, #8 \n" + "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n" + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_vu), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : [kUVToRB]"r"(&yuvconstants->kUVToRB), + [kUVToG]"r"(&yuvconstants->kUVToG), + [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), + [kYToRgb]"r"(&yuvconstants->kYToRgb) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" + ); +} + +void NV12ToRGB24Row_NEON(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP + "1: \n" + READNV12 + YUVTORGB(v22, v21, v20) + "subs %w3, %w3, #8 \n" + "st3 
{v20.8b,v21.8b,v22.8b}, [%2], #24 \n" + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_uv), // %1 + "+r"(dst_rgb24), // %2 + "+r"(width) // %3 + : [kUVToRB]"r"(&yuvconstants->kUVToRB), + [kUVToG]"r"(&yuvconstants->kUVToG), + [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), + [kYToRgb]"r"(&yuvconstants->kYToRgb) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" + ); +} + +void NV21ToRGB24Row_NEON(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP + "1: \n" + READNV21 + YUVTORGB(v22, v21, v20) + "subs %w3, %w3, #8 \n" + "st3 {v20.8b,v21.8b,v22.8b}, [%2], #24 \n" + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_vu), // %1 + "+r"(dst_rgb24), // %2 + "+r"(width) // %3 + : [kUVToRB]"r"(&yuvconstants->kUVToRB), + [kUVToG]"r"(&yuvconstants->kUVToG), + [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), + [kYToRgb]"r"(&yuvconstants->kYToRgb) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" + ); +} + +void NV12ToRGB565Row_NEON(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_rgb565, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile( + YUVTORGB_SETUP + "1: \n" READNV12 YUVTORGB( + v22, v21, + v20) "subs %w3, %w3, #8 \n" ARGBTORGB565 + "st1 {v0.8h}, [%2], 16 \n" // store 8 pixels + // RGB565. + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_uv), // %1 + "+r"(dst_rgb565), // %2 + "+r"(width) // %3 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"); +} + +void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP + "movi v23.8b, #255 \n" + "1: \n" + READYUY2 + YUVTORGB(v22, v21, v20) + "subs %w2, %w2, #8 \n" + "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" + "b.gt 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : [kUVToRB]"r"(&yuvconstants->kUVToRB), + [kUVToG]"r"(&yuvconstants->kUVToG), + [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), + [kYToRgb]"r"(&yuvconstants->kYToRgb) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" + ); +} + +void UYVYToARGBRow_NEON(const uint8_t* src_uyvy, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP + "movi v23.8b, #255 \n" + "1: \n" + READUYVY + YUVTORGB(v22, v21, v20) + "subs %w2, %w2, #8 \n" + "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], 32 \n" + "b.gt 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : [kUVToRB]"r"(&yuvconstants->kUVToRB), + [kUVToG]"r"(&yuvconstants->kUVToG), + [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), + [kYToRgb]"r"(&yuvconstants->kYToRgb) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" + ); +} + +// Reads 16 pairs of UV and write even values to dst_u and odd to dst_v. 
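+// Scalar equivalent, for reference:
+//   dst_u[i] = src_uv[2 * i]; dst_v[i] = src_uv[2 * i + 1];  for i in [0, width)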
+void SplitUVRow_NEON(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "1: \n" + "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pairs of UV + "subs %w3, %w3, #16 \n" // 16 processed per loop + "st1 {v0.16b}, [%1], #16 \n" // store U + "st1 {v1.16b}, [%2], #16 \n" // store V + "b.gt 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 // Output registers + : // Input registers + : "cc", "memory", "v0", "v1" // Clobber List + ); +} + +// Reads 16 U's and V's and writes out 16 pairs of UV. +void MergeUVRow_NEON(const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uv, + int width) { + asm volatile( + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" // load U + "ld1 {v1.16b}, [%1], #16 \n" // load V + "subs %w3, %w3, #16 \n" // 16 processed per loop + "st2 {v0.16b,v1.16b}, [%2], #32 \n" // store 16 pairs of UV + "b.gt 1b \n" + : "+r"(src_u), // %0 + "+r"(src_v), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 // Output registers + : // Input registers + : "cc", "memory", "v0", "v1" // Clobber List + ); +} + +// Reads 16 packed RGB and write to planar dst_r, dst_g, dst_b. +void SplitRGBRow_NEON(const uint8_t* src_rgb, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + int width) { + asm volatile( + "1: \n" + "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 RGB + "subs %w4, %w4, #16 \n" // 16 processed per loop + "st1 {v0.16b}, [%1], #16 \n" // store R + "st1 {v1.16b}, [%2], #16 \n" // store G + "st1 {v2.16b}, [%3], #16 \n" // store B + "b.gt 1b \n" + : "+r"(src_rgb), // %0 + "+r"(dst_r), // %1 + "+r"(dst_g), // %2 + "+r"(dst_b), // %3 + "+r"(width) // %4 + : // Input registers + : "cc", "memory", "v0", "v1", "v2" // Clobber List + ); +} + +// Reads 16 planar R's, G's and B's and writes out 16 packed RGB at a time +void MergeRGBRow_NEON(const uint8_t* src_r, + const uint8_t* src_g, + const uint8_t* src_b, + uint8_t* dst_rgb, + int width) { + asm volatile( + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" // load R + "ld1 {v1.16b}, [%1], #16 \n" // load G + "ld1 {v2.16b}, [%2], #16 \n" // load B + "subs %w4, %w4, #16 \n" // 16 processed per loop + "st3 {v0.16b,v1.16b,v2.16b}, [%3], #48 \n" // store 16 RGB + "b.gt 1b \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(dst_rgb), // %3 + "+r"(width) // %4 + : // Input registers + : "cc", "memory", "v0", "v1", "v2" // Clobber List + ); +} + +// Copy multiple of 32. +void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "1: \n" + "ldp q0, q1, [%0], #32 \n" + "subs %w2, %w2, #32 \n" // 32 processed per loop + "stp q0, q1, [%1], #32 \n" + "b.gt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 // Output registers + : // Input registers + : "cc", "memory", "v0", "v1" // Clobber List + ); +} + +// SetRow writes 'width' bytes using an 8 bit value repeated. 
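+// Behaves like memset(dst, v8, width); note the loop stores 16 bytes per pass,
+// so it may write past 'width' when width is not a multiple of 16.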
+void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) { + asm volatile( + "dup v0.16b, %w2 \n" // duplicate 16 bytes + "1: \n" + "subs %w1, %w1, #16 \n" // 16 bytes per loop + "st1 {v0.16b}, [%0], #16 \n" // store + "b.gt 1b \n" + : "+r"(dst), // %0 + "+r"(width) // %1 + : "r"(v8) // %2 + : "cc", "memory", "v0"); +} + +void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) { + asm volatile( + "dup v0.4s, %w2 \n" // duplicate 4 ints + "1: \n" + "subs %w1, %w1, #4 \n" // 4 ints per loop + "st1 {v0.16b}, [%0], #16 \n" // store + "b.gt 1b \n" + : "+r"(dst), // %0 + "+r"(width) // %1 + : "r"(v32) // %2 + : "cc", "memory", "v0"); +} + +void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + // Start at end of source row. + "add %0, %0, %w2, sxtw \n" + "sub %0, %0, #16 \n" + "1: \n" + "ld1 {v0.16b}, [%0], %3 \n" // src -= 16 + "subs %w2, %w2, #16 \n" // 16 pixels per loop. + "rev64 v0.16b, v0.16b \n" + "st1 {v0.D}[1], [%1], #8 \n" // dst += 16 + "st1 {v0.D}[0], [%1], #8 \n" + "b.gt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "r"((ptrdiff_t)-16) // %3 + : "cc", "memory", "v0"); +} + +void MirrorUVRow_NEON(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + // Start at end of source row. + "add %0, %0, %w3, sxtw #1 \n" + "sub %0, %0, #16 \n" + "1: \n" + "ld2 {v0.8b, v1.8b}, [%0], %4 \n" // src -= 16 + "subs %w3, %w3, #8 \n" // 8 pixels per loop. + "rev64 v0.8b, v0.8b \n" + "rev64 v1.8b, v1.8b \n" + "st1 {v0.8b}, [%1], #8 \n" // dst += 8 + "st1 {v1.8b}, [%2], #8 \n" + "b.gt 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : "r"((ptrdiff_t)-16) // %4 + : "cc", "memory", "v0", "v1"); +} + +void ARGBMirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + // Start at end of source row. + "add %0, %0, %w2, sxtw #2 \n" + "sub %0, %0, #16 \n" + "1: \n" + "ld1 {v0.16b}, [%0], %3 \n" // src -= 16 + "subs %w2, %w2, #4 \n" // 4 pixels per loop. + "rev64 v0.4s, v0.4s \n" + "st1 {v0.D}[1], [%1], #8 \n" // dst += 16 + "st1 {v0.D}[0], [%1], #8 \n" + "b.gt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "r"((ptrdiff_t)-16) // %3 + : "cc", "memory", "v0"); +} + +void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24, + uint8_t* dst_argb, + int width) { + asm volatile( + "movi v4.8b, #255 \n" // Alpha + "1: \n" + "ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24. + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB + "b.gt 1b \n" + : "+r"(src_rgb24), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List + ); +} + +void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) { + asm volatile( + "movi v5.8b, #255 \n" // Alpha + "1: \n" + "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "orr v3.8b, v1.8b, v1.8b \n" // move g + "orr v4.8b, v0.8b, v0.8b \n" // move r + "st4 {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n" // store b g r a + "b.gt 1b \n" + : "+r"(src_raw), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" // Clobber List + ); +} + +void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { + asm volatile( + "1: \n" + "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b + "subs %w2, %w2, #8 \n" // 8 processed per loop. 
+ "orr v3.8b, v1.8b, v1.8b \n" // move g + "orr v4.8b, v0.8b, v0.8b \n" // move r + "st3 {v2.8b,v3.8b,v4.8b}, [%1], #24 \n" // store b g r + "b.gt 1b \n" + : "+r"(src_raw), // %0 + "+r"(dst_rgb24), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List + ); +} + +#define RGB565TOARGB \ + "shrn v6.8b, v0.8h, #5 \n" /* G xxGGGGGG */ \ + "shl v6.8b, v6.8b, #2 \n" /* G GGGGGG00 upper 6 */ \ + "ushr v4.8b, v6.8b, #6 \n" /* G 000000GG lower 2 */ \ + "orr v1.8b, v4.8b, v6.8b \n" /* G */ \ + "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \ + "ushr v0.8h, v0.8h, #11 \n" /* R 000RRRRR */ \ + "xtn2 v2.16b,v0.8h \n" /* R in upper part */ \ + "shl v2.16b, v2.16b, #3 \n" /* R,B BBBBB000 upper 5 */ \ + "ushr v0.16b, v2.16b, #5 \n" /* R,B 00000BBB lower 3 */ \ + "orr v0.16b, v0.16b, v2.16b \n" /* R,B */ \ + "dup v2.2D, v0.D[1] \n" /* R */ + +void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565, + uint8_t* dst_argb, + int width) { + asm volatile( + "movi v3.8b, #255 \n" // Alpha + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. + "subs %w2, %w2, #8 \n" // 8 processed per loop. + RGB565TOARGB + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB + "b.gt 1b \n" + : "+r"(src_rgb565), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6" // Clobber List + ); +} + +#define ARGB1555TOARGB \ + "ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR */ \ + "shl v2.8h, v2.8h, #3 \n" /* R RRRRR000 upper 5 */ \ + "xtn v3.8b, v2.8h \n" /* RRRRR000 AAAAAAAA */ \ + \ + "sshr v2.8h, v0.8h, #15 \n" /* A AAAAAAAA */ \ + "xtn2 v3.16b, v2.8h \n" \ + \ + "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \ + "shrn2 v2.16b,v0.8h, #5 \n" /* G xxxGGGGG */ \ + \ + "ushr v1.16b, v3.16b, #5 \n" /* R,A 00000RRR lower 3 */ \ + "shl v0.16b, v2.16b, #3 \n" /* B,G BBBBB000 upper 5 */ \ + "ushr v2.16b, v0.16b, #5 \n" /* B,G 00000BBB lower 3 */ \ + \ + "orr v0.16b, v0.16b, v2.16b \n" /* B,G */ \ + "orr v2.16b, v1.16b, v3.16b \n" /* R,A */ \ + "dup v1.2D, v0.D[1] \n" \ + "dup v3.2D, v2.D[1] \n" + +// RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha. +#define RGB555TOARGB \ + "ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR */ \ + "shl v2.8h, v2.8h, #3 \n" /* R RRRRR000 upper 5 */ \ + "xtn v3.8b, v2.8h \n" /* RRRRR000 */ \ + \ + "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \ + "shrn2 v2.16b,v0.8h, #5 \n" /* G xxxGGGGG */ \ + \ + "ushr v1.16b, v3.16b, #5 \n" /* R 00000RRR lower 3 */ \ + "shl v0.16b, v2.16b, #3 \n" /* B,G BBBBB000 upper 5 */ \ + "ushr v2.16b, v0.16b, #5 \n" /* B,G 00000BBB lower 3 */ \ + \ + "orr v0.16b, v0.16b, v2.16b \n" /* B,G */ \ + "orr v2.16b, v1.16b, v3.16b \n" /* R */ \ + "dup v1.2D, v0.D[1] \n" /* G */ + +void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555, + uint8_t* dst_argb, + int width) { + asm volatile( + "movi v3.8b, #255 \n" // Alpha + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. + "subs %w2, %w2, #8 \n" // 8 processed per loop. 
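+        // Editorial note: the ARGB1555TOARGB macro below expands each 5-bit channel
+        // to 8 bits as (x << 3) | (x >> 2), and turns the single alpha bit into
+        // 0x00 or 0xFF by arithmetically shifting the lane's sign bit down.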
+ ARGB1555TOARGB + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB + // pixels + "b.gt 1b \n" + : "+r"(src_argb1555), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List + ); +} + +#define ARGB4444TOARGB \ + "shrn v1.8b, v0.8h, #8 \n" /* v1(l) AR */ \ + "xtn2 v1.16b, v0.8h \n" /* v1(h) GB */ \ + "shl v2.16b, v1.16b, #4 \n" /* B,R BBBB0000 */ \ + "ushr v3.16b, v1.16b, #4 \n" /* G,A 0000GGGG */ \ + "ushr v0.16b, v2.16b, #4 \n" /* B,R 0000BBBB */ \ + "shl v1.16b, v3.16b, #4 \n" /* G,A GGGG0000 */ \ + "orr v2.16b, v0.16b, v2.16b \n" /* B,R BBBBBBBB */ \ + "orr v3.16b, v1.16b, v3.16b \n" /* G,A GGGGGGGG */ \ + "dup v0.2D, v2.D[1] \n" \ + "dup v1.2D, v3.D[1] \n" + +void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444, + uint8_t* dst_argb, + int width) { + asm volatile( + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. + "subs %w2, %w2, #8 \n" // 8 processed per loop. + ARGB4444TOARGB + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB + // pixels + "b.gt 1b \n" + : "+r"(src_argb4444), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List + ); +} + +void ARGBToRGB24Row_NEON(const uint8_t* src_argb, + uint8_t* dst_rgb24, + int width) { + asm volatile( + "1: \n" + "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load 8 ARGB + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of + // RGB24. + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_rgb24), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List + ); +} + +void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) { + asm volatile( + "1: \n" + "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "orr v4.8b, v2.8b, v2.8b \n" // mov g + "orr v5.8b, v1.8b, v1.8b \n" // mov b + "st3 {v3.8b,v4.8b,v5.8b}, [%1], #24 \n" // store r g b + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_raw), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v1", "v2", "v3", "v4", "v5" // Clobber List + ); +} + +void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { + asm volatile( + "1: \n" + "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2. + "subs %w2, %w2, #16 \n" // 16 processed per loop. + "st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y. + "b.gt 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1" // Clobber List + ); +} + +void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { + asm volatile( + "1: \n" + "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY. + "subs %w2, %w2, #16 \n" // 16 processed per loop. + "st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y. + "b.gt 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1" // Clobber List + ); +} + +void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2 + "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. + "st1 {v1.8b}, [%1], #8 \n" // store 8 U. + "st1 {v3.8b}, [%2], #8 \n" // store 8 V. 
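+        // Editorial note: YUY2 memory order is Y0 U Y1 V, so after the de-interleaving
+        // ld4 above the U bytes sit in v1 and the V bytes in v3 (UYVY, handled below,
+        // is U Y0 V Y1, so its UV422 variant stores indices 0 and 2 instead).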
+ "b.gt 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List + ); +} + +void UYVYToUV422Row_NEON(const uint8_t* src_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY + "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. + "st1 {v0.8b}, [%1], #8 \n" // store 8 U. + "st1 {v2.8b}, [%2], #8 \n" // store 8 V. + "b.gt 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List + ); +} + +void YUY2ToUVRow_NEON(const uint8_t* src_yuy2, + int stride_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* src_yuy2b = src_yuy2 + stride_yuy2; + asm volatile( + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels + "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs. + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row + "urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U + "urhadd v3.8b, v3.8b, v7.8b \n" // average rows of V + "st1 {v1.8b}, [%2], #8 \n" // store 8 U. + "st1 {v3.8b}, [%3], #8 \n" // store 8 V. + "b.gt 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(src_yuy2b), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", + "v7" // Clobber List + ); +} + +void UYVYToUVRow_NEON(const uint8_t* src_uyvy, + int stride_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* src_uyvyb = src_uyvy + stride_uyvy; + asm volatile( + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels + "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs. + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row + "urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U + "urhadd v2.8b, v2.8b, v6.8b \n" // average rows of V + "st1 {v0.8b}, [%2], #8 \n" // store 8 U. + "st1 {v2.8b}, [%3], #8 \n" // store 8 V. + "b.gt 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(src_uyvyb), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", + "v7" // Clobber List + ); +} + +// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. +void ARGBShuffleRow_NEON(const uint8_t* src_argb, + uint8_t* dst_argb, + const uint8_t* shuffler, + int width) { + asm volatile( + "ld1 {v2.16b}, [%3] \n" // shuffler + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels. + "subs %w2, %w2, #4 \n" // 4 processed per loop + "tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels + "st1 {v1.16b}, [%1], #16 \n" // store 4. + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(shuffler) // %3 + : "cc", "memory", "v0", "v1", "v2" // Clobber List + ); +} + +void I422ToYUY2Row_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_yuy2, + int width) { + asm volatile( + "1: \n" + "ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys + "orr v2.8b, v1.8b, v1.8b \n" + "ld1 {v1.8b}, [%1], #8 \n" // load 8 Us + "ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs + "subs %w4, %w4, #16 \n" // 16 pixels + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels. 
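+        // Editorial note: ld2 splits the 16 Y's into even lanes (v0) and odd lanes
+        // (v1); copying the odd Y's to v2 and loading U into v1 and V into v3 lets a
+        // single st4 emit the interleaved Y0 U Y1 V byte order of YUY2.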
+ "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_yuy2), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3"); +} + +void I422ToUYVYRow_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uyvy, + int width) { + asm volatile( + "1: \n" + "ld2 {v1.8b,v2.8b}, [%0], #16 \n" // load 16 Ys + "orr v3.8b, v2.8b, v2.8b \n" + "ld1 {v0.8b}, [%1], #8 \n" // load 8 Us + "ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs + "subs %w4, %w4, #16 \n" // 16 pixels + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels. + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_uyvy), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3"); +} + +void ARGBToRGB565Row_NEON(const uint8_t* src_argb, + uint8_t* dst_rgb565, + int width) { + asm volatile( + "1: \n" + "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels + "subs %w2, %w2, #8 \n" // 8 processed per loop. + ARGBTORGB565 + "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels RGB565. + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_rgb565), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v20", "v21", "v22", "v23"); +} + +void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb, + uint8_t* dst_rgb, + const uint32_t dither4, + int width) { + asm volatile( + "dup v1.4s, %w2 \n" // dither4 + "1: \n" + "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" // load 8 pixels + "subs %w3, %w3, #8 \n" // 8 processed per loop. + "uqadd v20.8b, v20.8b, v1.8b \n" + "uqadd v21.8b, v21.8b, v1.8b \n" + "uqadd v22.8b, v22.8b, v1.8b \n" ARGBTORGB565 + "st1 {v0.16b}, [%0], #16 \n" // store 8 pixels RGB565. + "b.gt 1b \n" + : "+r"(dst_rgb) // %0 + : "r"(src_argb), // %1 + "r"(dither4), // %2 + "r"(width) // %3 + : "cc", "memory", "v0", "v1", "v20", "v21", "v22", "v23"); +} + +void ARGBToARGB1555Row_NEON(const uint8_t* src_argb, + uint8_t* dst_argb1555, + int width) { + asm volatile( + "1: \n" + "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels + "subs %w2, %w2, #8 \n" // 8 processed per loop. + ARGBTOARGB1555 + "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels + // ARGB1555. + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb1555), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v20", "v21", "v22", "v23"); +} + +void ARGBToARGB4444Row_NEON(const uint8_t* src_argb, + uint8_t* dst_argb4444, + int width) { + asm volatile( + "movi v4.16b, #0x0f \n" // bits to clear with + // vbic. + "1: \n" + "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels + "subs %w2, %w2, #8 \n" // 8 processed per loop. + ARGBTOARGB4444 + "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels + // ARGB4444. + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb4444), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v4", "v20", "v21", "v22", "v23"); +} + +void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { + asm volatile( + "movi v4.8b, #13 \n" // B * 0.1016 coefficient + "movi v5.8b, #65 \n" // G * 0.5078 coefficient + "movi v6.8b, #33 \n" // R * 0.2578 coefficient + "movi v7.8b, #16 \n" // Add 16 constant + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB + "subs %w2, %w2, #8 \n" // 8 processed per loop. 
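+        // Editorial note: the multiply-accumulate below computes the BT.601
+        // studio-swing luma, roughly Y = ((13 * B + 65 * G + 33 * R + 64) >> 7) + 16,
+        // i.e. the familiar 25/129/66 (over 256) BGR weighting expressed with
+        // halved coefficients and a rounding shift by 7.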
+ "umull v3.8h, v0.8b, v4.8b \n" // B + "umlal v3.8h, v1.8b, v5.8b \n" // G + "umlal v3.8h, v2.8b, v6.8b \n" // R + "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v7.8b \n" + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); +} + +void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb, + uint8_t* dst_a, + int width) { + asm volatile( + "1: \n" + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load row 16 + // pixels + "subs %w2, %w2, #16 \n" // 16 processed per loop + "st1 {v3.16b}, [%1], #16 \n" // store 16 A's. + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_a), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List + ); +} + +void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { + asm volatile( + "movi v4.8b, #15 \n" // B * 0.11400 coefficient + "movi v5.8b, #75 \n" // G * 0.58700 coefficient + "movi v6.8b, #38 \n" // R * 0.29900 coefficient + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "umull v3.8h, v0.8b, v4.8b \n" // B + "umlal v3.8h, v1.8b, v5.8b \n" // G + "umlal v3.8h, v2.8b, v6.8b \n" // R + "sqrshrun v0.8b, v3.8h, #7 \n" // 15 bit to 8 bit Y + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"); +} + +// 8x1 pixels. +void ARGBToUV444Row_NEON(const uint8_t* src_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "movi v24.8b, #112 \n" // UB / VR 0.875 + // coefficient + "movi v25.8b, #74 \n" // UG -0.5781 coefficient + "movi v26.8b, #38 \n" // UR -0.2969 coefficient + "movi v27.8b, #18 \n" // VB -0.1406 coefficient + "movi v28.8b, #94 \n" // VG -0.7344 coefficient + "movi v29.16b,#0x80 \n" // 128.5 + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB + // pixels. + "subs %w3, %w3, #8 \n" // 8 processed per loop. + "umull v4.8h, v0.8b, v24.8b \n" // B + "umlsl v4.8h, v1.8b, v25.8b \n" // G + "umlsl v4.8h, v2.8b, v26.8b \n" // R + "add v4.8h, v4.8h, v29.8h \n" // +128 -> unsigned + + "umull v3.8h, v2.8b, v24.8b \n" // R + "umlsl v3.8h, v1.8b, v28.8b \n" // G + "umlsl v3.8h, v0.8b, v27.8b \n" // B + "add v3.8h, v3.8h, v29.8h \n" // +128 -> unsigned + + "uqshrn v0.8b, v4.8h, #8 \n" // 16 bit to 8 bit U + "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V + + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U. + "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V. + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26", + "v27", "v28", "v29"); +} + +#define RGBTOUV_SETUP_REG \ + "movi v20.8h, #56, lsl #0 \n" /* UB/VR coefficient (0.875) / 2 */ \ + "movi v21.8h, #37, lsl #0 \n" /* UG coefficient (-0.5781) / 2 */ \ + "movi v22.8h, #19, lsl #0 \n" /* UR coefficient (-0.2969) / 2 */ \ + "movi v23.8h, #9, lsl #0 \n" /* VB coefficient (-0.1406) / 2 */ \ + "movi v24.8h, #47, lsl #0 \n" /* VG coefficient (-0.7344) / 2 */ \ + "movi v25.16b, #0x80 \n" /* 128.5 (0x8080 in 16-bit) */ + +// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. 
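+// Editorial note (not part of upstream libyuv): the RGBTOUV macro below is a
+// halved fixed-point form of the BT.601 chroma transform applied to 2x2 box
+// averages. A scalar sketch for one output U/V pair, with illustrative names
+// and the nominal coefficients (equivalent up to rounding), could look like:
+
+static void ARGBToUV_2x2_Sketch(const uint8_t* argb_row0,
+                                const uint8_t* argb_row1,
+                                uint8_t* dst_u,
+                                uint8_t* dst_v) {
+  // ARGB in libyuv is stored B, G, R, A in memory. Average a 2x2 pixel block.
+  int b = (argb_row0[0] + argb_row0[4] + argb_row1[0] + argb_row1[4] + 2) >> 2;
+  int g = (argb_row0[1] + argb_row0[5] + argb_row1[1] + argb_row1[5] + 2) >> 2;
+  int r = (argb_row0[2] + argb_row0[6] + argb_row1[2] + argb_row1[6] + 2) >> 2;
+  // Nominal chroma weights; the NEON path loads them at half value because its
+  // box filter keeps the averages at twice scale (see note further below).
+  int u = ((112 * b - 74 * g - 38 * r) >> 8) + 128;
+  int v = ((112 * r - 94 * g - 18 * b) >> 8) + 128;
+  *dst_u = (uint8_t)(u < 0 ? 0 : u > 255 ? 255 : u);
+  *dst_v = (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
+}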
+// clang-format off +#define RGBTOUV(QB, QG, QR) \ + "mul v3.8h, " #QB ",v20.8h \n" /* B */ \ + "mul v4.8h, " #QR ",v20.8h \n" /* R */ \ + "mls v3.8h, " #QG ",v21.8h \n" /* G */ \ + "mls v4.8h, " #QG ",v24.8h \n" /* G */ \ + "mls v3.8h, " #QR ",v22.8h \n" /* R */ \ + "mls v4.8h, " #QB ",v23.8h \n" /* B */ \ + "add v3.8h, v3.8h, v25.8h \n" /* +128 -> unsigned */ \ + "add v4.8h, v4.8h, v25.8h \n" /* +128 -> unsigned */ \ + "uqshrn v0.8b, v3.8h, #8 \n" /* 16 bit to 8 bit U */ \ + "uqshrn v1.8b, v4.8h, #8 \n" /* 16 bit to 8 bit V */ +// clang-format on + +// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr. +// TODO(fbarchard): consider ptrdiff_t for all strides. + +void ARGBToUVRow_NEON(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* src_argb_1 = src_argb + src_stride_argb; + asm volatile ( + RGBTOUV_SETUP_REG + "1: \n" + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. + "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. + "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. + + "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 + "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts. + "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. + "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts. + + "urshr v0.8h, v0.8h, #1 \n" // 2x average + "urshr v1.8h, v1.8h, #1 \n" + "urshr v2.8h, v2.8h, #1 \n" + + "subs %w4, %w4, #16 \n" // 32 processed per loop. + RGBTOUV(v0.8h, v1.8h, v2.8h) + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(src_argb_1), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v20", "v21", "v22", "v23", "v24", "v25" + ); +} + +// TODO(fbarchard): Subsample match C code. +void ARGBToUVJRow_NEON(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* src_argb_1 = src_argb + src_stride_argb; + asm volatile ( + "movi v20.8h, #63, lsl #0 \n" // UB/VR coeff (0.500) / 2 + "movi v21.8h, #42, lsl #0 \n" // UG coeff (-0.33126) / 2 + "movi v22.8h, #21, lsl #0 \n" // UR coeff (-0.16874) / 2 + "movi v23.8h, #10, lsl #0 \n" // VB coeff (-0.08131) / 2 + "movi v24.8h, #53, lsl #0 \n" // VG coeff (-0.41869) / 2 + "movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit) + "1: \n" + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. + "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. + "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. + "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 + "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts. + "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. + "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts. + + "urshr v0.8h, v0.8h, #1 \n" // 2x average + "urshr v1.8h, v1.8h, #1 \n" + "urshr v2.8h, v2.8h, #1 \n" + + "subs %w4, %w4, #16 \n" // 32 processed per loop. + RGBTOUV(v0.8h, v1.8h, v2.8h) + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. 
+ "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(src_argb_1), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v20", "v21", "v22", "v23", "v24", "v25" + ); +} + +void BGRAToUVRow_NEON(const uint8_t* src_bgra, + int src_stride_bgra, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* src_bgra_1 = src_bgra + src_stride_bgra; + asm volatile ( + RGBTOUV_SETUP_REG + "1: \n" + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. + "uaddlp v0.8h, v3.16b \n" // B 16 bytes -> 8 shorts. + "uaddlp v3.8h, v2.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v2.8h, v1.16b \n" // R 16 bytes -> 8 shorts. + "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more + "uadalp v0.8h, v7.16b \n" // B 16 bytes -> 8 shorts. + "uadalp v3.8h, v6.16b \n" // G 16 bytes -> 8 shorts. + "uadalp v2.8h, v5.16b \n" // R 16 bytes -> 8 shorts. + + "urshr v0.8h, v0.8h, #1 \n" // 2x average + "urshr v1.8h, v3.8h, #1 \n" + "urshr v2.8h, v2.8h, #1 \n" + + "subs %w4, %w4, #16 \n" // 32 processed per loop. + RGBTOUV(v0.8h, v1.8h, v2.8h) + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. + "b.gt 1b \n" + : "+r"(src_bgra), // %0 + "+r"(src_bgra_1), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v20", "v21", "v22", "v23", "v24", "v25" + ); +} + +void ABGRToUVRow_NEON(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* src_abgr_1 = src_abgr + src_stride_abgr; + asm volatile ( + RGBTOUV_SETUP_REG + "1: \n" + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. + "uaddlp v3.8h, v2.16b \n" // B 16 bytes -> 8 shorts. + "uaddlp v2.8h, v1.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v1.8h, v0.16b \n" // R 16 bytes -> 8 shorts. + "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more. + "uadalp v3.8h, v6.16b \n" // B 16 bytes -> 8 shorts. + "uadalp v2.8h, v5.16b \n" // G 16 bytes -> 8 shorts. + "uadalp v1.8h, v4.16b \n" // R 16 bytes -> 8 shorts. + + "urshr v0.8h, v3.8h, #1 \n" // 2x average + "urshr v2.8h, v2.8h, #1 \n" + "urshr v1.8h, v1.8h, #1 \n" + + "subs %w4, %w4, #16 \n" // 32 processed per loop. + RGBTOUV(v0.8h, v2.8h, v1.8h) + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. + "b.gt 1b \n" + : "+r"(src_abgr), // %0 + "+r"(src_abgr_1), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v20", "v21", "v22", "v23", "v24", "v25" + ); +} + +void RGBAToUVRow_NEON(const uint8_t* src_rgba, + int src_stride_rgba, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* src_rgba_1 = src_rgba + src_stride_rgba; + asm volatile ( + RGBTOUV_SETUP_REG + "1: \n" + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. + "uaddlp v0.8h, v1.16b \n" // B 16 bytes -> 8 shorts. + "uaddlp v1.8h, v2.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v2.8h, v3.16b \n" // R 16 bytes -> 8 shorts. + "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more. + "uadalp v0.8h, v5.16b \n" // B 16 bytes -> 8 shorts. + "uadalp v1.8h, v6.16b \n" // G 16 bytes -> 8 shorts. + "uadalp v2.8h, v7.16b \n" // R 16 bytes -> 8 shorts. 
+ + "urshr v0.8h, v0.8h, #1 \n" // 2x average + "urshr v1.8h, v1.8h, #1 \n" + "urshr v2.8h, v2.8h, #1 \n" + + "subs %w4, %w4, #16 \n" // 32 processed per loop. + RGBTOUV(v0.8h, v1.8h, v2.8h) + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. + "b.gt 1b \n" + : "+r"(src_rgba), // %0 + "+r"(src_rgba_1), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v20", "v21", "v22", "v23", "v24", "v25" + ); +} + +void RGB24ToUVRow_NEON(const uint8_t* src_rgb24, + int src_stride_rgb24, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* src_rgb24_1 = src_rgb24 + src_stride_rgb24; + asm volatile ( + RGBTOUV_SETUP_REG + "1: \n" + "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 pixels. + "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. + "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. + "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 16 more. + "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts. + "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. + "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts. + + "urshr v0.8h, v0.8h, #1 \n" // 2x average + "urshr v1.8h, v1.8h, #1 \n" + "urshr v2.8h, v2.8h, #1 \n" + + "subs %w4, %w4, #16 \n" // 32 processed per loop. + RGBTOUV(v0.8h, v1.8h, v2.8h) + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. + "b.gt 1b \n" + : "+r"(src_rgb24), // %0 + "+r"(src_rgb24_1), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v20", "v21", "v22", "v23", "v24", "v25" + ); +} + +void RAWToUVRow_NEON(const uint8_t* src_raw, + int src_stride_raw, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* src_raw_1 = src_raw + src_stride_raw; + asm volatile ( + RGBTOUV_SETUP_REG + "1: \n" + "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 8 RAW pixels. + "uaddlp v2.8h, v2.16b \n" // B 16 bytes -> 8 shorts. + "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v0.8h, v0.16b \n" // R 16 bytes -> 8 shorts. + "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 8 more RAW pixels + "uadalp v2.8h, v6.16b \n" // B 16 bytes -> 8 shorts. + "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. + "uadalp v0.8h, v4.16b \n" // R 16 bytes -> 8 shorts. + + "urshr v2.8h, v2.8h, #1 \n" // 2x average + "urshr v1.8h, v1.8h, #1 \n" + "urshr v0.8h, v0.8h, #1 \n" + + "subs %w4, %w4, #16 \n" // 32 processed per loop. + RGBTOUV(v2.8h, v1.8h, v0.8h) + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. + "b.gt 1b \n" + : "+r"(src_raw), // %0 + "+r"(src_raw_1), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v20", "v21", "v22", "v23", "v24", "v25" + ); +} + +// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. 
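+// Editorial note: in the *ToUVRow kernels above and below, the box filter keeps
+// the 2x2 sums at twice the pixel average (urshr #1 halves the four-pixel sum
+// only once), which is why the fixed-point chroma coefficients are loaded at
+// half their nominal value; the two factors of two cancel before the final
+// narrowing shift by 8.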
+void RGB565ToUVRow_NEON(const uint8_t* src_rgb565, + int src_stride_rgb565, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* src_rgb565_1 = src_rgb565 + src_stride_rgb565; + asm volatile( + "movi v22.8h, #56, lsl #0 \n" // UB / VR coeff (0.875) / + // 2 + "movi v23.8h, #37, lsl #0 \n" // UG coeff (-0.5781) / 2 + "movi v24.8h, #19, lsl #0 \n" // UR coeff (-0.2969) / 2 + "movi v25.8h, #9 , lsl #0 \n" // VB coeff (-0.1406) / 2 + "movi v26.8h, #47, lsl #0 \n" // VG coeff (-0.7344) / 2 + "movi v27.16b, #0x80 \n" // 128.5 0x8080 in 16bit + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. + RGB565TOARGB + "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uaddlp v18.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uaddlp v20.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + "ld1 {v0.16b}, [%0], #16 \n" // next 8 RGB565 pixels. + RGB565TOARGB + "uaddlp v17.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uaddlp v19.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uaddlp v21.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + + "ld1 {v0.16b}, [%1], #16 \n" // load 8 RGB565 pixels. + RGB565TOARGB + "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uadalp v18.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uadalp v20.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + "ld1 {v0.16b}, [%1], #16 \n" // next 8 RGB565 pixels. + RGB565TOARGB + "uadalp v17.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uadalp v19.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uadalp v21.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + + "ins v16.D[1], v17.D[0] \n" + "ins v18.D[1], v19.D[0] \n" + "ins v20.D[1], v21.D[0] \n" + + "urshr v4.8h, v16.8h, #1 \n" // 2x average + "urshr v5.8h, v18.8h, #1 \n" + "urshr v6.8h, v20.8h, #1 \n" + + "subs %w4, %w4, #16 \n" // 16 processed per loop. + "mul v16.8h, v4.8h, v22.8h \n" // B + "mls v16.8h, v5.8h, v23.8h \n" // G + "mls v16.8h, v6.8h, v24.8h \n" // R + "add v16.8h, v16.8h, v27.8h \n" // +128 -> unsigned + "mul v17.8h, v6.8h, v22.8h \n" // R + "mls v17.8h, v5.8h, v26.8h \n" // G + "mls v17.8h, v4.8h, v25.8h \n" // B + "add v17.8h, v17.8h, v27.8h \n" // +128 -> unsigned + "uqshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit U + "uqshrn v1.8b, v17.8h, #8 \n" // 16 bit to 8 bit V + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. + "b.gt 1b \n" + : "+r"(src_rgb565), // %0 + "+r"(src_rgb565_1), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27"); +} + +// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. +void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555, + int src_stride_argb1555, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* src_argb1555_1 = src_argb1555 + src_stride_argb1555; + asm volatile( + RGBTOUV_SETUP_REG + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. + RGB555TOARGB + "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB1555 pixels. + RGB555TOARGB + "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + + "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB1555 pixels. + RGB555TOARGB + "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. 
+ "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB1555 pixels. + RGB555TOARGB + "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + + "ins v16.D[1], v26.D[0] \n" + "ins v17.D[1], v27.D[0] \n" + "ins v18.D[1], v28.D[0] \n" + + "urshr v4.8h, v16.8h, #1 \n" // 2x average + "urshr v5.8h, v17.8h, #1 \n" + "urshr v6.8h, v18.8h, #1 \n" + + "subs %w4, %w4, #16 \n" // 16 processed per loop. + "mul v2.8h, v4.8h, v20.8h \n" // B + "mls v2.8h, v5.8h, v21.8h \n" // G + "mls v2.8h, v6.8h, v22.8h \n" // R + "add v2.8h, v2.8h, v25.8h \n" // +128 -> unsigned + "mul v3.8h, v6.8h, v20.8h \n" // R + "mls v3.8h, v5.8h, v24.8h \n" // G + "mls v3.8h, v4.8h, v23.8h \n" // B + "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned + "uqshrn v0.8b, v2.8h, #8 \n" // 16 bit to 8 bit U + "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. + "b.gt 1b \n" + : "+r"(src_argb1555), // %0 + "+r"(src_argb1555_1), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17", + "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", + "v28"); +} + +// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. +void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444, + int src_stride_argb4444, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* src_argb4444_1 = src_argb4444 + src_stride_argb4444; + asm volatile( + RGBTOUV_SETUP_REG + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. + ARGB4444TOARGB + "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB4444 pixels. + ARGB4444TOARGB + "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + + "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB4444 pixels. + ARGB4444TOARGB + "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB4444 pixels. + ARGB4444TOARGB + "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + + "ins v16.D[1], v26.D[0] \n" + "ins v17.D[1], v27.D[0] \n" + "ins v18.D[1], v28.D[0] \n" + + "urshr v4.8h, v16.8h, #1 \n" // 2x average + "urshr v5.8h, v17.8h, #1 \n" + "urshr v6.8h, v18.8h, #1 \n" + + "subs %w4, %w4, #16 \n" // 16 processed per loop. + "mul v2.8h, v4.8h, v20.8h \n" // B + "mls v2.8h, v5.8h, v21.8h \n" // G + "mls v2.8h, v6.8h, v22.8h \n" // R + "add v2.8h, v2.8h, v25.8h \n" // +128 -> unsigned + "mul v3.8h, v6.8h, v20.8h \n" // R + "mls v3.8h, v5.8h, v24.8h \n" // G + "mls v3.8h, v4.8h, v23.8h \n" // B + "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned + "uqshrn v0.8b, v2.8h, #8 \n" // 16 bit to 8 bit U + "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. 
+ "b.gt 1b \n" + : "+r"(src_argb4444), // %0 + "+r"(src_argb4444_1), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17", + "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", + "v28" + + ); +} + +void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) { + asm volatile( + "movi v24.8b, #13 \n" // B * 0.1016 coefficient + "movi v25.8b, #65 \n" // G * 0.5078 coefficient + "movi v26.8b, #33 \n" // R * 0.2578 coefficient + "movi v27.8b, #16 \n" // Add 16 constant + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. + "subs %w2, %w2, #8 \n" // 8 processed per loop. + RGB565TOARGB + "umull v3.8h, v0.8b, v24.8b \n" // B + "umlal v3.8h, v1.8b, v25.8b \n" // G + "umlal v3.8h, v2.8b, v26.8b \n" // R + "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v27.8b \n" + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. + "b.gt 1b \n" + : "+r"(src_rgb565), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6", "v24", "v25", "v26", + "v27"); +} + +void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555, + uint8_t* dst_y, + int width) { + asm volatile( + "movi v4.8b, #13 \n" // B * 0.1016 coefficient + "movi v5.8b, #65 \n" // G * 0.5078 coefficient + "movi v6.8b, #33 \n" // R * 0.2578 coefficient + "movi v7.8b, #16 \n" // Add 16 constant + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. + "subs %w2, %w2, #8 \n" // 8 processed per loop. + ARGB1555TOARGB + "umull v3.8h, v0.8b, v4.8b \n" // B + "umlal v3.8h, v1.8b, v5.8b \n" // G + "umlal v3.8h, v2.8b, v6.8b \n" // R + "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v7.8b \n" + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. + "b.gt 1b \n" + : "+r"(src_argb1555), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); +} + +void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444, + uint8_t* dst_y, + int width) { + asm volatile( + "movi v24.8b, #13 \n" // B * 0.1016 coefficient + "movi v25.8b, #65 \n" // G * 0.5078 coefficient + "movi v26.8b, #33 \n" // R * 0.2578 coefficient + "movi v27.8b, #16 \n" // Add 16 constant + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. + "subs %w2, %w2, #8 \n" // 8 processed per loop. + ARGB4444TOARGB + "umull v3.8h, v0.8b, v24.8b \n" // B + "umlal v3.8h, v1.8b, v25.8b \n" // G + "umlal v3.8h, v2.8b, v26.8b \n" // R + "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v27.8b \n" + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. + "b.gt 1b \n" + : "+r"(src_argb4444), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v24", "v25", "v26", "v27"); +} + +void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) { + asm volatile( + "movi v4.8b, #33 \n" // R * 0.2578 coefficient + "movi v5.8b, #65 \n" // G * 0.5078 coefficient + "movi v6.8b, #13 \n" // B * 0.1016 coefficient + "movi v7.8b, #16 \n" // Add 16 constant + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "umull v16.8h, v1.8b, v4.8b \n" // R + "umlal v16.8h, v2.8b, v5.8b \n" // G + "umlal v16.8h, v3.8b, v6.8b \n" // B + "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v7.8b \n" + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. 
+ "b.gt 1b \n" + : "+r"(src_bgra), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"); +} + +void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) { + asm volatile( + "movi v4.8b, #33 \n" // R * 0.2578 coefficient + "movi v5.8b, #65 \n" // G * 0.5078 coefficient + "movi v6.8b, #13 \n" // B * 0.1016 coefficient + "movi v7.8b, #16 \n" // Add 16 constant + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "umull v16.8h, v0.8b, v4.8b \n" // R + "umlal v16.8h, v1.8b, v5.8b \n" // G + "umlal v16.8h, v2.8b, v6.8b \n" // B + "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v7.8b \n" + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. + "b.gt 1b \n" + : "+r"(src_abgr), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"); +} + +void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) { + asm volatile( + "movi v4.8b, #13 \n" // B * 0.1016 coefficient + "movi v5.8b, #65 \n" // G * 0.5078 coefficient + "movi v6.8b, #33 \n" // R * 0.2578 coefficient + "movi v7.8b, #16 \n" // Add 16 constant + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "umull v16.8h, v1.8b, v4.8b \n" // B + "umlal v16.8h, v2.8b, v5.8b \n" // G + "umlal v16.8h, v3.8b, v6.8b \n" // R + "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v7.8b \n" + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. + "b.gt 1b \n" + : "+r"(src_rgba), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"); +} + +void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) { + asm volatile( + "movi v4.8b, #13 \n" // B * 0.1016 coefficient + "movi v5.8b, #65 \n" // G * 0.5078 coefficient + "movi v6.8b, #33 \n" // R * 0.2578 coefficient + "movi v7.8b, #16 \n" // Add 16 constant + "1: \n" + "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "umull v16.8h, v0.8b, v4.8b \n" // B + "umlal v16.8h, v1.8b, v5.8b \n" // G + "umlal v16.8h, v2.8b, v6.8b \n" // R + "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v7.8b \n" + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. + "b.gt 1b \n" + : "+r"(src_rgb24), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"); +} + +void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) { + asm volatile( + "movi v4.8b, #33 \n" // R * 0.2578 coefficient + "movi v5.8b, #65 \n" // G * 0.5078 coefficient + "movi v6.8b, #13 \n" // B * 0.1016 coefficient + "movi v7.8b, #16 \n" // Add 16 constant + "1: \n" + "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "umull v16.8h, v0.8b, v4.8b \n" // B + "umlal v16.8h, v1.8b, v5.8b \n" // G + "umlal v16.8h, v2.8b, v6.8b \n" // R + "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v7.8b \n" + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. 
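+        // Editorial note: BGRAToY, ABGRToY, RGBAToY, RGB24ToY and RAWToY all apply
+        // the same 13/65/33 (+16) luma weights as ARGBToY; only which loaded
+        // register holds B, G and R changes with each format's byte order.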
+ "b.gt 1b \n" + : "+r"(src_raw), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"); +} + +// Bilinear filter 16x2 -> 16x1 +void InterpolateRow_NEON(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride, + int dst_width, + int source_y_fraction) { + int y1_fraction = source_y_fraction; + int y0_fraction = 256 - y1_fraction; + const uint8_t* src_ptr1 = src_ptr + src_stride; + asm volatile( + "cmp %w4, #0 \n" + "b.eq 100f \n" + "cmp %w4, #128 \n" + "b.eq 50f \n" + + "dup v5.16b, %w4 \n" + "dup v4.16b, %w5 \n" + // General purpose row blend. + "1: \n" + "ld1 {v0.16b}, [%1], #16 \n" + "ld1 {v1.16b}, [%2], #16 \n" + "subs %w3, %w3, #16 \n" + "umull v2.8h, v0.8b, v4.8b \n" + "umull2 v3.8h, v0.16b, v4.16b \n" + "umlal v2.8h, v1.8b, v5.8b \n" + "umlal2 v3.8h, v1.16b, v5.16b \n" + "rshrn v0.8b, v2.8h, #8 \n" + "rshrn2 v0.16b, v3.8h, #8 \n" + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 1b \n" + "b 99f \n" + + // Blend 50 / 50. + "50: \n" + "ld1 {v0.16b}, [%1], #16 \n" + "ld1 {v1.16b}, [%2], #16 \n" + "subs %w3, %w3, #16 \n" + "urhadd v0.16b, v0.16b, v1.16b \n" + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 50b \n" + "b 99f \n" + + // Blend 100 / 0 - Copy row unchanged. + "100: \n" + "ld1 {v0.16b}, [%1], #16 \n" + "subs %w3, %w3, #16 \n" + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 100b \n" + + "99: \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(src_ptr1), // %2 + "+r"(dst_width), // %3 + "+r"(y1_fraction), // %4 + "+r"(y0_fraction) // %5 + : + : "cc", "memory", "v0", "v1", "v3", "v4", "v5"); +} + +// dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr +void ARGBBlendRow_NEON(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + asm volatile( + "subs %w3, %w3, #8 \n" + "b.lt 89f \n" + // Blend 8 pixels. + "8: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB0 + // pixels + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 ARGB1 + // pixels + "subs %w3, %w3, #8 \n" // 8 processed per loop. + "umull v16.8h, v4.8b, v3.8b \n" // db * a + "umull v17.8h, v5.8b, v3.8b \n" // dg * a + "umull v18.8h, v6.8b, v3.8b \n" // dr * a + "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8 + "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8 + "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8 + "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256) + "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256) + "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256) + "uqadd v0.8b, v0.8b, v4.8b \n" // + sb + "uqadd v1.8b, v1.8b, v5.8b \n" // + sg + "uqadd v2.8b, v2.8b, v6.8b \n" // + sr + "movi v3.8b, #255 \n" // a = 255 + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB + // pixels + "b.ge 8b \n" + + "89: \n" + "adds %w3, %w3, #8-1 \n" + "b.lt 99f \n" + + // Blend 1 pixels. + "1: \n" + "ld4 {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n" // load 1 pixel ARGB0. + "ld4 {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n" // load 1 pixel ARGB1. + "subs %w3, %w3, #1 \n" // 1 processed per loop. 
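+        // Editorial note: this remainder loop repeats the same blend as the 8-pixel
+        // loop above, per channel dst = s + (d - d * sa / 256) with alpha forced to
+        // 255, one pixel at a time for widths that are not a multiple of 8.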
+ "umull v16.8h, v4.8b, v3.8b \n" // db * a + "umull v17.8h, v5.8b, v3.8b \n" // dg * a + "umull v18.8h, v6.8b, v3.8b \n" // dr * a + "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8 + "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8 + "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8 + "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256) + "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256) + "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256) + "uqadd v0.8b, v0.8b, v4.8b \n" // + sb + "uqadd v1.8b, v1.8b, v5.8b \n" // + sg + "uqadd v2.8b, v2.8b, v6.8b \n" // + sr + "movi v3.8b, #255 \n" // a = 255 + "st4 {v0.b,v1.b,v2.b,v3.b}[0], [%2], #4 \n" // store 1 pixel. + "b.ge 1b \n" + + "99: \n" + + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18"); +} + +// Attenuate 8 pixels at a time. +void ARGBAttenuateRow_NEON(const uint8_t* src_argb, + uint8_t* dst_argb, + int width) { + asm volatile( + // Attenuate 8 pixels. + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "umull v4.8h, v0.8b, v3.8b \n" // b * a + "umull v5.8h, v1.8b, v3.8b \n" // g * a + "umull v6.8h, v2.8b, v3.8b \n" // r * a + "uqrshrn v0.8b, v4.8h, #8 \n" // b >>= 8 + "uqrshrn v1.8b, v5.8h, #8 \n" // g >>= 8 + "uqrshrn v2.8b, v6.8h, #8 \n" // r >>= 8 + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB + // pixels + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"); +} + +// Quantize 8 ARGB pixels (32 bytes). +// dst = (dst * scale >> 16) * interval_size + interval_offset; +void ARGBQuantizeRow_NEON(uint8_t* dst_argb, + int scale, + int interval_size, + int interval_offset, + int width) { + asm volatile( + "dup v4.8h, %w2 \n" + "ushr v4.8h, v4.8h, #1 \n" // scale >>= 1 + "dup v5.8h, %w3 \n" // interval multiply. + "dup v6.8h, %w4 \n" // interval add + + // 8 pixel loop. + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB. + "subs %w1, %w1, #8 \n" // 8 processed per loop. + "uxtl v0.8h, v0.8b \n" // b (0 .. 255) + "uxtl v1.8h, v1.8b \n" + "uxtl v2.8h, v2.8b \n" + "sqdmulh v0.8h, v0.8h, v4.8h \n" // b * scale + "sqdmulh v1.8h, v1.8h, v4.8h \n" // g + "sqdmulh v2.8h, v2.8h, v4.8h \n" // r + "mul v0.8h, v0.8h, v5.8h \n" // b * interval_size + "mul v1.8h, v1.8h, v5.8h \n" // g + "mul v2.8h, v2.8h, v5.8h \n" // r + "add v0.8h, v0.8h, v6.8h \n" // b + interval_offset + "add v1.8h, v1.8h, v6.8h \n" // g + "add v2.8h, v2.8h, v6.8h \n" // r + "uqxtn v0.8b, v0.8h \n" + "uqxtn v1.8b, v1.8h \n" + "uqxtn v2.8b, v2.8h \n" + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 ARGB + "b.gt 1b \n" + : "+r"(dst_argb), // %0 + "+r"(width) // %1 + : "r"(scale), // %2 + "r"(interval_size), // %3 + "r"(interval_offset) // %4 + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"); +} + +// Shade 8 pixels at a time by specified value. +// NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scaler register from 0 to 8. +// Rounding in vqrdmulh does +1 to high if high bit of low s16 is set. +void ARGBShadeRow_NEON(const uint8_t* src_argb, + uint8_t* dst_argb, + int width, + uint32_t value) { + asm volatile( + "dup v0.4s, %w3 \n" // duplicate scale value. + "zip1 v0.8b, v0.8b, v0.8b \n" // v0.8b aarrggbb. + "ushr v0.8h, v0.8h, #1 \n" // scale / 2. + + // 8 pixel loop. 
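+      // Editorial note: 'value' is a packed ARGB scale; the zip/ushr setup above
+      // turns each byte v into a 16-bit factor of roughly v * 128.5, so the
+      // sqrdmulh in the loop below yields approximately component * v / 256
+      // per channel.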
+ "1: \n" + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n" // load 8 ARGB + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "uxtl v4.8h, v4.8b \n" // b (0 .. 255) + "uxtl v5.8h, v5.8b \n" + "uxtl v6.8h, v6.8b \n" + "uxtl v7.8h, v7.8b \n" + "sqrdmulh v4.8h, v4.8h, v0.h[0] \n" // b * scale * 2 + "sqrdmulh v5.8h, v5.8h, v0.h[1] \n" // g + "sqrdmulh v6.8h, v6.8h, v0.h[2] \n" // r + "sqrdmulh v7.8h, v7.8h, v0.h[3] \n" // a + "uqxtn v4.8b, v4.8h \n" + "uqxtn v5.8b, v5.8h \n" + "uqxtn v6.8b, v6.8h \n" + "uqxtn v7.8b, v7.8h \n" + "st4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // store 8 ARGB + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(value) // %3 + : "cc", "memory", "v0", "v4", "v5", "v6", "v7"); +} + +// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels +// Similar to ARGBToYJ but stores ARGB. +// C code is (15 * b + 75 * g + 38 * r + 64) >> 7; +void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) { + asm volatile( + "movi v24.8b, #15 \n" // B * 0.11400 coefficient + "movi v25.8b, #75 \n" // G * 0.58700 coefficient + "movi v26.8b, #38 \n" // R * 0.29900 coefficient + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "umull v4.8h, v0.8b, v24.8b \n" // B + "umlal v4.8h, v1.8b, v25.8b \n" // G + "umlal v4.8h, v2.8b, v26.8b \n" // R + "sqrshrun v0.8b, v4.8h, #7 \n" // 15 bit to 8 bit B + "orr v1.8b, v0.8b, v0.8b \n" // G + "orr v2.8b, v0.8b, v0.8b \n" // R + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 pixels. + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26"); +} + +// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. +// b = (r * 35 + g * 68 + b * 17) >> 7 +// g = (r * 45 + g * 88 + b * 22) >> 7 +// r = (r * 50 + g * 98 + b * 24) >> 7 + +void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width) { + asm volatile( + "movi v20.8b, #17 \n" // BB coefficient + "movi v21.8b, #68 \n" // BG coefficient + "movi v22.8b, #35 \n" // BR coefficient + "movi v24.8b, #22 \n" // GB coefficient + "movi v25.8b, #88 \n" // GG coefficient + "movi v26.8b, #45 \n" // GR coefficient + "movi v28.8b, #24 \n" // BB coefficient + "movi v29.8b, #98 \n" // BG coefficient + "movi v30.8b, #50 \n" // BR coefficient + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB pixels. + "subs %w1, %w1, #8 \n" // 8 processed per loop. + "umull v4.8h, v0.8b, v20.8b \n" // B to Sepia B + "umlal v4.8h, v1.8b, v21.8b \n" // G + "umlal v4.8h, v2.8b, v22.8b \n" // R + "umull v5.8h, v0.8b, v24.8b \n" // B to Sepia G + "umlal v5.8h, v1.8b, v25.8b \n" // G + "umlal v5.8h, v2.8b, v26.8b \n" // R + "umull v6.8h, v0.8b, v28.8b \n" // B to Sepia R + "umlal v6.8h, v1.8b, v29.8b \n" // G + "umlal v6.8h, v2.8b, v30.8b \n" // R + "uqshrn v0.8b, v4.8h, #7 \n" // 16 bit to 8 bit B + "uqshrn v1.8b, v5.8h, #7 \n" // 16 bit to 8 bit G + "uqshrn v2.8b, v6.8h, #7 \n" // 16 bit to 8 bit R + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 pixels. + "b.gt 1b \n" + : "+r"(dst_argb), // %0 + "+r"(width) // %1 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v24", "v25", "v26", "v28", "v29", "v30"); +} + +// Tranform 8 ARGB pixels (32 bytes) with color matrix. +// TODO(fbarchard): Was same as Sepia except matrix is provided. This function +// needs to saturate. Consider doing a non-saturating version. 
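+// Editorial note (not part of upstream libyuv): in scalar terms the transform
+// below treats matrix_argb as four rows of four signed coefficients scaled by
+// 64, one row per output channel in B, G, R, A order. A rough sketch (the NEON
+// code additionally saturates at each accumulation step) is:
+
+static void ARGBColorMatrixRow_Sketch(const uint8_t* src_argb,
+                                      uint8_t* dst_argb,
+                                      const int8_t* m,  // 16 coefficients
+                                      int width) {
+  for (int i = 0; i < width; ++i) {
+    const int b = src_argb[0], g = src_argb[1], r = src_argb[2], a = src_argb[3];
+    for (int c = 0; c < 4; ++c) {  // output B, G, R, A
+      int v = (m[4 * c + 0] * b + m[4 * c + 1] * g + m[4 * c + 2] * r +
+               m[4 * c + 3] * a) >> 6;
+      dst_argb[c] = (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
+    }
+    src_argb += 4;
+    dst_argb += 4;
+  }
+}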
+void ARGBColorMatrixRow_NEON(const uint8_t* src_argb, + uint8_t* dst_argb, + const int8_t* matrix_argb, + int width) { + asm volatile( + "ld1 {v2.16b}, [%3] \n" // load 3 ARGB vectors. + "sxtl v0.8h, v2.8b \n" // B,G coefficients s16. + "sxtl2 v1.8h, v2.16b \n" // R,A coefficients s16. + + "1: \n" + "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 ARGB + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "uxtl v16.8h, v16.8b \n" // b (0 .. 255) 16 bit + "uxtl v17.8h, v17.8b \n" // g + "uxtl v18.8h, v18.8b \n" // r + "uxtl v19.8h, v19.8b \n" // a + "mul v22.8h, v16.8h, v0.h[0] \n" // B = B * Matrix B + "mul v23.8h, v16.8h, v0.h[4] \n" // G = B * Matrix G + "mul v24.8h, v16.8h, v1.h[0] \n" // R = B * Matrix R + "mul v25.8h, v16.8h, v1.h[4] \n" // A = B * Matrix A + "mul v4.8h, v17.8h, v0.h[1] \n" // B += G * Matrix B + "mul v5.8h, v17.8h, v0.h[5] \n" // G += G * Matrix G + "mul v6.8h, v17.8h, v1.h[1] \n" // R += G * Matrix R + "mul v7.8h, v17.8h, v1.h[5] \n" // A += G * Matrix A + "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B + "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G + "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R + "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A + "mul v4.8h, v18.8h, v0.h[2] \n" // B += R * Matrix B + "mul v5.8h, v18.8h, v0.h[6] \n" // G += R * Matrix G + "mul v6.8h, v18.8h, v1.h[2] \n" // R += R * Matrix R + "mul v7.8h, v18.8h, v1.h[6] \n" // A += R * Matrix A + "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B + "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G + "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R + "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A + "mul v4.8h, v19.8h, v0.h[3] \n" // B += A * Matrix B + "mul v5.8h, v19.8h, v0.h[7] \n" // G += A * Matrix G + "mul v6.8h, v19.8h, v1.h[3] \n" // R += A * Matrix R + "mul v7.8h, v19.8h, v1.h[7] \n" // A += A * Matrix A + "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B + "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G + "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R + "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A + "sqshrun v16.8b, v22.8h, #6 \n" // 16 bit to 8 bit B + "sqshrun v17.8b, v23.8h, #6 \n" // 16 bit to 8 bit G + "sqshrun v18.8b, v24.8h, #6 \n" // 16 bit to 8 bit R + "sqshrun v19.8b, v25.8h, #6 \n" // 16 bit to 8 bit A + "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n" // store 8 ARGB + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(matrix_argb) // %3 + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v22", "v23", "v24", "v25"); +} + +// TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable. +// Multiply 2 rows of ARGB pixels together, 8 pixels at a time. +void ARGBMultiplyRow_NEON(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + asm volatile( + // 8 pixel loop. + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more + "subs %w3, %w3, #8 \n" // 8 processed per loop. 
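+        // Editorial note: each channel pair is multiplied into 16 bits and narrowed
+        // back with a rounding shift, i.e. dst = (src0 * src1 + 128) >> 8, a slight
+        // underestimate of the exact src0 * src1 / 255.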
+ "umull v0.8h, v0.8b, v4.8b \n" // multiply B + "umull v1.8h, v1.8b, v5.8b \n" // multiply G + "umull v2.8h, v2.8b, v6.8b \n" // multiply R + "umull v3.8h, v3.8b, v7.8b \n" // multiply A + "rshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit B + "rshrn v1.8b, v1.8h, #8 \n" // 16 bit to 8 bit G + "rshrn v2.8b, v2.8h, #8 \n" // 16 bit to 8 bit R + "rshrn v3.8b, v3.8h, #8 \n" // 16 bit to 8 bit A + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB + "b.gt 1b \n" + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); +} + +// Add 2 rows of ARGB pixels together, 8 pixels at a time. +void ARGBAddRow_NEON(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + asm volatile( + // 8 pixel loop. + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more + "subs %w3, %w3, #8 \n" // 8 processed per loop. + "uqadd v0.8b, v0.8b, v4.8b \n" + "uqadd v1.8b, v1.8b, v5.8b \n" + "uqadd v2.8b, v2.8b, v6.8b \n" + "uqadd v3.8b, v3.8b, v7.8b \n" + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB + "b.gt 1b \n" + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); +} + +// Subtract 2 rows of ARGB pixels, 8 pixels at a time. +void ARGBSubtractRow_NEON(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + asm volatile( + // 8 pixel loop. + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more + "subs %w3, %w3, #8 \n" // 8 processed per loop. + "uqsub v0.8b, v0.8b, v4.8b \n" + "uqsub v1.8b, v1.8b, v5.8b \n" + "uqsub v2.8b, v2.8b, v6.8b \n" + "uqsub v3.8b, v3.8b, v7.8b \n" + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB + "b.gt 1b \n" + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); +} + +// Adds Sobel X and Sobel Y and stores Sobel into ARGB. +// A = 255 +// R = Sobel +// G = Sobel +// B = Sobel +void SobelRow_NEON(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, + int width) { + asm volatile( + "movi v3.8b, #255 \n" // alpha + // 8 pixel loop. + "1: \n" + "ld1 {v0.8b}, [%0], #8 \n" // load 8 sobelx. + "ld1 {v1.8b}, [%1], #8 \n" // load 8 sobely. + "subs %w3, %w3, #8 \n" // 8 processed per loop. + "uqadd v0.8b, v0.8b, v1.8b \n" // add + "orr v1.8b, v0.8b, v0.8b \n" + "orr v2.8b, v0.8b, v0.8b \n" + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB + "b.gt 1b \n" + : "+r"(src_sobelx), // %0 + "+r"(src_sobely), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3"); +} + +// Adds Sobel X and Sobel Y and stores Sobel into plane. +void SobelToPlaneRow_NEON(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_y, + int width) { + asm volatile( + // 16 pixel loop. + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" // load 16 sobelx. + "ld1 {v1.16b}, [%1], #16 \n" // load 16 sobely. + "subs %w3, %w3, #16 \n" // 16 processed per loop. + "uqadd v0.16b, v0.16b, v1.16b \n" // add + "st1 {v0.16b}, [%2], #16 \n" // store 16 pixels. 
+ "b.gt 1b \n" + : "+r"(src_sobelx), // %0 + "+r"(src_sobely), // %1 + "+r"(dst_y), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1"); +} + +// Mixes Sobel X, Sobel Y and Sobel into ARGB. +// A = 255 +// R = Sobel X +// G = Sobel +// B = Sobel Y +void SobelXYRow_NEON(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, + int width) { + asm volatile( + "movi v3.8b, #255 \n" // alpha + // 8 pixel loop. + "1: \n" + "ld1 {v2.8b}, [%0], #8 \n" // load 8 sobelx. + "ld1 {v0.8b}, [%1], #8 \n" // load 8 sobely. + "subs %w3, %w3, #8 \n" // 8 processed per loop. + "uqadd v1.8b, v0.8b, v2.8b \n" // add + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB + "b.gt 1b \n" + : "+r"(src_sobelx), // %0 + "+r"(src_sobely), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3"); +} + +// SobelX as a matrix is +// -1 0 1 +// -2 0 2 +// -1 0 1 +void SobelXRow_NEON(const uint8_t* src_y0, + const uint8_t* src_y1, + const uint8_t* src_y2, + uint8_t* dst_sobelx, + int width) { + asm volatile( + "1: \n" + "ld1 {v0.8b}, [%0],%5 \n" // top + "ld1 {v1.8b}, [%0],%6 \n" + "usubl v0.8h, v0.8b, v1.8b \n" + "ld1 {v2.8b}, [%1],%5 \n" // center * 2 + "ld1 {v3.8b}, [%1],%6 \n" + "usubl v1.8h, v2.8b, v3.8b \n" + "add v0.8h, v0.8h, v1.8h \n" + "add v0.8h, v0.8h, v1.8h \n" + "ld1 {v2.8b}, [%2],%5 \n" // bottom + "ld1 {v3.8b}, [%2],%6 \n" + "subs %w4, %w4, #8 \n" // 8 pixels + "usubl v1.8h, v2.8b, v3.8b \n" + "add v0.8h, v0.8h, v1.8h \n" + "abs v0.8h, v0.8h \n" + "uqxtn v0.8b, v0.8h \n" + "st1 {v0.8b}, [%3], #8 \n" // store 8 sobelx + "b.gt 1b \n" + : "+r"(src_y0), // %0 + "+r"(src_y1), // %1 + "+r"(src_y2), // %2 + "+r"(dst_sobelx), // %3 + "+r"(width) // %4 + : "r"(2LL), // %5 + "r"(6LL) // %6 + : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List + ); +} + +// SobelY as a matrix is +// -1 -2 -1 +// 0 0 0 +// 1 2 1 +void SobelYRow_NEON(const uint8_t* src_y0, + const uint8_t* src_y1, + uint8_t* dst_sobely, + int width) { + asm volatile( + "1: \n" + "ld1 {v0.8b}, [%0],%4 \n" // left + "ld1 {v1.8b}, [%1],%4 \n" + "usubl v0.8h, v0.8b, v1.8b \n" + "ld1 {v2.8b}, [%0],%4 \n" // center * 2 + "ld1 {v3.8b}, [%1],%4 \n" + "usubl v1.8h, v2.8b, v3.8b \n" + "add v0.8h, v0.8h, v1.8h \n" + "add v0.8h, v0.8h, v1.8h \n" + "ld1 {v2.8b}, [%0],%5 \n" // right + "ld1 {v3.8b}, [%1],%5 \n" + "subs %w3, %w3, #8 \n" // 8 pixels + "usubl v1.8h, v2.8b, v3.8b \n" + "add v0.8h, v0.8h, v1.8h \n" + "abs v0.8h, v0.8h \n" + "uqxtn v0.8b, v0.8h \n" + "st1 {v0.8b}, [%2], #8 \n" // store 8 sobely + "b.gt 1b \n" + : "+r"(src_y0), // %0 + "+r"(src_y1), // %1 + "+r"(dst_sobely), // %2 + "+r"(width) // %3 + : "r"(1LL), // %4 + "r"(6LL) // %5 + : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List + ); +} + +// Caveat - rounds float to half float whereas scaling version truncates. 
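+// Editorial note (not part of upstream libyuv): the scaling variant further
+// below (HalfFloatRow_NEON) relies on a bit trick rather than fcvtn. The
+// constant 1.9259299444e-34f is 2^-112, which rebiases the single-precision
+// exponent so that bits [28:13] of (src * scale * 2^-112) are the IEEE-754
+// half-float encoding of src * scale, truncated rather than rounded. A scalar
+// sketch of the same trick (assumes <string.h> for memcpy) could look like:
+
+static uint16_t ScaledUint16ToHalf_Sketch(uint16_t src, float scale) {
+  float f = (float)src * (scale * 1.9259299444e-34f);      // scale * 2^-112
+  uint32_t bits;
+  memcpy(&bits, &f, sizeof(bits));                         // reinterpret float bits
+  uint32_t shifted = bits >> 13;                           // drop 13 low mantissa bits
+  return (uint16_t)(shifted > 0xffff ? 0xffff : shifted);  // uqshrn also saturates
+}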
+void HalfFloat1Row_NEON(const uint16_t* src, + uint16_t* dst, + float /*unused*/, + int width) { + asm volatile( + "1: \n" + "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts + "subs %w2, %w2, #8 \n" // 8 pixels per loop + "uxtl v2.4s, v1.4h \n" // 8 int's + "uxtl2 v3.4s, v1.8h \n" + "scvtf v2.4s, v2.4s \n" // 8 floats + "scvtf v3.4s, v3.4s \n" + "fcvtn v1.4h, v2.4s \n" // 8 half floats + "fcvtn2 v1.8h, v3.4s \n" + "st1 {v1.16b}, [%1], #16 \n" // store 8 shorts + "b.gt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v1", "v2", "v3"); +} + +void HalfFloatRow_NEON(const uint16_t* src, + uint16_t* dst, + float scale, + int width) { + asm volatile( + "1: \n" + "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts + "subs %w2, %w2, #8 \n" // 8 pixels per loop + "uxtl v2.4s, v1.4h \n" // 8 int's + "uxtl2 v3.4s, v1.8h \n" + "scvtf v2.4s, v2.4s \n" // 8 floats + "scvtf v3.4s, v3.4s \n" + "fmul v2.4s, v2.4s, %3.s[0] \n" // adjust exponent + "fmul v3.4s, v3.4s, %3.s[0] \n" + "uqshrn v1.4h, v2.4s, #13 \n" // isolate halffloat + "uqshrn2 v1.8h, v3.4s, #13 \n" + "st1 {v1.16b}, [%1], #16 \n" // store 8 shorts + "b.gt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "w"(scale * 1.9259299444e-34f) // %3 + : "cc", "memory", "v1", "v2", "v3"); +} + +void ByteToFloatRow_NEON(const uint8_t* src, + float* dst, + float scale, + int width) { + asm volatile( + "1: \n" + "ld1 {v1.8b}, [%0], #8 \n" // load 8 bytes + "subs %w2, %w2, #8 \n" // 8 pixels per loop + "uxtl v1.8h, v1.8b \n" // 8 shorts + "uxtl v2.4s, v1.4h \n" // 8 ints + "uxtl2 v3.4s, v1.8h \n" + "scvtf v2.4s, v2.4s \n" // 8 floats + "scvtf v3.4s, v3.4s \n" + "fmul v2.4s, v2.4s, %3.s[0] \n" // scale + "fmul v3.4s, v3.4s, %3.s[0] \n" + "st1 {v2.16b, v3.16b}, [%1], #32 \n" // store 8 floats + "b.gt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "w"(scale) // %3 + : "cc", "memory", "v1", "v2", "v3"); +} + +float ScaleMaxSamples_NEON(const float* src, + float* dst, + float scale, + int width) { + float fmax; + asm volatile( + "movi v5.4s, #0 \n" // max + "movi v6.4s, #0 \n" + + "1: \n" + "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples + "subs %w2, %w2, #8 \n" // 8 processed per loop + "fmul v3.4s, v1.4s, %4.s[0] \n" // scale + "fmul v4.4s, v2.4s, %4.s[0] \n" // scale + "fmax v5.4s, v5.4s, v1.4s \n" // max + "fmax v6.4s, v6.4s, v2.4s \n" + "st1 {v3.4s, v4.4s}, [%1], #32 \n" // store 8 samples + "b.gt 1b \n" + "fmax v5.4s, v5.4s, v6.4s \n" // max + "fmaxv %s3, v5.4s \n" // signed max acculator + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width), // %2 + "=w"(fmax) // %3 + : "w"(scale) // %4 + : "cc", "memory", "v1", "v2", "v3", "v4", "v5", "v6"); + return fmax; +} + +float ScaleSumSamples_NEON(const float* src, + float* dst, + float scale, + int width) { + float fsum; + asm volatile( + "movi v5.4s, #0 \n" // max + "movi v6.4s, #0 \n" // max + + "1: \n" + "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples + "subs %w2, %w2, #8 \n" // 8 processed per loop + "fmul v3.4s, v1.4s, %4.s[0] \n" // scale + "fmul v4.4s, v2.4s, %4.s[0] \n" + "fmla v5.4s, v1.4s, v1.4s \n" // sum of squares + "fmla v6.4s, v2.4s, v2.4s \n" + "st1 {v3.4s, v4.4s}, [%1], #32 \n" // store 8 samples + "b.gt 1b \n" + "faddp v5.4s, v5.4s, v6.4s \n" + "faddp v5.4s, v5.4s, v5.4s \n" + "faddp %3.4s, v5.4s, v5.4s \n" // sum + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width), // %2 + "=w"(fsum) // %3 + : "w"(scale) // %4 + : "cc", "memory", "v1", "v2", "v3", "v4", "v5", "v6"); + return fsum; +} + +void 
ScaleSamples_NEON(const float* src, float* dst, float scale, int width) { + asm volatile( + "1: \n" + "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples + "subs %w2, %w2, #8 \n" // 8 processed per loop + "fmul v1.4s, v1.4s, %3.s[0] \n" // scale + "fmul v2.4s, v2.4s, %3.s[0] \n" // scale + "st1 {v1.4s, v2.4s}, [%1], #32 \n" // store 8 samples + "b.gt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "w"(scale) // %3 + : "cc", "memory", "v1", "v2"); +} + +// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row. +void GaussCol_NEON(const uint16_t* src0, + const uint16_t* src1, + const uint16_t* src2, + const uint16_t* src3, + const uint16_t* src4, + uint32_t* dst, + int width) { + asm volatile( + "movi v6.8h, #4 \n" // constant 4 + "movi v7.8h, #6 \n" // constant 6 + + "1: \n" + "ld1 {v1.8h}, [%0], #16 \n" // load 8 samples, 5 rows + "ld1 {v2.8h}, [%4], #16 \n" + "uaddl v0.4s, v1.4h, v2.4h \n" // * 1 + "uaddl2 v1.4s, v1.8h, v2.8h \n" // * 1 + "ld1 {v2.8h}, [%1], #16 \n" + "umlal v0.4s, v2.4h, v6.4h \n" // * 4 + "umlal2 v1.4s, v2.8h, v6.8h \n" // * 4 + "ld1 {v2.8h}, [%2], #16 \n" + "umlal v0.4s, v2.4h, v7.4h \n" // * 6 + "umlal2 v1.4s, v2.8h, v7.8h \n" // * 6 + "ld1 {v2.8h}, [%3], #16 \n" + "umlal v0.4s, v2.4h, v6.4h \n" // * 4 + "umlal2 v1.4s, v2.8h, v6.8h \n" // * 4 + "subs %w6, %w6, #8 \n" // 8 processed per loop + "st1 {v0.4s,v1.4s}, [%5], #32 \n" // store 8 samples + "b.gt 1b \n" + : "+r"(src0), // %0 + "+r"(src1), // %1 + "+r"(src2), // %2 + "+r"(src3), // %3 + "+r"(src4), // %4 + "+r"(dst), // %5 + "+r"(width) // %6 + : + : "cc", "memory", "v0", "v1", "v2", "v6", "v7"); +} + +// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row. +void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) { + const uint32_t* src1 = src + 1; + const uint32_t* src2 = src + 2; + const uint32_t* src3 = src + 3; + asm volatile( + "movi v6.4s, #4 \n" // constant 4 + "movi v7.4s, #6 \n" // constant 6 + + "1: \n" + "ld1 {v0.4s,v1.4s,v2.4s}, [%0], %6 \n" // load 12 source samples + "add v0.4s, v0.4s, v1.4s \n" // * 1 + "add v1.4s, v1.4s, v2.4s \n" // * 1 + "ld1 {v2.4s,v3.4s}, [%2], #32 \n" + "mla v0.4s, v2.4s, v7.4s \n" // * 6 + "mla v1.4s, v3.4s, v7.4s \n" // * 6 + "ld1 {v2.4s,v3.4s}, [%1], #32 \n" + "ld1 {v4.4s,v5.4s}, [%3], #32 \n" + "add v2.4s, v2.4s, v4.4s \n" // add rows for * 4 + "add v3.4s, v3.4s, v5.4s \n" + "mla v0.4s, v2.4s, v6.4s \n" // * 4 + "mla v1.4s, v3.4s, v6.4s \n" // * 4 + "subs %w5, %w5, #8 \n" // 8 processed per loop + "uqrshrn v0.4h, v0.4s, #8 \n" // round and pack + "uqrshrn2 v0.8h, v1.4s, #8 \n" + "st1 {v0.8h}, [%4], #16 \n" // store 8 samples + "b.gt 1b \n" + : "+r"(src), // %0 + "+r"(src1), // %1 + "+r"(src2), // %2 + "+r"(src3), // %3 + "+r"(dst), // %4 + "+r"(width) // %5 + : "r"(32LL) // %6 + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); +} + +#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/media/libyuv/libyuv/source/row_win.cc b/media/libyuv/libyuv/source/row_win.cc new file mode 100644 index 0000000000..5500d7f5a6 --- /dev/null +++ b/media/libyuv/libyuv/source/row_win.cc @@ -0,0 +1,6234 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. 
All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/row.h" + +// This module is for Visual C 32/64 bit and clangcl 32 bit +#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \ + (defined(_M_IX86) || (defined(_M_X64) && !defined(__clang__))) + +#if defined(_M_X64) +#include <emmintrin.h> +#include <tmmintrin.h> // For _mm_maddubs_epi16 +#endif + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// 64 bit +#if defined(_M_X64) + +// Read 4 UV from 422, upsample to 8 UV. +#define READYUV422 \ + xmm0 = _mm_cvtsi32_si128(*(uint32_t*)u_buf); \ + xmm1 = _mm_cvtsi32_si128(*(uint32_t*)(u_buf + offset)); \ + xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \ + xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); \ + u_buf += 4; \ + xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \ + xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \ + y_buf += 8; + +// Read 4 UV from 422, upsample to 8 UV. With 8 Alpha. +#define READYUVA422 \ + xmm0 = _mm_cvtsi32_si128(*(uint32_t*)u_buf); \ + xmm1 = _mm_cvtsi32_si128(*(uint32_t*)(u_buf + offset)); \ + xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \ + xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); \ + u_buf += 4; \ + xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \ + xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \ + y_buf += 8; \ + xmm5 = _mm_loadl_epi64((__m128i*)a_buf); \ + a_buf += 8; + +// Convert 8 pixels: 8 UV and 8 Y. +#define YUVTORGB(yuvconstants) \ + xmm1 = _mm_loadu_si128(&xmm0); \ + xmm2 = _mm_loadu_si128(&xmm0); \ + xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)yuvconstants->kUVToB); \ + xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)yuvconstants->kUVToG); \ + xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)yuvconstants->kUVToR); \ + xmm0 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasB, xmm0); \ + xmm1 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasG, xmm1); \ + xmm2 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasR, xmm2); \ + xmm4 = _mm_mulhi_epu16(xmm4, *(__m128i*)yuvconstants->kYToRgb); \ + xmm0 = _mm_adds_epi16(xmm0, xmm4); \ + xmm1 = _mm_adds_epi16(xmm1, xmm4); \ + xmm2 = _mm_adds_epi16(xmm2, xmm4); \ + xmm0 = _mm_srai_epi16(xmm0, 6); \ + xmm1 = _mm_srai_epi16(xmm1, 6); \ + xmm2 = _mm_srai_epi16(xmm2, 6); \ + xmm0 = _mm_packus_epi16(xmm0, xmm0); \ + xmm1 = _mm_packus_epi16(xmm1, xmm1); \ + xmm2 = _mm_packus_epi16(xmm2, xmm2); + +// Store 8 ARGB values. 
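Before the STOREARGB macro below interleaves the channels, YUVTORGB above has already produced eight clamped B, G and R values in 6-bit fixed point. A rough scalar restatement of that arithmetic (a sketch only; the actual coefficients and biases come from the struct YuvConstants tables, and the helper names here are illustrative):

    #include <stdint.h>

    static inline uint8_t Clamp255(int v) {
      return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));  // packuswb-style saturation
    }

    // One output channel: uv_u/uv_v correspond to a kUVToB/G/R coefficient pair,
    // bias to the matching kUVBias value, and ygain to kYToRgb.
    static inline uint8_t YuvToChannel(int y, int u, int v,
                                       int uv_u, int uv_v, int bias, int ygain) {
      int uv = u * uv_u + v * uv_v;                             // pmaddubsw over interleaved UV
      int ycontrib = (int)(((int64_t)y * 0x0101 * ygain) >> 16);  // Y duplicated, then pmulhuw
      return Clamp255((bias - uv + ycontrib) >> 6);             // sub, adds, srai 6, packus
    }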
+#define STOREARGB \ + xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \ + xmm2 = _mm_unpacklo_epi8(xmm2, xmm5); \ + xmm1 = _mm_loadu_si128(&xmm0); \ + xmm0 = _mm_unpacklo_epi16(xmm0, xmm2); \ + xmm1 = _mm_unpackhi_epi16(xmm1, xmm2); \ + _mm_storeu_si128((__m128i*)dst_argb, xmm0); \ + _mm_storeu_si128((__m128i*)(dst_argb + 16), xmm1); \ + dst_argb += 32; + +#if defined(HAS_I422TOARGBROW_SSSE3) +void I422ToARGBRow_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + __m128i xmm0, xmm1, xmm2, xmm4; + const __m128i xmm5 = _mm_set1_epi8(-1); + const ptrdiff_t offset = (uint8_t*)v_buf - (uint8_t*)u_buf; + while (width > 0) { + READYUV422 + YUVTORGB(yuvconstants) + STOREARGB + width -= 8; + } +} +#endif + +#if defined(HAS_I422ALPHATOARGBROW_SSSE3) +void I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + const uint8_t* a_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + __m128i xmm0, xmm1, xmm2, xmm4, xmm5; + const ptrdiff_t offset = (uint8_t*)v_buf - (uint8_t*)u_buf; + while (width > 0) { + READYUVA422 + YUVTORGB(yuvconstants) + STOREARGB + width -= 8; + } +} +#endif + +// 32 bit +#else // defined(_M_X64) +#ifdef HAS_ARGBTOYROW_SSSE3 + +// Constants for ARGB. +static const vec8 kARGBToY = {13, 65, 33, 0, 13, 65, 33, 0, + 13, 65, 33, 0, 13, 65, 33, 0}; + +// JPeg full range. +static const vec8 kARGBToYJ = {15, 75, 38, 0, 15, 75, 38, 0, + 15, 75, 38, 0, 15, 75, 38, 0}; + +static const vec8 kARGBToU = {112, -74, -38, 0, 112, -74, -38, 0, + 112, -74, -38, 0, 112, -74, -38, 0}; + +static const vec8 kARGBToUJ = {127, -84, -43, 0, 127, -84, -43, 0, + 127, -84, -43, 0, 127, -84, -43, 0}; + +static const vec8 kARGBToV = { + -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, +}; + +static const vec8 kARGBToVJ = {-20, -107, 127, 0, -20, -107, 127, 0, + -20, -107, 127, 0, -20, -107, 127, 0}; + +// vpshufb for vphaddw + vpackuswb packed to shorts. +static const lvec8 kShufARGBToUV_AVX = { + 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, + 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15}; + +// Constants for BGRA. +static const vec8 kBGRAToY = {0, 33, 65, 13, 0, 33, 65, 13, + 0, 33, 65, 13, 0, 33, 65, 13}; + +static const vec8 kBGRAToU = {0, -38, -74, 112, 0, -38, -74, 112, + 0, -38, -74, 112, 0, -38, -74, 112}; + +static const vec8 kBGRAToV = {0, 112, -94, -18, 0, 112, -94, -18, + 0, 112, -94, -18, 0, 112, -94, -18}; + +// Constants for ABGR. +static const vec8 kABGRToY = {33, 65, 13, 0, 33, 65, 13, 0, + 33, 65, 13, 0, 33, 65, 13, 0}; + +static const vec8 kABGRToU = {-38, -74, 112, 0, -38, -74, 112, 0, + -38, -74, 112, 0, -38, -74, 112, 0}; + +static const vec8 kABGRToV = {112, -94, -18, 0, 112, -94, -18, 0, + 112, -94, -18, 0, 112, -94, -18, 0}; + +// Constants for RGBA. +static const vec8 kRGBAToY = {0, 13, 65, 33, 0, 13, 65, 33, + 0, 13, 65, 33, 0, 13, 65, 33}; + +static const vec8 kRGBAToU = {0, 112, -74, -38, 0, 112, -74, -38, + 0, 112, -74, -38, 0, 112, -74, -38}; + +static const vec8 kRGBAToV = {0, -18, -94, 112, 0, -18, -94, 112, + 0, -18, -94, 112, 0, -18, -94, 112}; + +static const uvec8 kAddY16 = {16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, + 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u}; + +// 7 bit fixed point 0.5. 
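The coefficient tables just defined feed the pmaddubsw-based row converters later in this file. As a scalar reference for the chroma computation they encode (a sketch, assuming the libyuv B,G,R,A byte order and BT.601 studio range), with kAddYJ64 below providing the 0.5 rounding term that the full-range (J) paths add before their shift:

    #include <stdint.h>

    // Applied to 2x2-averaged pixels by ARGBToUVRow; illustrative only.
    static inline uint8_t ARGBPixelToU(int b, int g, int r) {
      return (uint8_t)(((112 * b - 74 * g - 38 * r) >> 8) + 128);   // kARGBToU, kAddUV128
    }
    static inline uint8_t ARGBPixelToV(int b, int g, int r) {
      return (uint8_t)(((-18 * b - 94 * g + 112 * r) >> 8) + 128);  // kARGBToV, kAddUV128
    }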
+static const vec16 kAddYJ64 = {64, 64, 64, 64, 64, 64, 64, 64}; + +static const uvec8 kAddUV128 = {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, + 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u}; + +static const uvec16 kAddUVJ128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u, + 0x8080u, 0x8080u, 0x8080u, 0x8080u}; + +// Shuffle table for converting RGB24 to ARGB. +static const uvec8 kShuffleMaskRGB24ToARGB = { + 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u}; + +// Shuffle table for converting RAW to ARGB. +static const uvec8 kShuffleMaskRAWToARGB = {2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, + 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u}; + +// Shuffle table for converting RAW to RGB24. First 8. +static const uvec8 kShuffleMaskRAWToRGB24_0 = { + 2u, 1u, 0u, 5u, 4u, 3u, 8u, 7u, + 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u}; + +// Shuffle table for converting RAW to RGB24. Middle 8. +static const uvec8 kShuffleMaskRAWToRGB24_1 = { + 2u, 7u, 6u, 5u, 10u, 9u, 8u, 13u, + 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u}; + +// Shuffle table for converting RAW to RGB24. Last 8. +static const uvec8 kShuffleMaskRAWToRGB24_2 = { + 8u, 7u, 12u, 11u, 10u, 15u, 14u, 13u, + 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u}; + +// Shuffle table for converting ARGB to RGB24. +static const uvec8 kShuffleMaskARGBToRGB24 = { + 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u}; + +// Shuffle table for converting ARGB to RAW. +static const uvec8 kShuffleMaskARGBToRAW = { + 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u}; + +// Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4 +static const uvec8 kShuffleMaskARGBToRGB24_0 = { + 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u}; + +// YUY2 shuf 16 Y to 32 Y. +static const lvec8 kShuffleYUY2Y = {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, + 10, 12, 12, 14, 14, 0, 0, 2, 2, 4, 4, + 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}; + +// YUY2 shuf 8 UV to 16 UV. +static const lvec8 kShuffleYUY2UV = {1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, + 11, 13, 15, 13, 15, 1, 3, 1, 3, 5, 7, + 5, 7, 9, 11, 9, 11, 13, 15, 13, 15}; + +// UYVY shuf 16 Y to 32 Y. +static const lvec8 kShuffleUYVYY = {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, + 11, 13, 13, 15, 15, 1, 1, 3, 3, 5, 5, + 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}; + +// UYVY shuf 8 UV to 16 UV. +static const lvec8 kShuffleUYVYUV = {0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, + 10, 12, 14, 12, 14, 0, 2, 0, 2, 4, 6, + 4, 6, 8, 10, 8, 10, 12, 14, 12, 14}; + +// NV21 shuf 8 VU to 16 UV. +static const lvec8 kShuffleNV21 = { + 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6, + 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6, +}; + +// Duplicates gray value 3 times and fills in alpha opaque. +__declspec(naked) void J400ToARGBRow_SSE2(const uint8_t* src_y, + uint8_t* dst_argb, + int width) { + __asm { + mov eax, [esp + 4] // src_y + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // width + pcmpeqb xmm5, xmm5 // generate mask 0xff000000 + pslld xmm5, 24 + + convertloop: + movq xmm0, qword ptr [eax] + lea eax, [eax + 8] + punpcklbw xmm0, xmm0 + movdqa xmm1, xmm0 + punpcklwd xmm0, xmm0 + punpckhwd xmm1, xmm1 + por xmm0, xmm5 + por xmm1, xmm5 + movdqu [edx], xmm0 + movdqu [edx + 16], xmm1 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + ret + } +} + +#ifdef HAS_J400TOARGBROW_AVX2 +// Duplicates gray value 3 times and fills in alpha opaque. 
+__declspec(naked) void J400ToARGBRow_AVX2(const uint8_t* src_y, + uint8_t* dst_argb, + int width) { + __asm { + mov eax, [esp + 4] // src_y + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // width + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000 + vpslld ymm5, ymm5, 24 + + convertloop: + vmovdqu xmm0, [eax] + lea eax, [eax + 16] + vpermq ymm0, ymm0, 0xd8 + vpunpcklbw ymm0, ymm0, ymm0 + vpermq ymm0, ymm0, 0xd8 + vpunpckhwd ymm1, ymm0, ymm0 + vpunpcklwd ymm0, ymm0, ymm0 + vpor ymm0, ymm0, ymm5 + vpor ymm1, ymm1, ymm5 + vmovdqu [edx], ymm0 + vmovdqu [edx + 32], ymm1 + lea edx, [edx + 64] + sub ecx, 16 + jg convertloop + vzeroupper + ret + } +} +#endif // HAS_J400TOARGBROW_AVX2 + +__declspec(naked) void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24, + uint8_t* dst_argb, + int width) { + __asm { + mov eax, [esp + 4] // src_rgb24 + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // width + pcmpeqb xmm5, xmm5 // generate mask 0xff000000 + pslld xmm5, 24 + movdqa xmm4, xmmword ptr kShuffleMaskRGB24ToARGB + + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm3, [eax + 32] + lea eax, [eax + 48] + movdqa xmm2, xmm3 + palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]} + pshufb xmm2, xmm4 + por xmm2, xmm5 + palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]} + pshufb xmm0, xmm4 + movdqu [edx + 32], xmm2 + por xmm0, xmm5 + pshufb xmm1, xmm4 + movdqu [edx], xmm0 + por xmm1, xmm5 + palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]} + pshufb xmm3, xmm4 + movdqu [edx + 16], xmm1 + por xmm3, xmm5 + movdqu [edx + 48], xmm3 + lea edx, [edx + 64] + sub ecx, 16 + jg convertloop + ret + } +} + +__declspec(naked) void RAWToARGBRow_SSSE3(const uint8_t* src_raw, + uint8_t* dst_argb, + int width) { + __asm { + mov eax, [esp + 4] // src_raw + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // width + pcmpeqb xmm5, xmm5 // generate mask 0xff000000 + pslld xmm5, 24 + movdqa xmm4, xmmword ptr kShuffleMaskRAWToARGB + + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm3, [eax + 32] + lea eax, [eax + 48] + movdqa xmm2, xmm3 + palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]} + pshufb xmm2, xmm4 + por xmm2, xmm5 + palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]} + pshufb xmm0, xmm4 + movdqu [edx + 32], xmm2 + por xmm0, xmm5 + pshufb xmm1, xmm4 + movdqu [edx], xmm0 + por xmm1, xmm5 + palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]} + pshufb xmm3, xmm4 + movdqu [edx + 16], xmm1 + por xmm3, xmm5 + movdqu [edx + 48], xmm3 + lea edx, [edx + 64] + sub ecx, 16 + jg convertloop + ret + } +} + +__declspec(naked) void RAWToRGB24Row_SSSE3(const uint8_t* src_raw, + uint8_t* dst_rgb24, + int width) { + __asm { + mov eax, [esp + 4] // src_raw + mov edx, [esp + 8] // dst_rgb24 + mov ecx, [esp + 12] // width + movdqa xmm3, xmmword ptr kShuffleMaskRAWToRGB24_0 + movdqa xmm4, xmmword ptr kShuffleMaskRAWToRGB24_1 + movdqa xmm5, xmmword ptr kShuffleMaskRAWToRGB24_2 + + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 4] + movdqu xmm2, [eax + 8] + lea eax, [eax + 24] + pshufb xmm0, xmm3 + pshufb xmm1, xmm4 + pshufb xmm2, xmm5 + movq qword ptr [edx], xmm0 + movq qword ptr [edx + 8], xmm1 + movq qword ptr [edx + 16], xmm2 + lea edx, [edx + 24] + sub ecx, 8 + jg convertloop + ret + } +} + +// pmul method to replicate bits. +// Math to replicate bits: +// (v << 8) | (v << 3) +// v * 256 + v * 8 +// v * (256 + 8) +// G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3 +// 20 instructions. 
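In scalar terms, the multiply trick described above replicates a channel's top bits into its vacated low bits; an illustrative sketch of what RGB565ToARGBRow_SSE2 below computes per channel:

    #include <stdint.h>

    // (v * 0x0108) on a 5-bit value held in the top of a 16-bit lane is the
    // pmulhuw form of this classic bit replication.
    static inline uint8_t Expand5To8(uint8_t v) {  // v in [0, 31]
      return (uint8_t)((v << 3) | (v >> 2));
    }
    static inline uint8_t Expand6To8(uint8_t v) {  // v in [0, 63], the green channel
      return (uint8_t)((v << 2) | (v >> 4));
    }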
+__declspec(naked) void RGB565ToARGBRow_SSE2(const uint8_t* src_rgb565, + uint8_t* dst_argb, + int width) { + __asm { + mov eax, 0x01080108 // generate multiplier to repeat 5 bits + movd xmm5, eax + pshufd xmm5, xmm5, 0 + mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits + movd xmm6, eax + pshufd xmm6, xmm6, 0 + pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red + psllw xmm3, 11 + pcmpeqb xmm4, xmm4 // generate mask 0x07e007e0 for Green + psllw xmm4, 10 + psrlw xmm4, 5 + pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha + psllw xmm7, 8 + + mov eax, [esp + 4] // src_rgb565 + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // width + sub edx, eax + sub edx, eax + + convertloop: + movdqu xmm0, [eax] // fetch 8 pixels of bgr565 + movdqa xmm1, xmm0 + movdqa xmm2, xmm0 + pand xmm1, xmm3 // R in upper 5 bits + psllw xmm2, 11 // B in upper 5 bits + pmulhuw xmm1, xmm5 // * (256 + 8) + pmulhuw xmm2, xmm5 // * (256 + 8) + psllw xmm1, 8 + por xmm1, xmm2 // RB + pand xmm0, xmm4 // G in middle 6 bits + pmulhuw xmm0, xmm6 // << 5 * (256 + 4) + por xmm0, xmm7 // AG + movdqa xmm2, xmm1 + punpcklbw xmm1, xmm0 + punpckhbw xmm2, xmm0 + movdqu [eax * 2 + edx], xmm1 // store 4 pixels of ARGB + movdqu [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB + lea eax, [eax + 16] + sub ecx, 8 + jg convertloop + ret + } +} + +#ifdef HAS_RGB565TOARGBROW_AVX2 +// pmul method to replicate bits. +// Math to replicate bits: +// (v << 8) | (v << 3) +// v * 256 + v * 8 +// v * (256 + 8) +// G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3 +__declspec(naked) void RGB565ToARGBRow_AVX2(const uint8_t* src_rgb565, + uint8_t* dst_argb, + int width) { + __asm { + mov eax, 0x01080108 // generate multiplier to repeat 5 bits + vmovd xmm5, eax + vbroadcastss ymm5, xmm5 + mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits + vmovd xmm6, eax + vbroadcastss ymm6, xmm6 + vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red + vpsllw ymm3, ymm3, 11 + vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x07e007e0 for Green + vpsllw ymm4, ymm4, 10 + vpsrlw ymm4, ymm4, 5 + vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha + vpsllw ymm7, ymm7, 8 + + mov eax, [esp + 4] // src_rgb565 + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // width + sub edx, eax + sub edx, eax + + convertloop: + vmovdqu ymm0, [eax] // fetch 16 pixels of bgr565 + vpand ymm1, ymm0, ymm3 // R in upper 5 bits + vpsllw ymm2, ymm0, 11 // B in upper 5 bits + vpmulhuw ymm1, ymm1, ymm5 // * (256 + 8) + vpmulhuw ymm2, ymm2, ymm5 // * (256 + 8) + vpsllw ymm1, ymm1, 8 + vpor ymm1, ymm1, ymm2 // RB + vpand ymm0, ymm0, ymm4 // G in middle 6 bits + vpmulhuw ymm0, ymm0, ymm6 // << 5 * (256 + 4) + vpor ymm0, ymm0, ymm7 // AG + vpermq ymm0, ymm0, 0xd8 // mutate for unpack + vpermq ymm1, ymm1, 0xd8 + vpunpckhbw ymm2, ymm1, ymm0 + vpunpcklbw ymm1, ymm1, ymm0 + vmovdqu [eax * 2 + edx], ymm1 // store 4 pixels of ARGB + vmovdqu [eax * 2 + edx + 32], ymm2 // store next 4 pixels of ARGB + lea eax, [eax + 32] + sub ecx, 16 + jg convertloop + vzeroupper + ret + } +} +#endif // HAS_RGB565TOARGBROW_AVX2 + +#ifdef HAS_ARGB1555TOARGBROW_AVX2 +__declspec(naked) void ARGB1555ToARGBRow_AVX2(const uint8_t* src_argb1555, + uint8_t* dst_argb, + int width) { + __asm { + mov eax, 0x01080108 // generate multiplier to repeat 5 bits + vmovd xmm5, eax + vbroadcastss ymm5, xmm5 + mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits + vmovd xmm6, eax + vbroadcastss ymm6, xmm6 + vpcmpeqb ymm3, ymm3, ymm3 // generate 
mask 0xf800f800 for Red + vpsllw ymm3, ymm3, 11 + vpsrlw ymm4, ymm3, 6 // generate mask 0x03e003e0 for Green + vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha + vpsllw ymm7, ymm7, 8 + + mov eax, [esp + 4] // src_argb1555 + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // width + sub edx, eax + sub edx, eax + + convertloop: + vmovdqu ymm0, [eax] // fetch 16 pixels of 1555 + vpsllw ymm1, ymm0, 1 // R in upper 5 bits + vpsllw ymm2, ymm0, 11 // B in upper 5 bits + vpand ymm1, ymm1, ymm3 + vpmulhuw ymm2, ymm2, ymm5 // * (256 + 8) + vpmulhuw ymm1, ymm1, ymm5 // * (256 + 8) + vpsllw ymm1, ymm1, 8 + vpor ymm1, ymm1, ymm2 // RB + vpsraw ymm2, ymm0, 8 // A + vpand ymm0, ymm0, ymm4 // G in middle 5 bits + vpmulhuw ymm0, ymm0, ymm6 // << 6 * (256 + 8) + vpand ymm2, ymm2, ymm7 + vpor ymm0, ymm0, ymm2 // AG + vpermq ymm0, ymm0, 0xd8 // mutate for unpack + vpermq ymm1, ymm1, 0xd8 + vpunpckhbw ymm2, ymm1, ymm0 + vpunpcklbw ymm1, ymm1, ymm0 + vmovdqu [eax * 2 + edx], ymm1 // store 8 pixels of ARGB + vmovdqu [eax * 2 + edx + 32], ymm2 // store next 8 pixels of ARGB + lea eax, [eax + 32] + sub ecx, 16 + jg convertloop + vzeroupper + ret + } +} +#endif // HAS_ARGB1555TOARGBROW_AVX2 + +#ifdef HAS_ARGB4444TOARGBROW_AVX2 +__declspec(naked) void ARGB4444ToARGBRow_AVX2(const uint8_t* src_argb4444, + uint8_t* dst_argb, + int width) { + __asm { + mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f + vmovd xmm4, eax + vbroadcastss ymm4, xmm4 + vpslld ymm5, ymm4, 4 // 0xf0f0f0f0 for high nibbles + mov eax, [esp + 4] // src_argb4444 + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // width + sub edx, eax + sub edx, eax + + convertloop: + vmovdqu ymm0, [eax] // fetch 16 pixels of bgra4444 + vpand ymm2, ymm0, ymm5 // mask high nibbles + vpand ymm0, ymm0, ymm4 // mask low nibbles + vpsrlw ymm3, ymm2, 4 + vpsllw ymm1, ymm0, 4 + vpor ymm2, ymm2, ymm3 + vpor ymm0, ymm0, ymm1 + vpermq ymm0, ymm0, 0xd8 // mutate for unpack + vpermq ymm2, ymm2, 0xd8 + vpunpckhbw ymm1, ymm0, ymm2 + vpunpcklbw ymm0, ymm0, ymm2 + vmovdqu [eax * 2 + edx], ymm0 // store 8 pixels of ARGB + vmovdqu [eax * 2 + edx + 32], ymm1 // store next 8 pixels of ARGB + lea eax, [eax + 32] + sub ecx, 16 + jg convertloop + vzeroupper + ret + } +} +#endif // HAS_ARGB4444TOARGBROW_AVX2 + +// 24 instructions +__declspec(naked) void ARGB1555ToARGBRow_SSE2(const uint8_t* src_argb1555, + uint8_t* dst_argb, + int width) { + __asm { + mov eax, 0x01080108 // generate multiplier to repeat 5 bits + movd xmm5, eax + pshufd xmm5, xmm5, 0 + mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits + movd xmm6, eax + pshufd xmm6, xmm6, 0 + pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red + psllw xmm3, 11 + movdqa xmm4, xmm3 // generate mask 0x03e003e0 for Green + psrlw xmm4, 6 + pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha + psllw xmm7, 8 + + mov eax, [esp + 4] // src_argb1555 + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // width + sub edx, eax + sub edx, eax + + convertloop: + movdqu xmm0, [eax] // fetch 8 pixels of 1555 + movdqa xmm1, xmm0 + movdqa xmm2, xmm0 + psllw xmm1, 1 // R in upper 5 bits + psllw xmm2, 11 // B in upper 5 bits + pand xmm1, xmm3 + pmulhuw xmm2, xmm5 // * (256 + 8) + pmulhuw xmm1, xmm5 // * (256 + 8) + psllw xmm1, 8 + por xmm1, xmm2 // RB + movdqa xmm2, xmm0 + pand xmm0, xmm4 // G in middle 5 bits + psraw xmm2, 8 // A + pmulhuw xmm0, xmm6 // << 6 * (256 + 8) + pand xmm2, xmm7 + por xmm0, xmm2 // AG + movdqa xmm2, xmm1 + punpcklbw xmm1, xmm0 + punpckhbw xmm2, xmm0 + movdqu [eax * 2 + edx], xmm1 
// store 4 pixels of ARGB + movdqu [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB + lea eax, [eax + 16] + sub ecx, 8 + jg convertloop + ret + } +} + +// 18 instructions. +__declspec(naked) void ARGB4444ToARGBRow_SSE2(const uint8_t* src_argb4444, + uint8_t* dst_argb, + int width) { + __asm { + mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f + movd xmm4, eax + pshufd xmm4, xmm4, 0 + movdqa xmm5, xmm4 // 0xf0f0f0f0 for high nibbles + pslld xmm5, 4 + mov eax, [esp + 4] // src_argb4444 + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // width + sub edx, eax + sub edx, eax + + convertloop: + movdqu xmm0, [eax] // fetch 8 pixels of bgra4444 + movdqa xmm2, xmm0 + pand xmm0, xmm4 // mask low nibbles + pand xmm2, xmm5 // mask high nibbles + movdqa xmm1, xmm0 + movdqa xmm3, xmm2 + psllw xmm1, 4 + psrlw xmm3, 4 + por xmm0, xmm1 + por xmm2, xmm3 + movdqa xmm1, xmm0 + punpcklbw xmm0, xmm2 + punpckhbw xmm1, xmm2 + movdqu [eax * 2 + edx], xmm0 // store 4 pixels of ARGB + movdqu [eax * 2 + edx + 16], xmm1 // store next 4 pixels of ARGB + lea eax, [eax + 16] + sub ecx, 8 + jg convertloop + ret + } +} + +__declspec(naked) void ARGBToRGB24Row_SSSE3(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width) { + __asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb + mov ecx, [esp + 12] // width + movdqa xmm6, xmmword ptr kShuffleMaskARGBToRGB24 + + convertloop: + movdqu xmm0, [eax] // fetch 16 pixels of argb + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] + lea eax, [eax + 64] + pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB + pshufb xmm1, xmm6 + pshufb xmm2, xmm6 + pshufb xmm3, xmm6 + movdqa xmm4, xmm1 // 4 bytes from 1 for 0 + psrldq xmm1, 4 // 8 bytes from 1 + pslldq xmm4, 12 // 4 bytes from 1 for 0 + movdqa xmm5, xmm2 // 8 bytes from 2 for 1 + por xmm0, xmm4 // 4 bytes from 1 for 0 + pslldq xmm5, 8 // 8 bytes from 2 for 1 + movdqu [edx], xmm0 // store 0 + por xmm1, xmm5 // 8 bytes from 2 for 1 + psrldq xmm2, 8 // 4 bytes from 2 + pslldq xmm3, 4 // 12 bytes from 3 for 2 + por xmm2, xmm3 // 12 bytes from 3 for 2 + movdqu [edx + 16], xmm1 // store 1 + movdqu [edx + 32], xmm2 // store 2 + lea edx, [edx + 48] + sub ecx, 16 + jg convertloop + ret + } +} + +__declspec(naked) void ARGBToRAWRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width) { + __asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb + mov ecx, [esp + 12] // width + movdqa xmm6, xmmword ptr kShuffleMaskARGBToRAW + + convertloop: + movdqu xmm0, [eax] // fetch 16 pixels of argb + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] + lea eax, [eax + 64] + pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB + pshufb xmm1, xmm6 + pshufb xmm2, xmm6 + pshufb xmm3, xmm6 + movdqa xmm4, xmm1 // 4 bytes from 1 for 0 + psrldq xmm1, 4 // 8 bytes from 1 + pslldq xmm4, 12 // 4 bytes from 1 for 0 + movdqa xmm5, xmm2 // 8 bytes from 2 for 1 + por xmm0, xmm4 // 4 bytes from 1 for 0 + pslldq xmm5, 8 // 8 bytes from 2 for 1 + movdqu [edx], xmm0 // store 0 + por xmm1, xmm5 // 8 bytes from 2 for 1 + psrldq xmm2, 8 // 4 bytes from 2 + pslldq xmm3, 4 // 12 bytes from 3 for 2 + por xmm2, xmm3 // 12 bytes from 3 for 2 + movdqu [edx + 16], xmm1 // store 1 + movdqu [edx + 32], xmm2 // store 2 + lea edx, [edx + 48] + sub ecx, 16 + jg convertloop + ret + } +} + +__declspec(naked) void ARGBToRGB565Row_SSE2(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width) { + __asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // 
dst_rgb + mov ecx, [esp + 12] // width + pcmpeqb xmm3, xmm3 // generate mask 0x0000001f + psrld xmm3, 27 + pcmpeqb xmm4, xmm4 // generate mask 0x000007e0 + psrld xmm4, 26 + pslld xmm4, 5 + pcmpeqb xmm5, xmm5 // generate mask 0xfffff800 + pslld xmm5, 11 + + convertloop: + movdqu xmm0, [eax] // fetch 4 pixels of argb + movdqa xmm1, xmm0 // B + movdqa xmm2, xmm0 // G + pslld xmm0, 8 // R + psrld xmm1, 3 // B + psrld xmm2, 5 // G + psrad xmm0, 16 // R + pand xmm1, xmm3 // B + pand xmm2, xmm4 // G + pand xmm0, xmm5 // R + por xmm1, xmm2 // BG + por xmm0, xmm1 // BGR + packssdw xmm0, xmm0 + lea eax, [eax + 16] + movq qword ptr [edx], xmm0 // store 4 pixels of RGB565 + lea edx, [edx + 8] + sub ecx, 4 + jg convertloop + ret + } +} + +__declspec(naked) void ARGBToRGB565DitherRow_SSE2(const uint8_t* src_argb, + uint8_t* dst_rgb, + const uint32_t dither4, + int width) { + __asm { + + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb + movd xmm6, [esp + 12] // dither4 + mov ecx, [esp + 16] // width + punpcklbw xmm6, xmm6 // make dither 16 bytes + movdqa xmm7, xmm6 + punpcklwd xmm6, xmm6 + punpckhwd xmm7, xmm7 + pcmpeqb xmm3, xmm3 // generate mask 0x0000001f + psrld xmm3, 27 + pcmpeqb xmm4, xmm4 // generate mask 0x000007e0 + psrld xmm4, 26 + pslld xmm4, 5 + pcmpeqb xmm5, xmm5 // generate mask 0xfffff800 + pslld xmm5, 11 + + convertloop: + movdqu xmm0, [eax] // fetch 4 pixels of argb + paddusb xmm0, xmm6 // add dither + movdqa xmm1, xmm0 // B + movdqa xmm2, xmm0 // G + pslld xmm0, 8 // R + psrld xmm1, 3 // B + psrld xmm2, 5 // G + psrad xmm0, 16 // R + pand xmm1, xmm3 // B + pand xmm2, xmm4 // G + pand xmm0, xmm5 // R + por xmm1, xmm2 // BG + por xmm0, xmm1 // BGR + packssdw xmm0, xmm0 + lea eax, [eax + 16] + movq qword ptr [edx], xmm0 // store 4 pixels of RGB565 + lea edx, [edx + 8] + sub ecx, 4 + jg convertloop + ret + } +} + +#ifdef HAS_ARGBTORGB565DITHERROW_AVX2 +__declspec(naked) void ARGBToRGB565DitherRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_rgb, + const uint32_t dither4, + int width) { + __asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb + vbroadcastss xmm6, [esp + 12] // dither4 + mov ecx, [esp + 16] // width + vpunpcklbw xmm6, xmm6, xmm6 // make dither 32 bytes + vpermq ymm6, ymm6, 0xd8 + vpunpcklwd ymm6, ymm6, ymm6 + vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0x0000001f + vpsrld ymm3, ymm3, 27 + vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x000007e0 + vpsrld ymm4, ymm4, 26 + vpslld ymm4, ymm4, 5 + vpslld ymm5, ymm3, 11 // generate mask 0x0000f800 + + convertloop: + vmovdqu ymm0, [eax] // fetch 8 pixels of argb + vpaddusb ymm0, ymm0, ymm6 // add dither + vpsrld ymm2, ymm0, 5 // G + vpsrld ymm1, ymm0, 3 // B + vpsrld ymm0, ymm0, 8 // R + vpand ymm2, ymm2, ymm4 // G + vpand ymm1, ymm1, ymm3 // B + vpand ymm0, ymm0, ymm5 // R + vpor ymm1, ymm1, ymm2 // BG + vpor ymm0, ymm0, ymm1 // BGR + vpackusdw ymm0, ymm0, ymm0 + vpermq ymm0, ymm0, 0xd8 + lea eax, [eax + 32] + vmovdqu [edx], xmm0 // store 8 pixels of RGB565 + lea edx, [edx + 16] + sub ecx, 8 + jg convertloop + vzeroupper + ret + } +} +#endif // HAS_ARGBTORGB565DITHERROW_AVX2 + +// TODO(fbarchard): Improve sign extension/packing. 
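The RGB565 writers above reduce each pixel to 5-6-5 bits, the dither variants first adding a saturating per-pixel dither byte; the ARGB1555 and ARGB4444 writers that follow use the same shift-mask-pack pattern with 1-5-5-5 and 4-4-4-4 fields. A scalar sketch of the basic 565 packing (assuming B,G,R,A byte order):

    #include <stdint.h>

    // Scalar equivalent of the ARGBToRGB565 packing above; illustrative only.
    static inline uint16_t PackRGB565(uint8_t b, uint8_t g, uint8_t r) {
      return (uint16_t)((b >> 3) | ((g >> 2) << 5) | ((r >> 3) << 11));
    }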
+__declspec(naked) void ARGBToARGB1555Row_SSE2(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width) { + __asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb + mov ecx, [esp + 12] // width + pcmpeqb xmm4, xmm4 // generate mask 0x0000001f + psrld xmm4, 27 + movdqa xmm5, xmm4 // generate mask 0x000003e0 + pslld xmm5, 5 + movdqa xmm6, xmm4 // generate mask 0x00007c00 + pslld xmm6, 10 + pcmpeqb xmm7, xmm7 // generate mask 0xffff8000 + pslld xmm7, 15 + + convertloop: + movdqu xmm0, [eax] // fetch 4 pixels of argb + movdqa xmm1, xmm0 // B + movdqa xmm2, xmm0 // G + movdqa xmm3, xmm0 // R + psrad xmm0, 16 // A + psrld xmm1, 3 // B + psrld xmm2, 6 // G + psrld xmm3, 9 // R + pand xmm0, xmm7 // A + pand xmm1, xmm4 // B + pand xmm2, xmm5 // G + pand xmm3, xmm6 // R + por xmm0, xmm1 // BA + por xmm2, xmm3 // GR + por xmm0, xmm2 // BGRA + packssdw xmm0, xmm0 + lea eax, [eax + 16] + movq qword ptr [edx], xmm0 // store 4 pixels of ARGB1555 + lea edx, [edx + 8] + sub ecx, 4 + jg convertloop + ret + } +} + +__declspec(naked) void ARGBToARGB4444Row_SSE2(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width) { + __asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb + mov ecx, [esp + 12] // width + pcmpeqb xmm4, xmm4 // generate mask 0xf000f000 + psllw xmm4, 12 + movdqa xmm3, xmm4 // generate mask 0x00f000f0 + psrlw xmm3, 8 + + convertloop: + movdqu xmm0, [eax] // fetch 4 pixels of argb + movdqa xmm1, xmm0 + pand xmm0, xmm3 // low nibble + pand xmm1, xmm4 // high nibble + psrld xmm0, 4 + psrld xmm1, 8 + por xmm0, xmm1 + packuswb xmm0, xmm0 + lea eax, [eax + 16] + movq qword ptr [edx], xmm0 // store 4 pixels of ARGB4444 + lea edx, [edx + 8] + sub ecx, 4 + jg convertloop + ret + } +} + +#ifdef HAS_ARGBTORGB565ROW_AVX2 +__declspec(naked) void ARGBToRGB565Row_AVX2(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width) { + __asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb + mov ecx, [esp + 12] // width + vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0x0000001f + vpsrld ymm3, ymm3, 27 + vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x000007e0 + vpsrld ymm4, ymm4, 26 + vpslld ymm4, ymm4, 5 + vpslld ymm5, ymm3, 11 // generate mask 0x0000f800 + + convertloop: + vmovdqu ymm0, [eax] // fetch 8 pixels of argb + vpsrld ymm2, ymm0, 5 // G + vpsrld ymm1, ymm0, 3 // B + vpsrld ymm0, ymm0, 8 // R + vpand ymm2, ymm2, ymm4 // G + vpand ymm1, ymm1, ymm3 // B + vpand ymm0, ymm0, ymm5 // R + vpor ymm1, ymm1, ymm2 // BG + vpor ymm0, ymm0, ymm1 // BGR + vpackusdw ymm0, ymm0, ymm0 + vpermq ymm0, ymm0, 0xd8 + lea eax, [eax + 32] + vmovdqu [edx], xmm0 // store 8 pixels of RGB565 + lea edx, [edx + 16] + sub ecx, 8 + jg convertloop + vzeroupper + ret + } +} +#endif // HAS_ARGBTORGB565ROW_AVX2 + +#ifdef HAS_ARGBTOARGB1555ROW_AVX2 +__declspec(naked) void ARGBToARGB1555Row_AVX2(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width) { + __asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb + mov ecx, [esp + 12] // width + vpcmpeqb ymm4, ymm4, ymm4 + vpsrld ymm4, ymm4, 27 // generate mask 0x0000001f + vpslld ymm5, ymm4, 5 // generate mask 0x000003e0 + vpslld ymm6, ymm4, 10 // generate mask 0x00007c00 + vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xffff8000 + vpslld ymm7, ymm7, 15 + + convertloop: + vmovdqu ymm0, [eax] // fetch 8 pixels of argb + vpsrld ymm3, ymm0, 9 // R + vpsrld ymm2, ymm0, 6 // G + vpsrld ymm1, ymm0, 3 // B + vpsrad ymm0, ymm0, 16 // A + vpand ymm3, ymm3, ymm6 // R + vpand ymm2, ymm2, ymm5 // G + vpand ymm1, ymm1, ymm4 // B + vpand ymm0, 
ymm0, ymm7 // A + vpor ymm0, ymm0, ymm1 // BA + vpor ymm2, ymm2, ymm3 // GR + vpor ymm0, ymm0, ymm2 // BGRA + vpackssdw ymm0, ymm0, ymm0 + vpermq ymm0, ymm0, 0xd8 + lea eax, [eax + 32] + vmovdqu [edx], xmm0 // store 8 pixels of ARGB1555 + lea edx, [edx + 16] + sub ecx, 8 + jg convertloop + vzeroupper + ret + } +} +#endif // HAS_ARGBTOARGB1555ROW_AVX2 + +#ifdef HAS_ARGBTOARGB4444ROW_AVX2 +__declspec(naked) void ARGBToARGB4444Row_AVX2(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width) { + __asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb + mov ecx, [esp + 12] // width + vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xf000f000 + vpsllw ymm4, ymm4, 12 + vpsrlw ymm3, ymm4, 8 // generate mask 0x00f000f0 + + convertloop: + vmovdqu ymm0, [eax] // fetch 8 pixels of argb + vpand ymm1, ymm0, ymm4 // high nibble + vpand ymm0, ymm0, ymm3 // low nibble + vpsrld ymm1, ymm1, 8 + vpsrld ymm0, ymm0, 4 + vpor ymm0, ymm0, ymm1 + vpackuswb ymm0, ymm0, ymm0 + vpermq ymm0, ymm0, 0xd8 + lea eax, [eax + 32] + vmovdqu [edx], xmm0 // store 8 pixels of ARGB4444 + lea edx, [edx + 16] + sub ecx, 8 + jg convertloop + vzeroupper + ret + } +} +#endif // HAS_ARGBTOARGB4444ROW_AVX2 + +// Convert 16 ARGB pixels (64 bytes) to 16 Y values. +__declspec(naked) void ARGBToYRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_y, + int width) { + __asm { + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_y */ + mov ecx, [esp + 12] /* width */ + movdqa xmm4, xmmword ptr kARGBToY + movdqa xmm5, xmmword ptr kAddY16 + + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] + pmaddubsw xmm0, xmm4 + pmaddubsw xmm1, xmm4 + pmaddubsw xmm2, xmm4 + pmaddubsw xmm3, xmm4 + lea eax, [eax + 64] + phaddw xmm0, xmm1 + phaddw xmm2, xmm3 + psrlw xmm0, 7 + psrlw xmm2, 7 + packuswb xmm0, xmm2 + paddb xmm0, xmm5 + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 16 + jg convertloop + ret + } +} + +// Convert 16 ARGB pixels (64 bytes) to 16 YJ values. +// Same as ARGBToYRow but different coefficients, no add 16, but do rounding. +__declspec(naked) void ARGBToYJRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_y, + int width) { + __asm { + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_y */ + mov ecx, [esp + 12] /* width */ + movdqa xmm4, xmmword ptr kARGBToYJ + movdqa xmm5, xmmword ptr kAddYJ64 + + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] + pmaddubsw xmm0, xmm4 + pmaddubsw xmm1, xmm4 + pmaddubsw xmm2, xmm4 + pmaddubsw xmm3, xmm4 + lea eax, [eax + 64] + phaddw xmm0, xmm1 + phaddw xmm2, xmm3 + paddw xmm0, xmm5 // Add .5 for rounding. + paddw xmm2, xmm5 + psrlw xmm0, 7 + psrlw xmm2, 7 + packuswb xmm0, xmm2 + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 16 + jg convertloop + ret + } +} + +#ifdef HAS_ARGBTOYROW_AVX2 +// vpermd for vphaddw + vpackuswb vpermd. +static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7}; + +// Convert 32 ARGB pixels (128 bytes) to 32 Y values. 
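The AVX2 rows below apply the same arithmetic as the SSSE3 versions above, just 32 pixels per iteration. As a scalar reference for the two luma variants (a sketch, assuming B,G,R,A byte order):

    #include <stdint.h>

    // Studio-range luma: kARGBToY coefficients plus the kAddY16 offset.
    static inline uint8_t ARGBPixelToY(int b, int g, int r) {
      return (uint8_t)(((33 * r + 65 * g + 13 * b) >> 7) + 16);
    }
    // Full-range (JPEG) luma: kARGBToYJ coefficients, rounded with kAddYJ64, no offset.
    static inline uint8_t ARGBPixelToYJ(int b, int g, int r) {
      return (uint8_t)((38 * r + 75 * g + 15 * b + 64) >> 7);
    }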
+__declspec(naked) void ARGBToYRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_y, + int width) { + __asm { + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_y */ + mov ecx, [esp + 12] /* width */ + vbroadcastf128 ymm4, xmmword ptr kARGBToY + vbroadcastf128 ymm5, xmmword ptr kAddY16 + vmovdqu ymm6, ymmword ptr kPermdARGBToY_AVX + + convertloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + vmovdqu ymm2, [eax + 64] + vmovdqu ymm3, [eax + 96] + vpmaddubsw ymm0, ymm0, ymm4 + vpmaddubsw ymm1, ymm1, ymm4 + vpmaddubsw ymm2, ymm2, ymm4 + vpmaddubsw ymm3, ymm3, ymm4 + lea eax, [eax + 128] + vphaddw ymm0, ymm0, ymm1 // mutates. + vphaddw ymm2, ymm2, ymm3 + vpsrlw ymm0, ymm0, 7 + vpsrlw ymm2, ymm2, 7 + vpackuswb ymm0, ymm0, ymm2 // mutates. + vpermd ymm0, ymm6, ymm0 // For vphaddw + vpackuswb mutation. + vpaddb ymm0, ymm0, ymm5 // add 16 for Y + vmovdqu [edx], ymm0 + lea edx, [edx + 32] + sub ecx, 32 + jg convertloop + vzeroupper + ret + } +} +#endif // HAS_ARGBTOYROW_AVX2 + +#ifdef HAS_ARGBTOYJROW_AVX2 +// Convert 32 ARGB pixels (128 bytes) to 32 Y values. +__declspec(naked) void ARGBToYJRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_y, + int width) { + __asm { + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_y */ + mov ecx, [esp + 12] /* width */ + vbroadcastf128 ymm4, xmmword ptr kARGBToYJ + vbroadcastf128 ymm5, xmmword ptr kAddYJ64 + vmovdqu ymm6, ymmword ptr kPermdARGBToY_AVX + + convertloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + vmovdqu ymm2, [eax + 64] + vmovdqu ymm3, [eax + 96] + vpmaddubsw ymm0, ymm0, ymm4 + vpmaddubsw ymm1, ymm1, ymm4 + vpmaddubsw ymm2, ymm2, ymm4 + vpmaddubsw ymm3, ymm3, ymm4 + lea eax, [eax + 128] + vphaddw ymm0, ymm0, ymm1 // mutates. + vphaddw ymm2, ymm2, ymm3 + vpaddw ymm0, ymm0, ymm5 // Add .5 for rounding. + vpaddw ymm2, ymm2, ymm5 + vpsrlw ymm0, ymm0, 7 + vpsrlw ymm2, ymm2, 7 + vpackuswb ymm0, ymm0, ymm2 // mutates. + vpermd ymm0, ymm6, ymm0 // For vphaddw + vpackuswb mutation. 
+ vmovdqu [edx], ymm0 + lea edx, [edx + 32] + sub ecx, 32 + jg convertloop + + vzeroupper + ret + } +} +#endif // HAS_ARGBTOYJROW_AVX2 + +__declspec(naked) void BGRAToYRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_y, + int width) { + __asm { + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_y */ + mov ecx, [esp + 12] /* width */ + movdqa xmm4, xmmword ptr kBGRAToY + movdqa xmm5, xmmword ptr kAddY16 + + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] + pmaddubsw xmm0, xmm4 + pmaddubsw xmm1, xmm4 + pmaddubsw xmm2, xmm4 + pmaddubsw xmm3, xmm4 + lea eax, [eax + 64] + phaddw xmm0, xmm1 + phaddw xmm2, xmm3 + psrlw xmm0, 7 + psrlw xmm2, 7 + packuswb xmm0, xmm2 + paddb xmm0, xmm5 + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 16 + jg convertloop + ret + } +} + +__declspec(naked) void ABGRToYRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_y, + int width) { + __asm { + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_y */ + mov ecx, [esp + 12] /* width */ + movdqa xmm4, xmmword ptr kABGRToY + movdqa xmm5, xmmword ptr kAddY16 + + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] + pmaddubsw xmm0, xmm4 + pmaddubsw xmm1, xmm4 + pmaddubsw xmm2, xmm4 + pmaddubsw xmm3, xmm4 + lea eax, [eax + 64] + phaddw xmm0, xmm1 + phaddw xmm2, xmm3 + psrlw xmm0, 7 + psrlw xmm2, 7 + packuswb xmm0, xmm2 + paddb xmm0, xmm5 + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 16 + jg convertloop + ret + } +} + +__declspec(naked) void RGBAToYRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_y, + int width) { + __asm { + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_y */ + mov ecx, [esp + 12] /* width */ + movdqa xmm4, xmmword ptr kRGBAToY + movdqa xmm5, xmmword ptr kAddY16 + + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] + pmaddubsw xmm0, xmm4 + pmaddubsw xmm1, xmm4 + pmaddubsw xmm2, xmm4 + pmaddubsw xmm3, xmm4 + lea eax, [eax + 64] + phaddw xmm0, xmm1 + phaddw xmm2, xmm3 + psrlw xmm0, 7 + psrlw xmm2, 7 + packuswb xmm0, xmm2 + paddb xmm0, xmm5 + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 16 + jg convertloop + ret + } +} + +__declspec(naked) void ARGBToUVRow_SSSE3(const uint8_t* src_argb0, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_argb + mov esi, [esp + 8 + 8] // src_stride_argb + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // width + movdqa xmm5, xmmword ptr kAddUV128 + movdqa xmm6, xmmword ptr kARGBToV + movdqa xmm7, xmmword ptr kARGBToU + sub edi, edx // stride from u to v + + convertloop: + /* step 1 - subsample 16x2 argb pixels to 8x1 */ + movdqu xmm0, [eax] + movdqu xmm4, [eax + esi] + pavgb xmm0, xmm4 + movdqu xmm1, [eax + 16] + movdqu xmm4, [eax + esi + 16] + pavgb xmm1, xmm4 + movdqu xmm2, [eax + 32] + movdqu xmm4, [eax + esi + 32] + pavgb xmm2, xmm4 + movdqu xmm3, [eax + 48] + movdqu xmm4, [eax + esi + 48] + pavgb xmm3, xmm4 + + lea eax, [eax + 64] + movdqa xmm4, xmm0 + shufps xmm0, xmm1, 0x88 + shufps xmm4, xmm1, 0xdd + pavgb xmm0, xmm4 + movdqa xmm4, xmm2 + shufps xmm2, xmm3, 0x88 + shufps xmm4, xmm3, 0xdd + pavgb xmm2, xmm4 + + // step 2 - convert to U and V + // from here down is very similar to Y code except + // instead of 16 different pixels, its 8 pixels of U and 8 of V + movdqa xmm1, xmm0 + movdqa xmm3, xmm2 
+ pmaddubsw xmm0, xmm7 // U + pmaddubsw xmm2, xmm7 + pmaddubsw xmm1, xmm6 // V + pmaddubsw xmm3, xmm6 + phaddw xmm0, xmm2 + phaddw xmm1, xmm3 + psraw xmm0, 8 + psraw xmm1, 8 + packsswb xmm0, xmm1 + paddb xmm0, xmm5 // -> unsigned + + // step 3 - store 8 U and 8 V values + movlps qword ptr [edx], xmm0 // U + movhps qword ptr [edx + edi], xmm0 // V + lea edx, [edx + 8] + sub ecx, 16 + jg convertloop + + pop edi + pop esi + ret + } +} + +__declspec(naked) void ARGBToUVJRow_SSSE3(const uint8_t* src_argb0, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_argb + mov esi, [esp + 8 + 8] // src_stride_argb + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // width + movdqa xmm5, xmmword ptr kAddUVJ128 + movdqa xmm6, xmmword ptr kARGBToVJ + movdqa xmm7, xmmword ptr kARGBToUJ + sub edi, edx // stride from u to v + + convertloop: + /* step 1 - subsample 16x2 argb pixels to 8x1 */ + movdqu xmm0, [eax] + movdqu xmm4, [eax + esi] + pavgb xmm0, xmm4 + movdqu xmm1, [eax + 16] + movdqu xmm4, [eax + esi + 16] + pavgb xmm1, xmm4 + movdqu xmm2, [eax + 32] + movdqu xmm4, [eax + esi + 32] + pavgb xmm2, xmm4 + movdqu xmm3, [eax + 48] + movdqu xmm4, [eax + esi + 48] + pavgb xmm3, xmm4 + + lea eax, [eax + 64] + movdqa xmm4, xmm0 + shufps xmm0, xmm1, 0x88 + shufps xmm4, xmm1, 0xdd + pavgb xmm0, xmm4 + movdqa xmm4, xmm2 + shufps xmm2, xmm3, 0x88 + shufps xmm4, xmm3, 0xdd + pavgb xmm2, xmm4 + + // step 2 - convert to U and V + // from here down is very similar to Y code except + // instead of 16 different pixels, its 8 pixels of U and 8 of V + movdqa xmm1, xmm0 + movdqa xmm3, xmm2 + pmaddubsw xmm0, xmm7 // U + pmaddubsw xmm2, xmm7 + pmaddubsw xmm1, xmm6 // V + pmaddubsw xmm3, xmm6 + phaddw xmm0, xmm2 + phaddw xmm1, xmm3 + paddw xmm0, xmm5 // +.5 rounding -> unsigned + paddw xmm1, xmm5 + psraw xmm0, 8 + psraw xmm1, 8 + packsswb xmm0, xmm1 + + // step 3 - store 8 U and 8 V values + movlps qword ptr [edx], xmm0 // U + movhps qword ptr [edx + edi], xmm0 // V + lea edx, [edx + 8] + sub ecx, 16 + jg convertloop + + pop edi + pop esi + ret + } +} + +#ifdef HAS_ARGBTOUVROW_AVX2 +__declspec(naked) void ARGBToUVRow_AVX2(const uint8_t* src_argb0, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_argb + mov esi, [esp + 8 + 8] // src_stride_argb + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // width + vbroadcastf128 ymm5, xmmword ptr kAddUV128 + vbroadcastf128 ymm6, xmmword ptr kARGBToV + vbroadcastf128 ymm7, xmmword ptr kARGBToU + sub edi, edx // stride from u to v + + convertloop: + /* step 1 - subsample 32x2 argb pixels to 16x1 */ + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + vmovdqu ymm2, [eax + 64] + vmovdqu ymm3, [eax + 96] + vpavgb ymm0, ymm0, [eax + esi] + vpavgb ymm1, ymm1, [eax + esi + 32] + vpavgb ymm2, ymm2, [eax + esi + 64] + vpavgb ymm3, ymm3, [eax + esi + 96] + lea eax, [eax + 128] + vshufps ymm4, ymm0, ymm1, 0x88 + vshufps ymm0, ymm0, ymm1, 0xdd + vpavgb ymm0, ymm0, ymm4 // mutated by vshufps + vshufps ymm4, ymm2, ymm3, 0x88 + vshufps ymm2, ymm2, ymm3, 0xdd + vpavgb ymm2, ymm2, ymm4 // mutated by vshufps + + // step 2 - convert to U and V + // from here down is very similar to Y code except + // instead of 32 different pixels, its 16 pixels of U and 16 of V + vpmaddubsw ymm1, ymm0, ymm7 // U + vpmaddubsw ymm3, ymm2, ymm7 + vpmaddubsw ymm0, 
ymm0, ymm6 // V + vpmaddubsw ymm2, ymm2, ymm6 + vphaddw ymm1, ymm1, ymm3 // mutates + vphaddw ymm0, ymm0, ymm2 + vpsraw ymm1, ymm1, 8 + vpsraw ymm0, ymm0, 8 + vpacksswb ymm0, ymm1, ymm0 // mutates + vpermq ymm0, ymm0, 0xd8 // For vpacksswb + vpshufb ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX // for vshufps/vphaddw + vpaddb ymm0, ymm0, ymm5 // -> unsigned + + // step 3 - store 16 U and 16 V values + vextractf128 [edx], ymm0, 0 // U + vextractf128 [edx + edi], ymm0, 1 // V + lea edx, [edx + 16] + sub ecx, 32 + jg convertloop + + pop edi + pop esi + vzeroupper + ret + } +} +#endif // HAS_ARGBTOUVROW_AVX2 + +#ifdef HAS_ARGBTOUVJROW_AVX2 +__declspec(naked) void ARGBToUVJRow_AVX2(const uint8_t* src_argb0, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_argb + mov esi, [esp + 8 + 8] // src_stride_argb + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // width + vbroadcastf128 ymm5, xmmword ptr kAddUV128 + vbroadcastf128 ymm6, xmmword ptr kARGBToV + vbroadcastf128 ymm7, xmmword ptr kARGBToU + sub edi, edx // stride from u to v + + convertloop: + /* step 1 - subsample 32x2 argb pixels to 16x1 */ + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + vmovdqu ymm2, [eax + 64] + vmovdqu ymm3, [eax + 96] + vpavgb ymm0, ymm0, [eax + esi] + vpavgb ymm1, ymm1, [eax + esi + 32] + vpavgb ymm2, ymm2, [eax + esi + 64] + vpavgb ymm3, ymm3, [eax + esi + 96] + lea eax, [eax + 128] + vshufps ymm4, ymm0, ymm1, 0x88 + vshufps ymm0, ymm0, ymm1, 0xdd + vpavgb ymm0, ymm0, ymm4 // mutated by vshufps + vshufps ymm4, ymm2, ymm3, 0x88 + vshufps ymm2, ymm2, ymm3, 0xdd + vpavgb ymm2, ymm2, ymm4 // mutated by vshufps + + // step 2 - convert to U and V + // from here down is very similar to Y code except + // instead of 32 different pixels, its 16 pixels of U and 16 of V + vpmaddubsw ymm1, ymm0, ymm7 // U + vpmaddubsw ymm3, ymm2, ymm7 + vpmaddubsw ymm0, ymm0, ymm6 // V + vpmaddubsw ymm2, ymm2, ymm6 + vphaddw ymm1, ymm1, ymm3 // mutates + vphaddw ymm0, ymm0, ymm2 + vpaddw ymm1, ymm1, ymm5 // +.5 rounding -> unsigned + vpaddw ymm0, ymm0, ymm5 + vpsraw ymm1, ymm1, 8 + vpsraw ymm0, ymm0, 8 + vpacksswb ymm0, ymm1, ymm0 // mutates + vpermq ymm0, ymm0, 0xd8 // For vpacksswb + vpshufb ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX // for vshufps/vphaddw + + // step 3 - store 16 U and 16 V values + vextractf128 [edx], ymm0, 0 // U + vextractf128 [edx + edi], ymm0, 1 // V + lea edx, [edx + 16] + sub ecx, 32 + jg convertloop + + pop edi + pop esi + vzeroupper + ret + } +} +#endif // HAS_ARGBTOUVJROW_AVX2 + +__declspec(naked) void ARGBToUV444Row_SSSE3(const uint8_t* src_argb0, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + __asm { + push edi + mov eax, [esp + 4 + 4] // src_argb + mov edx, [esp + 4 + 8] // dst_u + mov edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // width + movdqa xmm5, xmmword ptr kAddUV128 + movdqa xmm6, xmmword ptr kARGBToV + movdqa xmm7, xmmword ptr kARGBToU + sub edi, edx // stride from u to v + + convertloop: + /* convert to U and V */ + movdqu xmm0, [eax] // U + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] + pmaddubsw xmm0, xmm7 + pmaddubsw xmm1, xmm7 + pmaddubsw xmm2, xmm7 + pmaddubsw xmm3, xmm7 + phaddw xmm0, xmm1 + phaddw xmm2, xmm3 + psraw xmm0, 8 + psraw xmm2, 8 + packsswb xmm0, xmm2 + paddb xmm0, xmm5 + movdqu [edx], xmm0 + + movdqu xmm0, [eax] // V + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] + pmaddubsw 
xmm0, xmm6 + pmaddubsw xmm1, xmm6 + pmaddubsw xmm2, xmm6 + pmaddubsw xmm3, xmm6 + phaddw xmm0, xmm1 + phaddw xmm2, xmm3 + psraw xmm0, 8 + psraw xmm2, 8 + packsswb xmm0, xmm2 + paddb xmm0, xmm5 + lea eax, [eax + 64] + movdqu [edx + edi], xmm0 + lea edx, [edx + 16] + sub ecx, 16 + jg convertloop + + pop edi + ret + } +} + +__declspec(naked) void BGRAToUVRow_SSSE3(const uint8_t* src_argb0, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_argb + mov esi, [esp + 8 + 8] // src_stride_argb + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // width + movdqa xmm5, xmmword ptr kAddUV128 + movdqa xmm6, xmmword ptr kBGRAToV + movdqa xmm7, xmmword ptr kBGRAToU + sub edi, edx // stride from u to v + + convertloop: + /* step 1 - subsample 16x2 argb pixels to 8x1 */ + movdqu xmm0, [eax] + movdqu xmm4, [eax + esi] + pavgb xmm0, xmm4 + movdqu xmm1, [eax + 16] + movdqu xmm4, [eax + esi + 16] + pavgb xmm1, xmm4 + movdqu xmm2, [eax + 32] + movdqu xmm4, [eax + esi + 32] + pavgb xmm2, xmm4 + movdqu xmm3, [eax + 48] + movdqu xmm4, [eax + esi + 48] + pavgb xmm3, xmm4 + + lea eax, [eax + 64] + movdqa xmm4, xmm0 + shufps xmm0, xmm1, 0x88 + shufps xmm4, xmm1, 0xdd + pavgb xmm0, xmm4 + movdqa xmm4, xmm2 + shufps xmm2, xmm3, 0x88 + shufps xmm4, xmm3, 0xdd + pavgb xmm2, xmm4 + + // step 2 - convert to U and V + // from here down is very similar to Y code except + // instead of 16 different pixels, its 8 pixels of U and 8 of V + movdqa xmm1, xmm0 + movdqa xmm3, xmm2 + pmaddubsw xmm0, xmm7 // U + pmaddubsw xmm2, xmm7 + pmaddubsw xmm1, xmm6 // V + pmaddubsw xmm3, xmm6 + phaddw xmm0, xmm2 + phaddw xmm1, xmm3 + psraw xmm0, 8 + psraw xmm1, 8 + packsswb xmm0, xmm1 + paddb xmm0, xmm5 // -> unsigned + + // step 3 - store 8 U and 8 V values + movlps qword ptr [edx], xmm0 // U + movhps qword ptr [edx + edi], xmm0 // V + lea edx, [edx + 8] + sub ecx, 16 + jg convertloop + + pop edi + pop esi + ret + } +} + +__declspec(naked) void ABGRToUVRow_SSSE3(const uint8_t* src_argb0, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_argb + mov esi, [esp + 8 + 8] // src_stride_argb + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // width + movdqa xmm5, xmmword ptr kAddUV128 + movdqa xmm6, xmmword ptr kABGRToV + movdqa xmm7, xmmword ptr kABGRToU + sub edi, edx // stride from u to v + + convertloop: + /* step 1 - subsample 16x2 argb pixels to 8x1 */ + movdqu xmm0, [eax] + movdqu xmm4, [eax + esi] + pavgb xmm0, xmm4 + movdqu xmm1, [eax + 16] + movdqu xmm4, [eax + esi + 16] + pavgb xmm1, xmm4 + movdqu xmm2, [eax + 32] + movdqu xmm4, [eax + esi + 32] + pavgb xmm2, xmm4 + movdqu xmm3, [eax + 48] + movdqu xmm4, [eax + esi + 48] + pavgb xmm3, xmm4 + + lea eax, [eax + 64] + movdqa xmm4, xmm0 + shufps xmm0, xmm1, 0x88 + shufps xmm4, xmm1, 0xdd + pavgb xmm0, xmm4 + movdqa xmm4, xmm2 + shufps xmm2, xmm3, 0x88 + shufps xmm4, xmm3, 0xdd + pavgb xmm2, xmm4 + + // step 2 - convert to U and V + // from here down is very similar to Y code except + // instead of 16 different pixels, its 8 pixels of U and 8 of V + movdqa xmm1, xmm0 + movdqa xmm3, xmm2 + pmaddubsw xmm0, xmm7 // U + pmaddubsw xmm2, xmm7 + pmaddubsw xmm1, xmm6 // V + pmaddubsw xmm3, xmm6 + phaddw xmm0, xmm2 + phaddw xmm1, xmm3 + psraw xmm0, 8 + psraw xmm1, 8 + packsswb xmm0, xmm1 + paddb xmm0, xmm5 // -> unsigned + + // step 3 - 
store 8 U and 8 V values + movlps qword ptr [edx], xmm0 // U + movhps qword ptr [edx + edi], xmm0 // V + lea edx, [edx + 8] + sub ecx, 16 + jg convertloop + + pop edi + pop esi + ret + } +} + +__declspec(naked) void RGBAToUVRow_SSSE3(const uint8_t* src_argb0, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_argb + mov esi, [esp + 8 + 8] // src_stride_argb + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // width + movdqa xmm5, xmmword ptr kAddUV128 + movdqa xmm6, xmmword ptr kRGBAToV + movdqa xmm7, xmmword ptr kRGBAToU + sub edi, edx // stride from u to v + + convertloop: + /* step 1 - subsample 16x2 argb pixels to 8x1 */ + movdqu xmm0, [eax] + movdqu xmm4, [eax + esi] + pavgb xmm0, xmm4 + movdqu xmm1, [eax + 16] + movdqu xmm4, [eax + esi + 16] + pavgb xmm1, xmm4 + movdqu xmm2, [eax + 32] + movdqu xmm4, [eax + esi + 32] + pavgb xmm2, xmm4 + movdqu xmm3, [eax + 48] + movdqu xmm4, [eax + esi + 48] + pavgb xmm3, xmm4 + + lea eax, [eax + 64] + movdqa xmm4, xmm0 + shufps xmm0, xmm1, 0x88 + shufps xmm4, xmm1, 0xdd + pavgb xmm0, xmm4 + movdqa xmm4, xmm2 + shufps xmm2, xmm3, 0x88 + shufps xmm4, xmm3, 0xdd + pavgb xmm2, xmm4 + + // step 2 - convert to U and V + // from here down is very similar to Y code except + // instead of 16 different pixels, its 8 pixels of U and 8 of V + movdqa xmm1, xmm0 + movdqa xmm3, xmm2 + pmaddubsw xmm0, xmm7 // U + pmaddubsw xmm2, xmm7 + pmaddubsw xmm1, xmm6 // V + pmaddubsw xmm3, xmm6 + phaddw xmm0, xmm2 + phaddw xmm1, xmm3 + psraw xmm0, 8 + psraw xmm1, 8 + packsswb xmm0, xmm1 + paddb xmm0, xmm5 // -> unsigned + + // step 3 - store 8 U and 8 V values + movlps qword ptr [edx], xmm0 // U + movhps qword ptr [edx + edi], xmm0 // V + lea edx, [edx + 8] + sub ecx, 16 + jg convertloop + + pop edi + pop esi + ret + } +} +#endif // HAS_ARGBTOYROW_SSSE3 + +// Read 16 UV from 444 +#define READYUV444_AVX2 \ + __asm { \ + __asm vmovdqu xmm0, [esi] /* U */ \ + __asm vmovdqu xmm1, [esi + edi] /* V */ \ + __asm lea esi, [esi + 16] \ + __asm vpermq ymm0, ymm0, 0xd8 \ + __asm vpermq ymm1, ymm1, 0xd8 \ + __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \ + __asm vmovdqu xmm4, [eax] /* Y */ \ + __asm vpermq ymm4, ymm4, 0xd8 \ + __asm vpunpcklbw ymm4, ymm4, ymm4 \ + __asm lea eax, [eax + 16]} + +// Read 8 UV from 422, upsample to 16 UV. +#define READYUV422_AVX2 \ + __asm { \ + __asm vmovq xmm0, qword ptr [esi] /* U */ \ + __asm vmovq xmm1, qword ptr [esi + edi] /* V */ \ + __asm lea esi, [esi + 8] \ + __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \ + __asm vpermq ymm0, ymm0, 0xd8 \ + __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ + __asm vmovdqu xmm4, [eax] /* Y */ \ + __asm vpermq ymm4, ymm4, 0xd8 \ + __asm vpunpcklbw ymm4, ymm4, ymm4 \ + __asm lea eax, [eax + 16]} + +// Read 8 UV from 422, upsample to 16 UV. With 16 Alpha. +#define READYUVA422_AVX2 \ + __asm { \ + __asm vmovq xmm0, qword ptr [esi] /* U */ \ + __asm vmovq xmm1, qword ptr [esi + edi] /* V */ \ + __asm lea esi, [esi + 8] \ + __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \ + __asm vpermq ymm0, ymm0, 0xd8 \ + __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ + __asm vmovdqu xmm4, [eax] /* Y */ \ + __asm vpermq ymm4, ymm4, 0xd8 \ + __asm vpunpcklbw ymm4, ymm4, ymm4 \ + __asm lea eax, [eax + 16] \ + __asm vmovdqu xmm5, [ebp] /* A */ \ + __asm vpermq ymm5, ymm5, 0xd8 \ + __asm lea ebp, [ebp + 16]} + +// Read 8 UV from NV12, upsample to 16 UV. 
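The 4:2:2 readers above and the NV12/NV21 readers below all upsample chroma by nearest neighbor: each U,V pair is duplicated across the two luma samples it covers (the vpunpcklwd / vpshufb step). In scalar terms (an illustrative sketch):

    #include <stdint.h>

    // Pixel x takes its chroma from horizontal sample x / 2.
    static inline void UpsampleUV422(const uint8_t* u, const uint8_t* v,
                                     uint8_t* u_out, uint8_t* v_out, int width) {
      int x;
      for (x = 0; x < width; ++x) {
        u_out[x] = u[x >> 1];
        v_out[x] = v[x >> 1];
      }
    }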
+#define READNV12_AVX2 \ + __asm { \ + __asm vmovdqu xmm0, [esi] /* UV */ \ + __asm lea esi, [esi + 16] \ + __asm vpermq ymm0, ymm0, 0xd8 \ + __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ + __asm vmovdqu xmm4, [eax] /* Y */ \ + __asm vpermq ymm4, ymm4, 0xd8 \ + __asm vpunpcklbw ymm4, ymm4, ymm4 \ + __asm lea eax, [eax + 16]} + +// Read 8 UV from NV21, upsample to 16 UV. +#define READNV21_AVX2 \ + __asm { \ + __asm vmovdqu xmm0, [esi] /* UV */ \ + __asm lea esi, [esi + 16] \ + __asm vpermq ymm0, ymm0, 0xd8 \ + __asm vpshufb ymm0, ymm0, ymmword ptr kShuffleNV21 \ + __asm vmovdqu xmm4, [eax] /* Y */ \ + __asm vpermq ymm4, ymm4, 0xd8 \ + __asm vpunpcklbw ymm4, ymm4, ymm4 \ + __asm lea eax, [eax + 16]} + +// Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV. +#define READYUY2_AVX2 \ + __asm { \ + __asm vmovdqu ymm4, [eax] /* YUY2 */ \ + __asm vpshufb ymm4, ymm4, ymmword ptr kShuffleYUY2Y \ + __asm vmovdqu ymm0, [eax] /* UV */ \ + __asm vpshufb ymm0, ymm0, ymmword ptr kShuffleYUY2UV \ + __asm lea eax, [eax + 32]} + +// Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV. +#define READUYVY_AVX2 \ + __asm { \ + __asm vmovdqu ymm4, [eax] /* UYVY */ \ + __asm vpshufb ymm4, ymm4, ymmword ptr kShuffleUYVYY \ + __asm vmovdqu ymm0, [eax] /* UV */ \ + __asm vpshufb ymm0, ymm0, ymmword ptr kShuffleUYVYUV \ + __asm lea eax, [eax + 32]} + +// Convert 16 pixels: 16 UV and 16 Y. +#define YUVTORGB_AVX2(YuvConstants) \ + __asm { \ + __asm vpmaddubsw ymm2, ymm0, ymmword ptr [YuvConstants + KUVTOR] /* R UV */\ + __asm vpmaddubsw ymm1, ymm0, ymmword ptr [YuvConstants + KUVTOG] /* G UV */\ + __asm vpmaddubsw ymm0, ymm0, ymmword ptr [YuvConstants + KUVTOB] /* B UV */\ + __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASR] \ + __asm vpsubw ymm2, ymm3, ymm2 \ + __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASG] \ + __asm vpsubw ymm1, ymm3, ymm1 \ + __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASB] \ + __asm vpsubw ymm0, ymm3, ymm0 /* Step 2: Find Y contribution to 16 R,G,B values */ \ + __asm vpmulhuw ymm4, ymm4, ymmword ptr [YuvConstants + KYTORGB] \ + __asm vpaddsw ymm0, ymm0, ymm4 /* B += Y */ \ + __asm vpaddsw ymm1, ymm1, ymm4 /* G += Y */ \ + __asm vpaddsw ymm2, ymm2, ymm4 /* R += Y */ \ + __asm vpsraw ymm0, ymm0, 6 \ + __asm vpsraw ymm1, ymm1, 6 \ + __asm vpsraw ymm2, ymm2, 6 \ + __asm vpackuswb ymm0, ymm0, ymm0 /* B */ \ + __asm vpackuswb ymm1, ymm1, ymm1 /* G */ \ + __asm vpackuswb ymm2, ymm2, ymm2 /* R */ \ + } + +// Store 16 ARGB values. +#define STOREARGB_AVX2 \ + __asm { \ + __asm vpunpcklbw ymm0, ymm0, ymm1 /* BG */ \ + __asm vpermq ymm0, ymm0, 0xd8 \ + __asm vpunpcklbw ymm2, ymm2, ymm5 /* RA */ \ + __asm vpermq ymm2, ymm2, 0xd8 \ + __asm vpunpcklwd ymm1, ymm0, ymm2 /* BGRA first 8 pixels */ \ + __asm vpunpckhwd ymm0, ymm0, ymm2 /* BGRA next 8 pixels */ \ + __asm vmovdqu 0[edx], ymm1 \ + __asm vmovdqu 32[edx], ymm0 \ + __asm lea edx, [edx + 64]} + +// Store 16 RGBA values. +#define STORERGBA_AVX2 \ + __asm { \ + __asm vpunpcklbw ymm1, ymm1, ymm2 /* GR */ \ + __asm vpermq ymm1, ymm1, 0xd8 \ + __asm vpunpcklbw ymm2, ymm5, ymm0 /* AB */ \ + __asm vpermq ymm2, ymm2, 0xd8 \ + __asm vpunpcklwd ymm0, ymm2, ymm1 /* ABGR first 8 pixels */ \ + __asm vpunpckhwd ymm1, ymm2, ymm1 /* ABGR next 8 pixels */ \ + __asm vmovdqu [edx], ymm0 \ + __asm vmovdqu [edx + 32], ymm1 \ + __asm lea edx, [edx + 64]} + +#ifdef HAS_I422TOARGBROW_AVX2 +// 16 pixels +// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). 
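YUVTORGB_AVX2 above is the fixed-point core shared by every AVX2 converter that follows: the interleaved UV pair is multiplied against per-channel coefficient tables, subtracted from a per-channel bias, the Y term (widened by vpunpcklbw ymm4, ymm4) is scaled with vpmulhuw, and the sum is shifted right by 6 and saturated to bytes. The scalar sketch below restates that sequence for one pixel; the names ub, ug, vg, vr, yg and bias_b/g/r are placeholders standing in for whatever the KUVTO*/KUVBIAS*/KYTORGB entries of the active YuvConstants hold, not the struct's real field names.

#include <stdint.h>

static uint8_t Clamp255_Sketch(int v) {
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));  // vpackuswb saturation
}

// One pixel of the YUVTORGB math, with placeholder coefficients and biases.
static void YuvPixel_Sketch(uint8_t y, uint8_t u, uint8_t v,
                            int ub, int ug, int vg, int vr, int yg,
                            int bias_b, int bias_g, int bias_r,
                            uint8_t* b8, uint8_t* g8, uint8_t* r8) {
  int y16 = (y * 0x0101 * yg) >> 16;               // vpunpcklbw y,y + vpmulhuw
  *b8 = Clamp255_Sketch((bias_b - u * ub + y16) >> 6);          // bias - UV term, + Y, >> 6
  *g8 = Clamp255_Sketch((bias_g - (u * ug + v * vg) + y16) >> 6);
  *r8 = Clamp255_Sketch((bias_r - v * vr + y16) >> 6);
}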
+__declspec(naked) void I422ToARGBRow_AVX2( + const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + __asm { + push esi + push edi + push ebx + mov eax, [esp + 12 + 4] // Y + mov esi, [esp + 12 + 8] // U + mov edi, [esp + 12 + 12] // V + mov edx, [esp + 12 + 16] // argb + mov ebx, [esp + 12 + 20] // yuvconstants + mov ecx, [esp + 12 + 24] // width + sub edi, esi + vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha + + convertloop: + READYUV422_AVX2 + YUVTORGB_AVX2(ebx) + STOREARGB_AVX2 + + sub ecx, 16 + jg convertloop + + pop ebx + pop edi + pop esi + vzeroupper + ret + } +} +#endif // HAS_I422TOARGBROW_AVX2 + +#ifdef HAS_I422ALPHATOARGBROW_AVX2 +// 16 pixels +// 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB. +__declspec(naked) void I422AlphaToARGBRow_AVX2( + const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + const uint8_t* a_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + __asm { + push esi + push edi + push ebx + push ebp + mov eax, [esp + 16 + 4] // Y + mov esi, [esp + 16 + 8] // U + mov edi, [esp + 16 + 12] // V + mov ebp, [esp + 16 + 16] // A + mov edx, [esp + 16 + 20] // argb + mov ebx, [esp + 16 + 24] // yuvconstants + mov ecx, [esp + 16 + 28] // width + sub edi, esi + + convertloop: + READYUVA422_AVX2 + YUVTORGB_AVX2(ebx) + STOREARGB_AVX2 + + sub ecx, 16 + jg convertloop + + pop ebp + pop ebx + pop edi + pop esi + vzeroupper + ret + } +} +#endif // HAS_I422ALPHATOARGBROW_AVX2 + +#ifdef HAS_I444TOARGBROW_AVX2 +// 16 pixels +// 16 UV values with 16 Y producing 16 ARGB (64 bytes). +__declspec(naked) void I444ToARGBRow_AVX2( + const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + __asm { + push esi + push edi + push ebx + mov eax, [esp + 12 + 4] // Y + mov esi, [esp + 12 + 8] // U + mov edi, [esp + 12 + 12] // V + mov edx, [esp + 12 + 16] // argb + mov ebx, [esp + 12 + 20] // yuvconstants + mov ecx, [esp + 12 + 24] // width + sub edi, esi + vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha + convertloop: + READYUV444_AVX2 + YUVTORGB_AVX2(ebx) + STOREARGB_AVX2 + + sub ecx, 16 + jg convertloop + + pop ebx + pop edi + pop esi + vzeroupper + ret + } +} +#endif // HAS_I444TOARGBROW_AVX2 + +#ifdef HAS_NV12TOARGBROW_AVX2 +// 16 pixels. +// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). +__declspec(naked) void NV12ToARGBRow_AVX2( + const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + __asm { + push esi + push ebx + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // UV + mov edx, [esp + 8 + 12] // argb + mov ebx, [esp + 8 + 16] // yuvconstants + mov ecx, [esp + 8 + 20] // width + vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha + + convertloop: + READNV12_AVX2 + YUVTORGB_AVX2(ebx) + STOREARGB_AVX2 + + sub ecx, 16 + jg convertloop + + pop ebx + pop esi + vzeroupper + ret + } +} +#endif // HAS_NV12TOARGBROW_AVX2 + +#ifdef HAS_NV21TOARGBROW_AVX2 +// 16 pixels. +// 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). 
+__declspec(naked) void NV21ToARGBRow_AVX2( + const uint8_t* y_buf, + const uint8_t* vu_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + __asm { + push esi + push ebx + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // VU + mov edx, [esp + 8 + 12] // argb + mov ebx, [esp + 8 + 16] // yuvconstants + mov ecx, [esp + 8 + 20] // width + vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha + + convertloop: + READNV21_AVX2 + YUVTORGB_AVX2(ebx) + STOREARGB_AVX2 + + sub ecx, 16 + jg convertloop + + pop ebx + pop esi + vzeroupper + ret + } +} +#endif // HAS_NV21TOARGBROW_AVX2 + +#ifdef HAS_YUY2TOARGBROW_AVX2 +// 16 pixels. +// 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes). +__declspec(naked) void YUY2ToARGBRow_AVX2( + const uint8_t* src_yuy2, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + __asm { + push ebx + mov eax, [esp + 4 + 4] // yuy2 + mov edx, [esp + 4 + 8] // argb + mov ebx, [esp + 4 + 12] // yuvconstants + mov ecx, [esp + 4 + 16] // width + vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha + + convertloop: + READYUY2_AVX2 + YUVTORGB_AVX2(ebx) + STOREARGB_AVX2 + + sub ecx, 16 + jg convertloop + + pop ebx + vzeroupper + ret + } +} +#endif // HAS_YUY2TOARGBROW_AVX2 + +#ifdef HAS_UYVYTOARGBROW_AVX2 +// 16 pixels. +// 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes). +__declspec(naked) void UYVYToARGBRow_AVX2( + const uint8_t* src_uyvy, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + __asm { + push ebx + mov eax, [esp + 4 + 4] // uyvy + mov edx, [esp + 4 + 8] // argb + mov ebx, [esp + 4 + 12] // yuvconstants + mov ecx, [esp + 4 + 16] // width + vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha + + convertloop: + READUYVY_AVX2 + YUVTORGB_AVX2(ebx) + STOREARGB_AVX2 + + sub ecx, 16 + jg convertloop + + pop ebx + vzeroupper + ret + } +} +#endif // HAS_UYVYTOARGBROW_AVX2 + +#ifdef HAS_I422TORGBAROW_AVX2 +// 16 pixels +// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes). +__declspec(naked) void I422ToRGBARow_AVX2( + const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + __asm { + push esi + push edi + push ebx + mov eax, [esp + 12 + 4] // Y + mov esi, [esp + 12 + 8] // U + mov edi, [esp + 12 + 12] // V + mov edx, [esp + 12 + 16] // abgr + mov ebx, [esp + 12 + 20] // yuvconstants + mov ecx, [esp + 12 + 24] // width + sub edi, esi + vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha + + convertloop: + READYUV422_AVX2 + YUVTORGB_AVX2(ebx) + STORERGBA_AVX2 + + sub ecx, 16 + jg convertloop + + pop ebx + pop edi + pop esi + vzeroupper + ret + } +} +#endif // HAS_I422TORGBAROW_AVX2 + +#if defined(HAS_I422TOARGBROW_SSSE3) +// TODO(fbarchard): Read that does half size on Y and treats 420 as 444. +// Allows a conversion with half size scaling. + +// Read 8 UV from 444. +#define READYUV444 \ + __asm { \ + __asm movq xmm0, qword ptr [esi] /* U */ \ + __asm movq xmm1, qword ptr [esi + edi] /* V */ \ + __asm lea esi, [esi + 8] \ + __asm punpcklbw xmm0, xmm1 /* UV */ \ + __asm movq xmm4, qword ptr [eax] \ + __asm punpcklbw xmm4, xmm4 \ + __asm lea eax, [eax + 8]} + +// Read 4 UV from 422, upsample to 8 UV. 
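The 4:2:2 readers — READYUV422_AVX2 earlier and the SSSE3 READYUV422 that follows — upsample chroma simply by duplicating each U/V sample across two adjacent pixels (the punpcklwd xmm0, xmm0 step), not by interpolating. A scalar sketch of that indexing, with the output written as interleaved Y,U,V triples purely for illustration (the real code feeds the pairs straight into YUVTORGB):

#include <stdint.h>

// Duplicate each 4:2:2 chroma sample across two horizontal pixels.
static void Upsample422_Sketch(const uint8_t* y, const uint8_t* u,
                               const uint8_t* v, uint8_t* yuv_out, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    yuv_out[x * 3 + 0] = y[x];
    yuv_out[x * 3 + 1] = u[x / 2];  // same U for pixels 2n and 2n+1
    yuv_out[x * 3 + 2] = v[x / 2];  // same V for pixels 2n and 2n+1
  }
}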
+#define READYUV422 \ + __asm { \ + __asm movd xmm0, [esi] /* U */ \ + __asm movd xmm1, [esi + edi] /* V */ \ + __asm lea esi, [esi + 4] \ + __asm punpcklbw xmm0, xmm1 /* UV */ \ + __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ + __asm movq xmm4, qword ptr [eax] \ + __asm punpcklbw xmm4, xmm4 \ + __asm lea eax, [eax + 8]} + +// Read 4 UV from 422, upsample to 8 UV. With 8 Alpha. +#define READYUVA422 \ + __asm { \ + __asm movd xmm0, [esi] /* U */ \ + __asm movd xmm1, [esi + edi] /* V */ \ + __asm lea esi, [esi + 4] \ + __asm punpcklbw xmm0, xmm1 /* UV */ \ + __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ + __asm movq xmm4, qword ptr [eax] /* Y */ \ + __asm punpcklbw xmm4, xmm4 \ + __asm lea eax, [eax + 8] \ + __asm movq xmm5, qword ptr [ebp] /* A */ \ + __asm lea ebp, [ebp + 8]} + +// Read 4 UV from NV12, upsample to 8 UV. +#define READNV12 \ + __asm { \ + __asm movq xmm0, qword ptr [esi] /* UV */ \ + __asm lea esi, [esi + 8] \ + __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ + __asm movq xmm4, qword ptr [eax] \ + __asm punpcklbw xmm4, xmm4 \ + __asm lea eax, [eax + 8]} + +// Read 4 VU from NV21, upsample to 8 UV. +#define READNV21 \ + __asm { \ + __asm movq xmm0, qword ptr [esi] /* UV */ \ + __asm lea esi, [esi + 8] \ + __asm pshufb xmm0, xmmword ptr kShuffleNV21 \ + __asm movq xmm4, qword ptr [eax] \ + __asm punpcklbw xmm4, xmm4 \ + __asm lea eax, [eax + 8]} + +// Read 4 YUY2 with 8 Y and upsample 4 UV to 8 UV. +#define READYUY2 \ + __asm { \ + __asm movdqu xmm4, [eax] /* YUY2 */ \ + __asm pshufb xmm4, xmmword ptr kShuffleYUY2Y \ + __asm movdqu xmm0, [eax] /* UV */ \ + __asm pshufb xmm0, xmmword ptr kShuffleYUY2UV \ + __asm lea eax, [eax + 16]} + +// Read 4 UYVY with 8 Y and upsample 4 UV to 8 UV. +#define READUYVY \ + __asm { \ + __asm movdqu xmm4, [eax] /* UYVY */ \ + __asm pshufb xmm4, xmmword ptr kShuffleUYVYY \ + __asm movdqu xmm0, [eax] /* UV */ \ + __asm pshufb xmm0, xmmword ptr kShuffleUYVYUV \ + __asm lea eax, [eax + 16]} + +// Convert 8 pixels: 8 UV and 8 Y. +#define YUVTORGB(YuvConstants) \ + __asm { \ + __asm movdqa xmm1, xmm0 \ + __asm movdqa xmm2, xmm0 \ + __asm movdqa xmm3, xmm0 \ + __asm movdqa xmm0, xmmword ptr [YuvConstants + KUVBIASB] \ + __asm pmaddubsw xmm1, xmmword ptr [YuvConstants + KUVTOB] \ + __asm psubw xmm0, xmm1 \ + __asm movdqa xmm1, xmmword ptr [YuvConstants + KUVBIASG] \ + __asm pmaddubsw xmm2, xmmword ptr [YuvConstants + KUVTOG] \ + __asm psubw xmm1, xmm2 \ + __asm movdqa xmm2, xmmword ptr [YuvConstants + KUVBIASR] \ + __asm pmaddubsw xmm3, xmmword ptr [YuvConstants + KUVTOR] \ + __asm psubw xmm2, xmm3 \ + __asm pmulhuw xmm4, xmmword ptr [YuvConstants + KYTORGB] \ + __asm paddsw xmm0, xmm4 /* B += Y */ \ + __asm paddsw xmm1, xmm4 /* G += Y */ \ + __asm paddsw xmm2, xmm4 /* R += Y */ \ + __asm psraw xmm0, 6 \ + __asm psraw xmm1, 6 \ + __asm psraw xmm2, 6 \ + __asm packuswb xmm0, xmm0 /* B */ \ + __asm packuswb xmm1, xmm1 /* G */ \ + __asm packuswb xmm2, xmm2 /* R */ \ + } + +// Store 8 ARGB values. +#define STOREARGB \ + __asm { \ + __asm punpcklbw xmm0, xmm1 /* BG */ \ + __asm punpcklbw xmm2, xmm5 /* RA */ \ + __asm movdqa xmm1, xmm0 \ + __asm punpcklwd xmm0, xmm2 /* BGRA first 4 pixels */ \ + __asm punpckhwd xmm1, xmm2 /* BGRA next 4 pixels */ \ + __asm movdqu 0[edx], xmm0 \ + __asm movdqu 16[edx], xmm1 \ + __asm lea edx, [edx + 32]} + +// Store 8 BGRA values. 
+#define STOREBGRA \ + __asm { \ + __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \ + __asm punpcklbw xmm1, xmm0 /* GB */ \ + __asm punpcklbw xmm5, xmm2 /* AR */ \ + __asm movdqa xmm0, xmm5 \ + __asm punpcklwd xmm5, xmm1 /* BGRA first 4 pixels */ \ + __asm punpckhwd xmm0, xmm1 /* BGRA next 4 pixels */ \ + __asm movdqu 0[edx], xmm5 \ + __asm movdqu 16[edx], xmm0 \ + __asm lea edx, [edx + 32]} + +// Store 8 RGBA values. +#define STORERGBA \ + __asm { \ + __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \ + __asm punpcklbw xmm1, xmm2 /* GR */ \ + __asm punpcklbw xmm5, xmm0 /* AB */ \ + __asm movdqa xmm0, xmm5 \ + __asm punpcklwd xmm5, xmm1 /* RGBA first 4 pixels */ \ + __asm punpckhwd xmm0, xmm1 /* RGBA next 4 pixels */ \ + __asm movdqu 0[edx], xmm5 \ + __asm movdqu 16[edx], xmm0 \ + __asm lea edx, [edx + 32]} + +// Store 8 RGB24 values. +#define STORERGB24 \ + __asm {/* Weave into RRGB */ \ + __asm punpcklbw xmm0, xmm1 /* BG */ \ + __asm punpcklbw xmm2, xmm2 /* RR */ \ + __asm movdqa xmm1, xmm0 \ + __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \ + __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ /* RRGB -> RGB24 */ \ + __asm pshufb xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \ + __asm pshufb xmm1, xmm6 /* Pack first 12 bytes. */ \ + __asm palignr xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 xmm1 */ \ + __asm movq qword ptr 0[edx], xmm0 /* First 8 bytes */ \ + __asm movdqu 8[edx], xmm1 /* Last 16 bytes */ \ + __asm lea edx, [edx + 24]} + +// Store 8 RGB565 values. +#define STORERGB565 \ + __asm {/* Weave into RRGB */ \ + __asm punpcklbw xmm0, xmm1 /* BG */ \ + __asm punpcklbw xmm2, xmm2 /* RR */ \ + __asm movdqa xmm1, xmm0 \ + __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \ + __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ /* RRGB -> RGB565 */ \ + __asm movdqa xmm3, xmm0 /* B first 4 pixels of argb */ \ + __asm movdqa xmm2, xmm0 /* G */ \ + __asm pslld xmm0, 8 /* R */ \ + __asm psrld xmm3, 3 /* B */ \ + __asm psrld xmm2, 5 /* G */ \ + __asm psrad xmm0, 16 /* R */ \ + __asm pand xmm3, xmm5 /* B */ \ + __asm pand xmm2, xmm6 /* G */ \ + __asm pand xmm0, xmm7 /* R */ \ + __asm por xmm3, xmm2 /* BG */ \ + __asm por xmm0, xmm3 /* BGR */ \ + __asm movdqa xmm3, xmm1 /* B next 4 pixels of argb */ \ + __asm movdqa xmm2, xmm1 /* G */ \ + __asm pslld xmm1, 8 /* R */ \ + __asm psrld xmm3, 3 /* B */ \ + __asm psrld xmm2, 5 /* G */ \ + __asm psrad xmm1, 16 /* R */ \ + __asm pand xmm3, xmm5 /* B */ \ + __asm pand xmm2, xmm6 /* G */ \ + __asm pand xmm1, xmm7 /* R */ \ + __asm por xmm3, xmm2 /* BG */ \ + __asm por xmm1, xmm3 /* BGR */ \ + __asm packssdw xmm0, xmm1 \ + __asm movdqu 0[edx], xmm0 /* store 8 pixels of RGB565 */ \ + __asm lea edx, [edx + 16]} + +// 8 pixels. +// 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes). +__declspec(naked) void I444ToARGBRow_SSSE3( + const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + __asm { + push esi + push edi + push ebx + mov eax, [esp + 12 + 4] // Y + mov esi, [esp + 12 + 8] // U + mov edi, [esp + 12 + 12] // V + mov edx, [esp + 12 + 16] // argb + mov ebx, [esp + 12 + 20] // yuvconstants + mov ecx, [esp + 12 + 24] // width + sub edi, esi + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + + convertloop: + READYUV444 + YUVTORGB(ebx) + STOREARGB + + sub ecx, 8 + jg convertloop + + pop ebx + pop edi + pop esi + ret + } +} + +// 8 pixels. 
+// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB24 (24 bytes). +__declspec(naked) void I422ToRGB24Row_SSSE3( + const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + __asm { + push esi + push edi + push ebx + mov eax, [esp + 12 + 4] // Y + mov esi, [esp + 12 + 8] // U + mov edi, [esp + 12 + 12] // V + mov edx, [esp + 12 + 16] // argb + mov ebx, [esp + 12 + 20] // yuvconstants + mov ecx, [esp + 12 + 24] // width + sub edi, esi + movdqa xmm5, xmmword ptr kShuffleMaskARGBToRGB24_0 + movdqa xmm6, xmmword ptr kShuffleMaskARGBToRGB24 + + convertloop: + READYUV422 + YUVTORGB(ebx) + STORERGB24 + + sub ecx, 8 + jg convertloop + + pop ebx + pop edi + pop esi + ret + } +} + +// 8 pixels +// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB565 (16 bytes). +__declspec(naked) void I422ToRGB565Row_SSSE3( + const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* rgb565_buf, + const struct YuvConstants* yuvconstants, + int width) { + __asm { + push esi + push edi + push ebx + mov eax, [esp + 12 + 4] // Y + mov esi, [esp + 12 + 8] // U + mov edi, [esp + 12 + 12] // V + mov edx, [esp + 12 + 16] // argb + mov ebx, [esp + 12 + 20] // yuvconstants + mov ecx, [esp + 12 + 24] // width + sub edi, esi + pcmpeqb xmm5, xmm5 // generate mask 0x0000001f + psrld xmm5, 27 + pcmpeqb xmm6, xmm6 // generate mask 0x000007e0 + psrld xmm6, 26 + pslld xmm6, 5 + pcmpeqb xmm7, xmm7 // generate mask 0xfffff800 + pslld xmm7, 11 + + convertloop: + READYUV422 + YUVTORGB(ebx) + STORERGB565 + + sub ecx, 8 + jg convertloop + + pop ebx + pop edi + pop esi + ret + } +} + +// 8 pixels. +// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). +__declspec(naked) void I422ToARGBRow_SSSE3( + const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + __asm { + push esi + push edi + push ebx + mov eax, [esp + 12 + 4] // Y + mov esi, [esp + 12 + 8] // U + mov edi, [esp + 12 + 12] // V + mov edx, [esp + 12 + 16] // argb + mov ebx, [esp + 12 + 20] // yuvconstants + mov ecx, [esp + 12 + 24] // width + sub edi, esi + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + + convertloop: + READYUV422 + YUVTORGB(ebx) + STOREARGB + + sub ecx, 8 + jg convertloop + + pop ebx + pop edi + pop esi + ret + } +} + +// 8 pixels. +// 4 UV values upsampled to 8 UV, mixed with 8 Y and 8 A producing 8 ARGB. +__declspec(naked) void I422AlphaToARGBRow_SSSE3( + const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + const uint8_t* a_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + __asm { + push esi + push edi + push ebx + push ebp + mov eax, [esp + 16 + 4] // Y + mov esi, [esp + 16 + 8] // U + mov edi, [esp + 16 + 12] // V + mov ebp, [esp + 16 + 16] // A + mov edx, [esp + 16 + 20] // argb + mov ebx, [esp + 16 + 24] // yuvconstants + mov ecx, [esp + 16 + 28] // width + sub edi, esi + + convertloop: + READYUVA422 + YUVTORGB(ebx) + STOREARGB + + sub ecx, 8 + jg convertloop + + pop ebp + pop ebx + pop edi + pop esi + ret + } +} + +// 8 pixels. +// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). 
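I422ToRGB565Row_SSSE3 above builds the three field masks (0x0000001f, 0x000007e0, 0xfffff800) and STORERGB565 then shifts each channel into place and ORs them together. For reference, the equivalent per-pixel packing is the usual 5-6-5 layout; this tiny sketch is not part of the file, just a restatement of what those shifts and masks compute.

#include <stdint.h>

// Pack one 8-bit-per-channel pixel into RGB565: R in the top 5 bits,
// G in the middle 6, B in the low 5 -- the layout STORERGB565 produces.
static uint16_t PackRGB565_Sketch(uint8_t r, uint8_t g, uint8_t b) {
  return (uint16_t)(((r >> 3) << 11) | ((g >> 2) << 5) | (b >> 3));
}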
+__declspec(naked) void NV12ToARGBRow_SSSE3( + const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + __asm { + push esi + push ebx + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // UV + mov edx, [esp + 8 + 12] // argb + mov ebx, [esp + 8 + 16] // yuvconstants + mov ecx, [esp + 8 + 20] // width + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + + convertloop: + READNV12 + YUVTORGB(ebx) + STOREARGB + + sub ecx, 8 + jg convertloop + + pop ebx + pop esi + ret + } +} + +// 8 pixels. +// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). +__declspec(naked) void NV21ToARGBRow_SSSE3( + const uint8_t* y_buf, + const uint8_t* vu_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + __asm { + push esi + push ebx + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // VU + mov edx, [esp + 8 + 12] // argb + mov ebx, [esp + 8 + 16] // yuvconstants + mov ecx, [esp + 8 + 20] // width + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + + convertloop: + READNV21 + YUVTORGB(ebx) + STOREARGB + + sub ecx, 8 + jg convertloop + + pop ebx + pop esi + ret + } +} + +// 8 pixels. +// 4 YUY2 values with 8 Y and 4 UV producing 8 ARGB (32 bytes). +__declspec(naked) void YUY2ToARGBRow_SSSE3( + const uint8_t* src_yuy2, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + __asm { + push ebx + mov eax, [esp + 4 + 4] // yuy2 + mov edx, [esp + 4 + 8] // argb + mov ebx, [esp + 4 + 12] // yuvconstants + mov ecx, [esp + 4 + 16] // width + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + + convertloop: + READYUY2 + YUVTORGB(ebx) + STOREARGB + + sub ecx, 8 + jg convertloop + + pop ebx + ret + } +} + +// 8 pixels. +// 4 UYVY values with 8 Y and 4 UV producing 8 ARGB (32 bytes). +__declspec(naked) void UYVYToARGBRow_SSSE3( + const uint8_t* src_uyvy, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + __asm { + push ebx + mov eax, [esp + 4 + 4] // uyvy + mov edx, [esp + 4 + 8] // argb + mov ebx, [esp + 4 + 12] // yuvconstants + mov ecx, [esp + 4 + 16] // width + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + + convertloop: + READUYVY + YUVTORGB(ebx) + STOREARGB + + sub ecx, 8 + jg convertloop + + pop ebx + ret + } +} + +__declspec(naked) void I422ToRGBARow_SSSE3( + const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_rgba, + const struct YuvConstants* yuvconstants, + int width) { + __asm { + push esi + push edi + push ebx + mov eax, [esp + 12 + 4] // Y + mov esi, [esp + 12 + 8] // U + mov edi, [esp + 12 + 12] // V + mov edx, [esp + 12 + 16] // argb + mov ebx, [esp + 12 + 20] // yuvconstants + mov ecx, [esp + 12 + 24] // width + sub edi, esi + + convertloop: + READYUV422 + YUVTORGB(ebx) + STORERGBA + + sub ecx, 8 + jg convertloop + + pop ebx + pop edi + pop esi + ret + } +} +#endif // HAS_I422TOARGBROW_SSSE3 + +#ifdef HAS_I400TOARGBROW_SSE2 +// 8 pixels of Y converted to 8 pixels of ARGB (32 bytes). 
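The packed 4:2:2 readers (READYUY2/READUYVY earlier and their AVX2 counterparts) differ only in where Y and UV sit inside each 4-byte macropixel; the pshufb tables encode exactly that byte picking. As a reference, the layouts are sketched below — standard YUY2 and UYVY ordering, not code lifted from this file.

#include <stdint.h>

// One 4-byte macropixel covers two horizontal pixels.
// YUY2: [Y0 U Y1 V]    UYVY: [U Y0 V Y1]
static void Yuy2Macropixel_Sketch(const uint8_t* p, uint8_t* y0, uint8_t* y1,
                                  uint8_t* u, uint8_t* v) {
  *y0 = p[0];
  *u  = p[1];
  *y1 = p[2];
  *v  = p[3];
}

static void UyvyMacropixel_Sketch(const uint8_t* p, uint8_t* y0, uint8_t* y1,
                                  uint8_t* u, uint8_t* v) {
  *u  = p[0];
  *y0 = p[1];
  *v  = p[2];
  *y1 = p[3];
}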
+__declspec(naked) void I400ToARGBRow_SSE2(const uint8_t* y_buf, + uint8_t* rgb_buf, + int width) { + __asm { + mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256) + movd xmm2, eax + pshufd xmm2, xmm2,0 + mov eax, 0x04880488 // 0488 = 1160 = round(1.164 * 64 * 16) + movd xmm3, eax + pshufd xmm3, xmm3, 0 + pcmpeqb xmm4, xmm4 // generate mask 0xff000000 + pslld xmm4, 24 + + mov eax, [esp + 4] // Y + mov edx, [esp + 8] // rgb + mov ecx, [esp + 12] // width + + convertloop: + // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164 + movq xmm0, qword ptr [eax] + lea eax, [eax + 8] + punpcklbw xmm0, xmm0 // Y.Y + pmulhuw xmm0, xmm2 + psubusw xmm0, xmm3 + psrlw xmm0, 6 + packuswb xmm0, xmm0 // G + + // Step 2: Weave into ARGB + punpcklbw xmm0, xmm0 // GG + movdqa xmm1, xmm0 + punpcklwd xmm0, xmm0 // BGRA first 4 pixels + punpckhwd xmm1, xmm1 // BGRA next 4 pixels + por xmm0, xmm4 + por xmm1, xmm4 + movdqu [edx], xmm0 + movdqu [edx + 16], xmm1 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + ret + } +} +#endif // HAS_I400TOARGBROW_SSE2 + +#ifdef HAS_I400TOARGBROW_AVX2 +// 16 pixels of Y converted to 16 pixels of ARGB (64 bytes). +// note: vpunpcklbw mutates and vpackuswb unmutates. +__declspec(naked) void I400ToARGBRow_AVX2(const uint8_t* y_buf, + uint8_t* rgb_buf, + int width) { + __asm { + mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256) + vmovd xmm2, eax + vbroadcastss ymm2, xmm2 + mov eax, 0x04880488 // 0488 = 1160 = round(1.164 * 64 * 16) + vmovd xmm3, eax + vbroadcastss ymm3, xmm3 + vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xff000000 + vpslld ymm4, ymm4, 24 + + mov eax, [esp + 4] // Y + mov edx, [esp + 8] // rgb + mov ecx, [esp + 12] // width + + convertloop: + // Step 1: Scale Y contriportbution to 16 G values. G = (y - 16) * 1.164 + vmovdqu xmm0, [eax] + lea eax, [eax + 16] + vpermq ymm0, ymm0, 0xd8 // vpunpcklbw mutates + vpunpcklbw ymm0, ymm0, ymm0 // Y.Y + vpmulhuw ymm0, ymm0, ymm2 + vpsubusw ymm0, ymm0, ymm3 + vpsrlw ymm0, ymm0, 6 + vpackuswb ymm0, ymm0, ymm0 // G. still mutated: 3120 + + // TODO(fbarchard): Weave alpha with unpack. + // Step 2: Weave into ARGB + vpunpcklbw ymm1, ymm0, ymm0 // GG - mutates + vpermq ymm1, ymm1, 0xd8 + vpunpcklwd ymm0, ymm1, ymm1 // GGGG first 8 pixels + vpunpckhwd ymm1, ymm1, ymm1 // GGGG next 8 pixels + vpor ymm0, ymm0, ymm4 + vpor ymm1, ymm1, ymm4 + vmovdqu [edx], ymm0 + vmovdqu [edx + 32], ymm1 + lea edx, [edx + 64] + sub ecx, 16 + jg convertloop + vzeroupper + ret + } +} +#endif // HAS_I400TOARGBROW_AVX2 + +#ifdef HAS_MIRRORROW_SSSE3 +// Shuffle table for reversing the bytes. +static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, + 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u}; + +// TODO(fbarchard): Replace lea with -16 offset. 
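The two I400ToARGB rows above expand studio-swing Y to full-swing gray with the 1.164 gain baked into 0x4a35 (18997) and the offset into 0x0488 (1160), then broadcast the result to B, G and R under an opaque alpha. The sketch below is just a scalar restatement of what that instruction sequence computes; the helper name is illustrative.

#include <stdint.h>

// Scalar equivalent of the I400ToARGBRow fixed-point sequence:
// punpcklbw y,y (-> y * 0x0101), pmulhuw 18997, psubusw 1160, psrlw 6,
// then splat to B/G/R and OR in 0xff000000 for alpha.
static uint32_t I400Pixel_Sketch(uint8_t y) {
  int g = (y * 0x0101 * 18997) >> 16;  // roughly y * 74.5
  g = g > 1160 ? (g - 1160) >> 6 : 0;  // psubusw saturates at zero
  if (g > 255) g = 255;                // packuswb saturates at 255
  return 0xff000000u | ((uint32_t)g << 16) | ((uint32_t)g << 8) | (uint32_t)g;
}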
+__declspec(naked) void MirrorRow_SSSE3(const uint8_t* src, + uint8_t* dst, + int width) { + __asm { + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst + mov ecx, [esp + 12] // width + movdqa xmm5, xmmword ptr kShuffleMirror + + convertloop: + movdqu xmm0, [eax - 16 + ecx] + pshufb xmm0, xmm5 + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 16 + jg convertloop + ret + } +} +#endif // HAS_MIRRORROW_SSSE3 + +#ifdef HAS_MIRRORROW_AVX2 +__declspec(naked) void MirrorRow_AVX2(const uint8_t* src, + uint8_t* dst, + int width) { + __asm { + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst + mov ecx, [esp + 12] // width + vbroadcastf128 ymm5, xmmword ptr kShuffleMirror + + convertloop: + vmovdqu ymm0, [eax - 32 + ecx] + vpshufb ymm0, ymm0, ymm5 + vpermq ymm0, ymm0, 0x4e // swap high and low halfs + vmovdqu [edx], ymm0 + lea edx, [edx + 32] + sub ecx, 32 + jg convertloop + vzeroupper + ret + } +} +#endif // HAS_MIRRORROW_AVX2 + +#ifdef HAS_MIRRORUVROW_SSSE3 +// Shuffle table for reversing the bytes of UV channels. +static const uvec8 kShuffleMirrorUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, + 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u}; + +__declspec(naked) void MirrorUVRow_SSSE3(const uint8_t* src, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + __asm { + push edi + mov eax, [esp + 4 + 4] // src + mov edx, [esp + 4 + 8] // dst_u + mov edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // width + movdqa xmm1, xmmword ptr kShuffleMirrorUV + lea eax, [eax + ecx * 2 - 16] + sub edi, edx + + convertloop: + movdqu xmm0, [eax] + lea eax, [eax - 16] + pshufb xmm0, xmm1 + movlpd qword ptr [edx], xmm0 + movhpd qword ptr [edx + edi], xmm0 + lea edx, [edx + 8] + sub ecx, 8 + jg convertloop + + pop edi + ret + } +} +#endif // HAS_MIRRORUVROW_SSSE3 + +#ifdef HAS_ARGBMIRRORROW_SSE2 +__declspec(naked) void ARGBMirrorRow_SSE2(const uint8_t* src, + uint8_t* dst, + int width) { + __asm { + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst + mov ecx, [esp + 12] // width + lea eax, [eax - 16 + ecx * 4] // last 4 pixels. + + convertloop: + movdqu xmm0, [eax] + lea eax, [eax - 16] + pshufd xmm0, xmm0, 0x1b + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 4 + jg convertloop + ret + } +} +#endif // HAS_ARGBMIRRORROW_SSE2 + +#ifdef HAS_ARGBMIRRORROW_AVX2 +// Shuffle table for reversing the bytes. 
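The mirror rows above all share one idea: read from the end of the source row, reverse the unit order (single bytes, UV pairs, or whole ARGB dwords) with a shuffle, and write forward into the destination. For the plain byte case the scalar shape is simply the loop below; the SSSE3/AVX2 versions do the same thing 16 or 32 bytes at a time with pshufb/vpermq.

#include <stdint.h>

// Scalar shape of MirrorRow: dst[x] takes the x-th byte from the end of src.
static void MirrorRow_Sketch(const uint8_t* src, uint8_t* dst, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst[x] = src[width - 1 - x];
  }
}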
+static const ulvec32 kARGBShuffleMirror_AVX2 = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u}; + +__declspec(naked) void ARGBMirrorRow_AVX2(const uint8_t* src, + uint8_t* dst, + int width) { + __asm { + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst + mov ecx, [esp + 12] // width + vmovdqu ymm5, ymmword ptr kARGBShuffleMirror_AVX2 + + convertloop: + vpermd ymm0, ymm5, [eax - 32 + ecx * 4] // permute dword order + vmovdqu [edx], ymm0 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + vzeroupper + ret + } +} +#endif // HAS_ARGBMIRRORROW_AVX2 + +#ifdef HAS_SPLITUVROW_SSE2 +__declspec(naked) void SplitUVRow_SSE2(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + __asm { + push edi + mov eax, [esp + 4 + 4] // src_uv + mov edx, [esp + 4 + 8] // dst_u + mov edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // width + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 + sub edi, edx + + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + lea eax, [eax + 32] + movdqa xmm2, xmm0 + movdqa xmm3, xmm1 + pand xmm0, xmm5 // even bytes + pand xmm1, xmm5 + packuswb xmm0, xmm1 + psrlw xmm2, 8 // odd bytes + psrlw xmm3, 8 + packuswb xmm2, xmm3 + movdqu [edx], xmm0 + movdqu [edx + edi], xmm2 + lea edx, [edx + 16] + sub ecx, 16 + jg convertloop + + pop edi + ret + } +} + +#endif // HAS_SPLITUVROW_SSE2 + +#ifdef HAS_SPLITUVROW_AVX2 +__declspec(naked) void SplitUVRow_AVX2(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + __asm { + push edi + mov eax, [esp + 4 + 4] // src_uv + mov edx, [esp + 4 + 8] // dst_u + mov edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // width + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff + vpsrlw ymm5, ymm5, 8 + sub edi, edx + + convertloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + lea eax, [eax + 64] + vpsrlw ymm2, ymm0, 8 // odd bytes + vpsrlw ymm3, ymm1, 8 + vpand ymm0, ymm0, ymm5 // even bytes + vpand ymm1, ymm1, ymm5 + vpackuswb ymm0, ymm0, ymm1 + vpackuswb ymm2, ymm2, ymm3 + vpermq ymm0, ymm0, 0xd8 + vpermq ymm2, ymm2, 0xd8 + vmovdqu [edx], ymm0 + vmovdqu [edx + edi], ymm2 + lea edx, [edx + 32] + sub ecx, 32 + jg convertloop + + pop edi + vzeroupper + ret + } +} +#endif // HAS_SPLITUVROW_AVX2 + +#ifdef HAS_MERGEUVROW_SSE2 +__declspec(naked) void MergeUVRow_SSE2(const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uv, + int width) { + __asm { + push edi + mov eax, [esp + 4 + 4] // src_u + mov edx, [esp + 4 + 8] // src_v + mov edi, [esp + 4 + 12] // dst_uv + mov ecx, [esp + 4 + 16] // width + sub edx, eax + + convertloop: + movdqu xmm0, [eax] // read 16 U's + movdqu xmm1, [eax + edx] // and 16 V's + lea eax, [eax + 16] + movdqa xmm2, xmm0 + punpcklbw xmm0, xmm1 // first 8 UV pairs + punpckhbw xmm2, xmm1 // next 8 UV pairs + movdqu [edi], xmm0 + movdqu [edi + 16], xmm2 + lea edi, [edi + 32] + sub ecx, 16 + jg convertloop + + pop edi + ret + } +} +#endif // HAS_MERGEUVROW_SSE2 + +#ifdef HAS_MERGEUVROW_AVX2 +__declspec(naked) void MergeUVRow_AVX2(const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uv, + int width) { + __asm { + push edi + mov eax, [esp + 4 + 4] // src_u + mov edx, [esp + 4 + 8] // src_v + mov edi, [esp + 4 + 12] // dst_uv + mov ecx, [esp + 4 + 16] // width + sub edx, eax + + convertloop: + vmovdqu ymm0, [eax] // read 32 U's + vmovdqu ymm1, [eax + edx] // and 32 V's + lea eax, [eax + 32] + vpunpcklbw ymm2, ymm0, ymm1 // low 16 UV pairs. mutated qqword 0,2 + vpunpckhbw ymm0, ymm0, ymm1 // high 16 UV pairs. 
mutated qqword 1,3 + vextractf128 [edi], ymm2, 0 // bytes 0..15 + vextractf128 [edi + 16], ymm0, 0 // bytes 16..31 + vextractf128 [edi + 32], ymm2, 1 // bytes 32..47 + vextractf128 [edi + 48], ymm0, 1 // bytes 47..63 + lea edi, [edi + 64] + sub ecx, 32 + jg convertloop + + pop edi + vzeroupper + ret + } +} +#endif // HAS_MERGEUVROW_AVX2 + +#ifdef HAS_COPYROW_SSE2 +// CopyRow copys 'width' bytes using a 16 byte load/store, 32 bytes at time. +__declspec(naked) void CopyRow_SSE2(const uint8_t* src, + uint8_t* dst, + int width) { + __asm { + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst + mov ecx, [esp + 12] // width + test eax, 15 + jne convertloopu + test edx, 15 + jne convertloopu + + convertloopa: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + lea eax, [eax + 32] + movdqa [edx], xmm0 + movdqa [edx + 16], xmm1 + lea edx, [edx + 32] + sub ecx, 32 + jg convertloopa + ret + + convertloopu: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + lea eax, [eax + 32] + movdqu [edx], xmm0 + movdqu [edx + 16], xmm1 + lea edx, [edx + 32] + sub ecx, 32 + jg convertloopu + ret + } +} +#endif // HAS_COPYROW_SSE2 + +#ifdef HAS_COPYROW_AVX +// CopyRow copys 'width' bytes using a 32 byte load/store, 64 bytes at time. +__declspec(naked) void CopyRow_AVX(const uint8_t* src, + uint8_t* dst, + int width) { + __asm { + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst + mov ecx, [esp + 12] // width + + convertloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + lea eax, [eax + 64] + vmovdqu [edx], ymm0 + vmovdqu [edx + 32], ymm1 + lea edx, [edx + 64] + sub ecx, 64 + jg convertloop + + vzeroupper + ret + } +} +#endif // HAS_COPYROW_AVX + +// Multiple of 1. +__declspec(naked) void CopyRow_ERMS(const uint8_t* src, + uint8_t* dst, + int width) { + __asm { + mov eax, esi + mov edx, edi + mov esi, [esp + 4] // src + mov edi, [esp + 8] // dst + mov ecx, [esp + 12] // width + rep movsb + mov edi, edx + mov esi, eax + ret + } +} + +#ifdef HAS_ARGBCOPYALPHAROW_SSE2 +// width in pixels +__declspec(naked) void ARGBCopyAlphaRow_SSE2(const uint8_t* src, + uint8_t* dst, + int width) { + __asm { + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst + mov ecx, [esp + 12] // width + pcmpeqb xmm0, xmm0 // generate mask 0xff000000 + pslld xmm0, 24 + pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff + psrld xmm1, 8 + + convertloop: + movdqu xmm2, [eax] + movdqu xmm3, [eax + 16] + lea eax, [eax + 32] + movdqu xmm4, [edx] + movdqu xmm5, [edx + 16] + pand xmm2, xmm0 + pand xmm3, xmm0 + pand xmm4, xmm1 + pand xmm5, xmm1 + por xmm2, xmm4 + por xmm3, xmm5 + movdqu [edx], xmm2 + movdqu [edx + 16], xmm3 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + + ret + } +} +#endif // HAS_ARGBCOPYALPHAROW_SSE2 + +#ifdef HAS_ARGBCOPYALPHAROW_AVX2 +// width in pixels +__declspec(naked) void ARGBCopyAlphaRow_AVX2(const uint8_t* src, + uint8_t* dst, + int width) { + __asm { + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst + mov ecx, [esp + 12] // width + vpcmpeqb ymm0, ymm0, ymm0 + vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff + + convertloop: + vmovdqu ymm1, [eax] + vmovdqu ymm2, [eax + 32] + lea eax, [eax + 64] + vpblendvb ymm1, ymm1, [edx], ymm0 + vpblendvb ymm2, ymm2, [edx + 32], ymm0 + vmovdqu [edx], ymm1 + vmovdqu [edx + 32], ymm2 + lea edx, [edx + 64] + sub ecx, 16 + jg convertloop + + vzeroupper + ret + } +} +#endif // HAS_ARGBCOPYALPHAROW_AVX2 + +#ifdef HAS_ARGBEXTRACTALPHAROW_SSE2 +// width in pixels +__declspec(naked) void ARGBExtractAlphaRow_SSE2(const uint8_t* src_argb, + uint8_t* dst_a, + int width) { + __asm { + 
mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_a + mov ecx, [esp + 12] // width + + extractloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + lea eax, [eax + 32] + psrld xmm0, 24 + psrld xmm1, 24 + packssdw xmm0, xmm1 + packuswb xmm0, xmm0 + movq qword ptr [edx], xmm0 + lea edx, [edx + 8] + sub ecx, 8 + jg extractloop + + ret + } +} +#endif // HAS_ARGBEXTRACTALPHAROW_SSE2 + +#ifdef HAS_ARGBEXTRACTALPHAROW_AVX2 +// width in pixels +__declspec(naked) void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_a, + int width) { + __asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_a + mov ecx, [esp + 12] // width + vmovdqa ymm4, ymmword ptr kPermdARGBToY_AVX + + extractloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + vpsrld ymm0, ymm0, 24 + vpsrld ymm1, ymm1, 24 + vmovdqu ymm2, [eax + 64] + vmovdqu ymm3, [eax + 96] + lea eax, [eax + 128] + vpackssdw ymm0, ymm0, ymm1 // mutates + vpsrld ymm2, ymm2, 24 + vpsrld ymm3, ymm3, 24 + vpackssdw ymm2, ymm2, ymm3 // mutates + vpackuswb ymm0, ymm0, ymm2 // mutates + vpermd ymm0, ymm4, ymm0 // unmutate + vmovdqu [edx], ymm0 + lea edx, [edx + 32] + sub ecx, 32 + jg extractloop + + vzeroupper + ret + } +} +#endif // HAS_ARGBEXTRACTALPHAROW_AVX2 + +#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2 +// width in pixels +__declspec(naked) void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src, + uint8_t* dst, + int width) { + __asm { + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst + mov ecx, [esp + 12] // width + pcmpeqb xmm0, xmm0 // generate mask 0xff000000 + pslld xmm0, 24 + pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff + psrld xmm1, 8 + + convertloop: + movq xmm2, qword ptr [eax] // 8 Y's + lea eax, [eax + 8] + punpcklbw xmm2, xmm2 + punpckhwd xmm3, xmm2 + punpcklwd xmm2, xmm2 + movdqu xmm4, [edx] + movdqu xmm5, [edx + 16] + pand xmm2, xmm0 + pand xmm3, xmm0 + pand xmm4, xmm1 + pand xmm5, xmm1 + por xmm2, xmm4 + por xmm3, xmm5 + movdqu [edx], xmm2 + movdqu [edx + 16], xmm3 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + + ret + } +} +#endif // HAS_ARGBCOPYYTOALPHAROW_SSE2 + +#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2 +// width in pixels +__declspec(naked) void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src, + uint8_t* dst, + int width) { + __asm { + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst + mov ecx, [esp + 12] // width + vpcmpeqb ymm0, ymm0, ymm0 + vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff + + convertloop: + vpmovzxbd ymm1, qword ptr [eax] + vpmovzxbd ymm2, qword ptr [eax + 8] + lea eax, [eax + 16] + vpslld ymm1, ymm1, 24 + vpslld ymm2, ymm2, 24 + vpblendvb ymm1, ymm1, [edx], ymm0 + vpblendvb ymm2, ymm2, [edx + 32], ymm0 + vmovdqu [edx], ymm1 + vmovdqu [edx + 32], ymm2 + lea edx, [edx + 64] + sub ecx, 16 + jg convertloop + + vzeroupper + ret + } +} +#endif // HAS_ARGBCOPYYTOALPHAROW_AVX2 + +#ifdef HAS_SETROW_X86 +// Write 'width' bytes using an 8 bit value repeated. +// width should be multiple of 4. +__declspec(naked) void SetRow_X86(uint8_t* dst, uint8_t v8, int width) { + __asm { + movzx eax, byte ptr [esp + 8] // v8 + mov edx, 0x01010101 // Duplicate byte to all bytes. + mul edx // overwrites edx with upper part of result. + mov edx, edi + mov edi, [esp + 4] // dst + mov ecx, [esp + 12] // width + shr ecx, 2 + rep stosd + mov edi, edx + ret + } +} + +// Write 'width' bytes using an 8 bit value repeated. 
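SetRow_X86 above widens the 8-bit fill value to a 32-bit pattern with `mul` against 0x01010101 so that `rep stosd` can store four bytes per iteration (hence the multiple-of-4 width note), while SetRow_ERMS below relies on the `rep stosb` fast-string path instead. A portable sketch of the dword-splat idea, with a scalar tail added here only for completeness:

#include <stdint.h>
#include <string.h>

// Splat an 8-bit value into a 32-bit pattern and store a dword at a time,
// mirroring SetRow_X86's v8 * 0x01010101 + rep stosd. The tail loop is an
// addition in this sketch; the asm assumes width is a multiple of 4.
static void SetRow_Sketch(uint8_t* dst, uint8_t v8, int width) {
  uint32_t v32 = v8 * 0x01010101u;  // e.g. 0x5A -> 0x5A5A5A5A
  int x = 0;
  for (; x + 4 <= width; x += 4) {
    memcpy(dst + x, &v32, 4);  // byte order is irrelevant: all four bytes equal
  }
  for (; x < width; ++x) {
    dst[x] = v8;
  }
}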
+__declspec(naked) void SetRow_ERMS(uint8_t* dst, uint8_t v8, int width) { + __asm { + mov edx, edi + mov edi, [esp + 4] // dst + mov eax, [esp + 8] // v8 + mov ecx, [esp + 12] // width + rep stosb + mov edi, edx + ret + } +} + +// Write 'width' 32 bit values. +__declspec(naked) void ARGBSetRow_X86(uint8_t* dst_argb, + uint32_t v32, + int width) { + __asm { + mov edx, edi + mov edi, [esp + 4] // dst + mov eax, [esp + 8] // v32 + mov ecx, [esp + 12] // width + rep stosd + mov edi, edx + ret + } +} +#endif // HAS_SETROW_X86 + +#ifdef HAS_YUY2TOYROW_AVX2 +__declspec(naked) void YUY2ToYRow_AVX2(const uint8_t* src_yuy2, + uint8_t* dst_y, + int width) { + __asm { + mov eax, [esp + 4] // src_yuy2 + mov edx, [esp + 8] // dst_y + mov ecx, [esp + 12] // width + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff + vpsrlw ymm5, ymm5, 8 + + convertloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + lea eax, [eax + 64] + vpand ymm0, ymm0, ymm5 // even bytes are Y + vpand ymm1, ymm1, ymm5 + vpackuswb ymm0, ymm0, ymm1 // mutates. + vpermq ymm0, ymm0, 0xd8 + vmovdqu [edx], ymm0 + lea edx, [edx + 32] + sub ecx, 32 + jg convertloop + vzeroupper + ret + } +} + +__declspec(naked) void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2, + int stride_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_yuy2 + mov esi, [esp + 8 + 8] // stride_yuy2 + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // width + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff + vpsrlw ymm5, ymm5, 8 + sub edi, edx + + convertloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + vpavgb ymm0, ymm0, [eax + esi] + vpavgb ymm1, ymm1, [eax + esi + 32] + lea eax, [eax + 64] + vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV + vpsrlw ymm1, ymm1, 8 + vpackuswb ymm0, ymm0, ymm1 // mutates. + vpermq ymm0, ymm0, 0xd8 + vpand ymm1, ymm0, ymm5 // U + vpsrlw ymm0, ymm0, 8 // V + vpackuswb ymm1, ymm1, ymm1 // mutates. + vpackuswb ymm0, ymm0, ymm0 // mutates. + vpermq ymm1, ymm1, 0xd8 + vpermq ymm0, ymm0, 0xd8 + vextractf128 [edx], ymm1, 0 // U + vextractf128 [edx + edi], ymm0, 0 // V + lea edx, [edx + 16] + sub ecx, 32 + jg convertloop + + pop edi + pop esi + vzeroupper + ret + } +} + +__declspec(naked) void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + __asm { + push edi + mov eax, [esp + 4 + 4] // src_yuy2 + mov edx, [esp + 4 + 8] // dst_u + mov edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // width + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff + vpsrlw ymm5, ymm5, 8 + sub edi, edx + + convertloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + lea eax, [eax + 64] + vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV + vpsrlw ymm1, ymm1, 8 + vpackuswb ymm0, ymm0, ymm1 // mutates. + vpermq ymm0, ymm0, 0xd8 + vpand ymm1, ymm0, ymm5 // U + vpsrlw ymm0, ymm0, 8 // V + vpackuswb ymm1, ymm1, ymm1 // mutates. + vpackuswb ymm0, ymm0, ymm0 // mutates. 
+ vpermq ymm1, ymm1, 0xd8 + vpermq ymm0, ymm0, 0xd8 + vextractf128 [edx], ymm1, 0 // U + vextractf128 [edx + edi], ymm0, 0 // V + lea edx, [edx + 16] + sub ecx, 32 + jg convertloop + + pop edi + vzeroupper + ret + } +} + +__declspec(naked) void UYVYToYRow_AVX2(const uint8_t* src_uyvy, + uint8_t* dst_y, + int width) { + __asm { + mov eax, [esp + 4] // src_uyvy + mov edx, [esp + 8] // dst_y + mov ecx, [esp + 12] // width + + convertloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + lea eax, [eax + 64] + vpsrlw ymm0, ymm0, 8 // odd bytes are Y + vpsrlw ymm1, ymm1, 8 + vpackuswb ymm0, ymm0, ymm1 // mutates. + vpermq ymm0, ymm0, 0xd8 + vmovdqu [edx], ymm0 + lea edx, [edx + 32] + sub ecx, 32 + jg convertloop + vzeroupper + ret + } +} + +__declspec(naked) void UYVYToUVRow_AVX2(const uint8_t* src_uyvy, + int stride_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_yuy2 + mov esi, [esp + 8 + 8] // stride_yuy2 + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // width + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff + vpsrlw ymm5, ymm5, 8 + sub edi, edx + + convertloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + vpavgb ymm0, ymm0, [eax + esi] + vpavgb ymm1, ymm1, [eax + esi + 32] + lea eax, [eax + 64] + vpand ymm0, ymm0, ymm5 // UYVY -> UVUV + vpand ymm1, ymm1, ymm5 + vpackuswb ymm0, ymm0, ymm1 // mutates. + vpermq ymm0, ymm0, 0xd8 + vpand ymm1, ymm0, ymm5 // U + vpsrlw ymm0, ymm0, 8 // V + vpackuswb ymm1, ymm1, ymm1 // mutates. + vpackuswb ymm0, ymm0, ymm0 // mutates. + vpermq ymm1, ymm1, 0xd8 + vpermq ymm0, ymm0, 0xd8 + vextractf128 [edx], ymm1, 0 // U + vextractf128 [edx + edi], ymm0, 0 // V + lea edx, [edx + 16] + sub ecx, 32 + jg convertloop + + pop edi + pop esi + vzeroupper + ret + } +} + +__declspec(naked) void UYVYToUV422Row_AVX2(const uint8_t* src_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + __asm { + push edi + mov eax, [esp + 4 + 4] // src_yuy2 + mov edx, [esp + 4 + 8] // dst_u + mov edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // width + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff + vpsrlw ymm5, ymm5, 8 + sub edi, edx + + convertloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + lea eax, [eax + 64] + vpand ymm0, ymm0, ymm5 // UYVY -> UVUV + vpand ymm1, ymm1, ymm5 + vpackuswb ymm0, ymm0, ymm1 // mutates. + vpermq ymm0, ymm0, 0xd8 + vpand ymm1, ymm0, ymm5 // U + vpsrlw ymm0, ymm0, 8 // V + vpackuswb ymm1, ymm1, ymm1 // mutates. + vpackuswb ymm0, ymm0, ymm0 // mutates. 
+ vpermq ymm1, ymm1, 0xd8 + vpermq ymm0, ymm0, 0xd8 + vextractf128 [edx], ymm1, 0 // U + vextractf128 [edx + edi], ymm0, 0 // V + lea edx, [edx + 16] + sub ecx, 32 + jg convertloop + + pop edi + vzeroupper + ret + } +} +#endif // HAS_YUY2TOYROW_AVX2 + +#ifdef HAS_YUY2TOYROW_SSE2 +__declspec(naked) void YUY2ToYRow_SSE2(const uint8_t* src_yuy2, + uint8_t* dst_y, + int width) { + __asm { + mov eax, [esp + 4] // src_yuy2 + mov edx, [esp + 8] // dst_y + mov ecx, [esp + 12] // width + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 + + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + lea eax, [eax + 32] + pand xmm0, xmm5 // even bytes are Y + pand xmm1, xmm5 + packuswb xmm0, xmm1 + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 16 + jg convertloop + ret + } +} + +__declspec(naked) void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2, + int stride_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_yuy2 + mov esi, [esp + 8 + 8] // stride_yuy2 + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // width + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 + sub edi, edx + + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + esi] + movdqu xmm3, [eax + esi + 16] + lea eax, [eax + 32] + pavgb xmm0, xmm2 + pavgb xmm1, xmm3 + psrlw xmm0, 8 // YUYV -> UVUV + psrlw xmm1, 8 + packuswb xmm0, xmm1 + movdqa xmm1, xmm0 + pand xmm0, xmm5 // U + packuswb xmm0, xmm0 + psrlw xmm1, 8 // V + packuswb xmm1, xmm1 + movq qword ptr [edx], xmm0 + movq qword ptr [edx + edi], xmm1 + lea edx, [edx + 8] + sub ecx, 16 + jg convertloop + + pop edi + pop esi + ret + } +} + +__declspec(naked) void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + __asm { + push edi + mov eax, [esp + 4 + 4] // src_yuy2 + mov edx, [esp + 4 + 8] // dst_u + mov edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // width + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 + sub edi, edx + + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + lea eax, [eax + 32] + psrlw xmm0, 8 // YUYV -> UVUV + psrlw xmm1, 8 + packuswb xmm0, xmm1 + movdqa xmm1, xmm0 + pand xmm0, xmm5 // U + packuswb xmm0, xmm0 + psrlw xmm1, 8 // V + packuswb xmm1, xmm1 + movq qword ptr [edx], xmm0 + movq qword ptr [edx + edi], xmm1 + lea edx, [edx + 8] + sub ecx, 16 + jg convertloop + + pop edi + ret + } +} + +__declspec(naked) void UYVYToYRow_SSE2(const uint8_t* src_uyvy, + uint8_t* dst_y, + int width) { + __asm { + mov eax, [esp + 4] // src_uyvy + mov edx, [esp + 8] // dst_y + mov ecx, [esp + 12] // width + + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + lea eax, [eax + 32] + psrlw xmm0, 8 // odd bytes are Y + psrlw xmm1, 8 + packuswb xmm0, xmm1 + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 16 + jg convertloop + ret + } +} + +__declspec(naked) void UYVYToUVRow_SSE2(const uint8_t* src_uyvy, + int stride_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_yuy2 + mov esi, [esp + 8 + 8] // stride_yuy2 + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // width + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 + sub edi, edx + + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + esi] + movdqu xmm3, [eax + esi + 16] + lea eax, [eax + 32] + pavgb xmm0, 
xmm2 + pavgb xmm1, xmm3 + pand xmm0, xmm5 // UYVY -> UVUV + pand xmm1, xmm5 + packuswb xmm0, xmm1 + movdqa xmm1, xmm0 + pand xmm0, xmm5 // U + packuswb xmm0, xmm0 + psrlw xmm1, 8 // V + packuswb xmm1, xmm1 + movq qword ptr [edx], xmm0 + movq qword ptr [edx + edi], xmm1 + lea edx, [edx + 8] + sub ecx, 16 + jg convertloop + + pop edi + pop esi + ret + } +} + +__declspec(naked) void UYVYToUV422Row_SSE2(const uint8_t* src_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + __asm { + push edi + mov eax, [esp + 4 + 4] // src_yuy2 + mov edx, [esp + 4 + 8] // dst_u + mov edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // width + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 + sub edi, edx + + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + lea eax, [eax + 32] + pand xmm0, xmm5 // UYVY -> UVUV + pand xmm1, xmm5 + packuswb xmm0, xmm1 + movdqa xmm1, xmm0 + pand xmm0, xmm5 // U + packuswb xmm0, xmm0 + psrlw xmm1, 8 // V + packuswb xmm1, xmm1 + movq qword ptr [edx], xmm0 + movq qword ptr [edx + edi], xmm1 + lea edx, [edx + 8] + sub ecx, 16 + jg convertloop + + pop edi + ret + } +} +#endif // HAS_YUY2TOYROW_SSE2 + +#ifdef HAS_BLENDPLANEROW_SSSE3 +// Blend 8 pixels at a time. +// unsigned version of math +// =((A2*C2)+(B2*(255-C2))+255)/256 +// signed version of math +// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256 +__declspec(naked) void BlendPlaneRow_SSSE3(const uint8_t* src0, + const uint8_t* src1, + const uint8_t* alpha, + uint8_t* dst, + int width) { + __asm { + push esi + push edi + pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00 + psllw xmm5, 8 + mov eax, 0x80808080 // 128 for biasing image to signed. + movd xmm6, eax + pshufd xmm6, xmm6, 0x00 + + mov eax, 0x807f807f // 32768 + 127 for unbias and round. + movd xmm7, eax + pshufd xmm7, xmm7, 0x00 + mov eax, [esp + 8 + 4] // src0 + mov edx, [esp + 8 + 8] // src1 + mov esi, [esp + 8 + 12] // alpha + mov edi, [esp + 8 + 16] // dst + mov ecx, [esp + 8 + 20] // width + sub eax, esi + sub edx, esi + sub edi, esi + + // 8 pixel loop. + convertloop8: + movq xmm0, qword ptr [esi] // alpha + punpcklbw xmm0, xmm0 + pxor xmm0, xmm5 // a, 255-a + movq xmm1, qword ptr [eax + esi] // src0 + movq xmm2, qword ptr [edx + esi] // src1 + punpcklbw xmm1, xmm2 + psubb xmm1, xmm6 // bias src0/1 - 128 + pmaddubsw xmm0, xmm1 + paddw xmm0, xmm7 // unbias result - 32768 and round. + psrlw xmm0, 8 + packuswb xmm0, xmm0 + movq qword ptr [edi + esi], xmm0 + lea esi, [esi + 8] + sub ecx, 8 + jg convertloop8 + + pop edi + pop esi + ret + } +} +#endif // HAS_BLENDPLANEROW_SSSE3 + +#ifdef HAS_BLENDPLANEROW_AVX2 +// Blend 32 pixels at a time. +// unsigned version of math +// =((A2*C2)+(B2*(255-C2))+255)/256 +// signed version of math +// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256 +__declspec(naked) void BlendPlaneRow_AVX2(const uint8_t* src0, + const uint8_t* src1, + const uint8_t* alpha, + uint8_t* dst, + int width) { + __asm { + push esi + push edi + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff00ff00 + vpsllw ymm5, ymm5, 8 + mov eax, 0x80808080 // 128 for biasing image to signed. + vmovd xmm6, eax + vbroadcastss ymm6, xmm6 + mov eax, 0x807f807f // 32768 + 127 for unbias and round. + vmovd xmm7, eax + vbroadcastss ymm7, xmm7 + mov eax, [esp + 8 + 4] // src0 + mov edx, [esp + 8 + 8] // src1 + mov esi, [esp + 8 + 12] // alpha + mov edi, [esp + 8 + 16] // dst + mov ecx, [esp + 8 + 20] // width + sub eax, esi + sub edx, esi + sub edi, esi + + // 32 pixel loop. 
+ convertloop32: + vmovdqu ymm0, [esi] // alpha + vpunpckhbw ymm3, ymm0, ymm0 // 8..15, 24..31 + vpunpcklbw ymm0, ymm0, ymm0 // 0..7, 16..23 + vpxor ymm3, ymm3, ymm5 // a, 255-a + vpxor ymm0, ymm0, ymm5 // a, 255-a + vmovdqu ymm1, [eax + esi] // src0 + vmovdqu ymm2, [edx + esi] // src1 + vpunpckhbw ymm4, ymm1, ymm2 + vpunpcklbw ymm1, ymm1, ymm2 + vpsubb ymm4, ymm4, ymm6 // bias src0/1 - 128 + vpsubb ymm1, ymm1, ymm6 // bias src0/1 - 128 + vpmaddubsw ymm3, ymm3, ymm4 + vpmaddubsw ymm0, ymm0, ymm1 + vpaddw ymm3, ymm3, ymm7 // unbias result - 32768 and round. + vpaddw ymm0, ymm0, ymm7 // unbias result - 32768 and round. + vpsrlw ymm3, ymm3, 8 + vpsrlw ymm0, ymm0, 8 + vpackuswb ymm0, ymm0, ymm3 + vmovdqu [edi + esi], ymm0 + lea esi, [esi + 32] + sub ecx, 32 + jg convertloop32 + + pop edi + pop esi + vzeroupper + ret + } +} +#endif // HAS_BLENDPLANEROW_AVX2 + +#ifdef HAS_ARGBBLENDROW_SSSE3 +// Shuffle table for isolating alpha. +static const uvec8 kShuffleAlpha = {3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80, + 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80}; + +// Blend 8 pixels at a time. +__declspec(naked) void ARGBBlendRow_SSSE3(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_argb0 + mov esi, [esp + 4 + 8] // src_argb1 + mov edx, [esp + 4 + 12] // dst_argb + mov ecx, [esp + 4 + 16] // width + pcmpeqb xmm7, xmm7 // generate constant 0x0001 + psrlw xmm7, 15 + pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff + psrlw xmm6, 8 + pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00 + psllw xmm5, 8 + pcmpeqb xmm4, xmm4 // generate mask 0xff000000 + pslld xmm4, 24 + sub ecx, 4 + jl convertloop4b // less than 4 pixels? + + // 4 pixel loop. + convertloop4: + movdqu xmm3, [eax] // src argb + lea eax, [eax + 16] + movdqa xmm0, xmm3 // src argb + pxor xmm3, xmm4 // ~alpha + movdqu xmm2, [esi] // _r_b + pshufb xmm3, xmmword ptr kShuffleAlpha // alpha + pand xmm2, xmm6 // _r_b + paddw xmm3, xmm7 // 256 - alpha + pmullw xmm2, xmm3 // _r_b * alpha + movdqu xmm1, [esi] // _a_g + lea esi, [esi + 16] + psrlw xmm1, 8 // _a_g + por xmm0, xmm4 // set alpha to 255 + pmullw xmm1, xmm3 // _a_g * alpha + psrlw xmm2, 8 // _r_b convert to 8 bits again + paddusb xmm0, xmm2 // + src argb + pand xmm1, xmm5 // a_g_ convert to 8 bits again + paddusb xmm0, xmm1 // + src argb + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 4 + jge convertloop4 + + convertloop4b: + add ecx, 4 - 1 + jl convertloop1b + + // 1 pixel loop. + convertloop1: + movd xmm3, [eax] // src argb + lea eax, [eax + 4] + movdqa xmm0, xmm3 // src argb + pxor xmm3, xmm4 // ~alpha + movd xmm2, [esi] // _r_b + pshufb xmm3, xmmword ptr kShuffleAlpha // alpha + pand xmm2, xmm6 // _r_b + paddw xmm3, xmm7 // 256 - alpha + pmullw xmm2, xmm3 // _r_b * alpha + movd xmm1, [esi] // _a_g + lea esi, [esi + 4] + psrlw xmm1, 8 // _a_g + por xmm0, xmm4 // set alpha to 255 + pmullw xmm1, xmm3 // _a_g * alpha + psrlw xmm2, 8 // _r_b convert to 8 bits again + paddusb xmm0, xmm2 // + src argb + pand xmm1, xmm5 // a_g_ convert to 8 bits again + paddusb xmm0, xmm1 // + src argb + movd [edx], xmm0 + lea edx, [edx + 4] + sub ecx, 1 + jge convertloop1 + + convertloop1b: + pop esi + ret + } +} +#endif // HAS_ARGBBLENDROW_SSSE3 + +#ifdef HAS_ARGBATTENUATEROW_SSSE3 +// Shuffle table duplicating alpha. 
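Both BlendPlaneRow versions above implement the rounded byte lerp given in their comments, ((A*a)+(B*(255-a))+255)/256. The signed form exists only because pmaddubsw multiplies an unsigned byte by a signed byte: biasing both sources by -128 keeps the products within signed 16-bit range, and the +32768+127 constant restores that bias plus the +255 rounding, so the two expressions are numerically identical. A one-pixel reference of the unsigned form, as an illustrative sketch:

#include <stdint.h>

// Rounded 8-bit lerp used by BlendPlaneRow:
// dst = (src0 * alpha + src1 * (255 - alpha) + 255) >> 8
// alpha = 255 returns src0 exactly; alpha = 0 returns src1 exactly.
static uint8_t BlendPixel_Sketch(uint8_t src0, uint8_t src1, uint8_t alpha) {
  return (uint8_t)((src0 * alpha + src1 * (255 - alpha) + 255) >> 8);
}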
+static const uvec8 kShuffleAlpha0 = { + 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u, +}; +static const uvec8 kShuffleAlpha1 = { + 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u, + 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u, +}; +__declspec(naked) void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_argb, + int width) { + __asm { + mov eax, [esp + 4] // src_argb0 + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // width + pcmpeqb xmm3, xmm3 // generate mask 0xff000000 + pslld xmm3, 24 + movdqa xmm4, xmmword ptr kShuffleAlpha0 + movdqa xmm5, xmmword ptr kShuffleAlpha1 + + convertloop: + movdqu xmm0, [eax] // read 4 pixels + pshufb xmm0, xmm4 // isolate first 2 alphas + movdqu xmm1, [eax] // read 4 pixels + punpcklbw xmm1, xmm1 // first 2 pixel rgbs + pmulhuw xmm0, xmm1 // rgb * a + movdqu xmm1, [eax] // read 4 pixels + pshufb xmm1, xmm5 // isolate next 2 alphas + movdqu xmm2, [eax] // read 4 pixels + punpckhbw xmm2, xmm2 // next 2 pixel rgbs + pmulhuw xmm1, xmm2 // rgb * a + movdqu xmm2, [eax] // mask original alpha + lea eax, [eax + 16] + pand xmm2, xmm3 + psrlw xmm0, 8 + psrlw xmm1, 8 + packuswb xmm0, xmm1 + por xmm0, xmm2 // copy original alpha + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 4 + jg convertloop + + ret + } +} +#endif // HAS_ARGBATTENUATEROW_SSSE3 + +#ifdef HAS_ARGBATTENUATEROW_AVX2 +// Shuffle table duplicating alpha. +static const uvec8 kShuffleAlpha_AVX2 = {6u, 7u, 6u, 7u, 6u, 7u, + 128u, 128u, 14u, 15u, 14u, 15u, + 14u, 15u, 128u, 128u}; +__declspec(naked) void ARGBAttenuateRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_argb, + int width) { + __asm { + mov eax, [esp + 4] // src_argb0 + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // width + sub edx, eax + vbroadcastf128 ymm4, xmmword ptr kShuffleAlpha_AVX2 + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000 + vpslld ymm5, ymm5, 24 + + convertloop: + vmovdqu ymm6, [eax] // read 8 pixels. + vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated. + vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated. + vpshufb ymm2, ymm0, ymm4 // low 4 alphas + vpshufb ymm3, ymm1, ymm4 // high 4 alphas + vpmulhuw ymm0, ymm0, ymm2 // rgb * a + vpmulhuw ymm1, ymm1, ymm3 // rgb * a + vpand ymm6, ymm6, ymm5 // isolate alpha + vpsrlw ymm0, ymm0, 8 + vpsrlw ymm1, ymm1, 8 + vpackuswb ymm0, ymm0, ymm1 // unmutated. + vpor ymm0, ymm0, ymm6 // copy original alpha + vmovdqu [eax + edx], ymm0 + lea eax, [eax + 32] + sub ecx, 8 + jg convertloop + + vzeroupper + ret + } +} +#endif // HAS_ARGBATTENUATEROW_AVX2 + +#ifdef HAS_ARGBUNATTENUATEROW_SSE2 +// Unattenuate 4 pixels at a time. +__declspec(naked) void ARGBUnattenuateRow_SSE2(const uint8_t* src_argb, + uint8_t* dst_argb, + int width) { + __asm { + push ebx + push esi + push edi + mov eax, [esp + 12 + 4] // src_argb + mov edx, [esp + 12 + 8] // dst_argb + mov ecx, [esp + 12 + 12] // width + lea ebx, fixed_invtbl8 + + convertloop: + movdqu xmm0, [eax] // read 4 pixels + movzx esi, byte ptr [eax + 3] // first alpha + movzx edi, byte ptr [eax + 7] // second alpha + punpcklbw xmm0, xmm0 // first 2 + movd xmm2, dword ptr [ebx + esi * 4] + movd xmm3, dword ptr [ebx + edi * 4] + pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words. 
1, a, a, a + pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words + movlhps xmm2, xmm3 + pmulhuw xmm0, xmm2 // rgb * a + + movdqu xmm1, [eax] // read 4 pixels + movzx esi, byte ptr [eax + 11] // third alpha + movzx edi, byte ptr [eax + 15] // forth alpha + punpckhbw xmm1, xmm1 // next 2 + movd xmm2, dword ptr [ebx + esi * 4] + movd xmm3, dword ptr [ebx + edi * 4] + pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words + pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words + movlhps xmm2, xmm3 + pmulhuw xmm1, xmm2 // rgb * a + lea eax, [eax + 16] + packuswb xmm0, xmm1 + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 4 + jg convertloop + + pop edi + pop esi + pop ebx + ret + } +} +#endif // HAS_ARGBUNATTENUATEROW_SSE2 + +#ifdef HAS_ARGBUNATTENUATEROW_AVX2 +// Shuffle table duplicating alpha. +static const uvec8 kUnattenShuffleAlpha_AVX2 = { + 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u}; +// TODO(fbarchard): Enable USE_GATHER for future hardware if faster. +// USE_GATHER is not on by default, due to being a slow instruction. +#ifdef USE_GATHER +__declspec(naked) void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_argb, + int width) { + __asm { + mov eax, [esp + 4] // src_argb0 + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // width + sub edx, eax + vbroadcastf128 ymm4, xmmword ptr kUnattenShuffleAlpha_AVX2 + + convertloop: + vmovdqu ymm6, [eax] // read 8 pixels. + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xffffffff for gather. + vpsrld ymm2, ymm6, 24 // alpha in low 8 bits. + vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated. + vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated. + vpgatherdd ymm3, [ymm2 * 4 + fixed_invtbl8], ymm5 // ymm5 cleared. 1, a + vpunpcklwd ymm2, ymm3, ymm3 // low 4 inverted alphas. mutated. 1, 1, a, a + vpunpckhwd ymm3, ymm3, ymm3 // high 4 inverted alphas. mutated. + vpshufb ymm2, ymm2, ymm4 // replicate low 4 alphas. 1, a, a, a + vpshufb ymm3, ymm3, ymm4 // replicate high 4 alphas + vpmulhuw ymm0, ymm0, ymm2 // rgb * ia + vpmulhuw ymm1, ymm1, ymm3 // rgb * ia + vpackuswb ymm0, ymm0, ymm1 // unmutated. 
+ vmovdqu [eax + edx], ymm0 + lea eax, [eax + 32] + sub ecx, 8 + jg convertloop + + vzeroupper + ret + } +} +#else // USE_GATHER +__declspec(naked) void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_argb, + int width) { + __asm { + + push ebx + push esi + push edi + mov eax, [esp + 12 + 4] // src_argb + mov edx, [esp + 12 + 8] // dst_argb + mov ecx, [esp + 12 + 12] // width + sub edx, eax + lea ebx, fixed_invtbl8 + vbroadcastf128 ymm5, xmmword ptr kUnattenShuffleAlpha_AVX2 + + convertloop: + // replace VPGATHER + movzx esi, byte ptr [eax + 3] // alpha0 + movzx edi, byte ptr [eax + 7] // alpha1 + vmovd xmm0, dword ptr [ebx + esi * 4] // [1,a0] + vmovd xmm1, dword ptr [ebx + edi * 4] // [1,a1] + movzx esi, byte ptr [eax + 11] // alpha2 + movzx edi, byte ptr [eax + 15] // alpha3 + vpunpckldq xmm6, xmm0, xmm1 // [1,a1,1,a0] + vmovd xmm2, dword ptr [ebx + esi * 4] // [1,a2] + vmovd xmm3, dword ptr [ebx + edi * 4] // [1,a3] + movzx esi, byte ptr [eax + 19] // alpha4 + movzx edi, byte ptr [eax + 23] // alpha5 + vpunpckldq xmm7, xmm2, xmm3 // [1,a3,1,a2] + vmovd xmm0, dword ptr [ebx + esi * 4] // [1,a4] + vmovd xmm1, dword ptr [ebx + edi * 4] // [1,a5] + movzx esi, byte ptr [eax + 27] // alpha6 + movzx edi, byte ptr [eax + 31] // alpha7 + vpunpckldq xmm0, xmm0, xmm1 // [1,a5,1,a4] + vmovd xmm2, dword ptr [ebx + esi * 4] // [1,a6] + vmovd xmm3, dword ptr [ebx + edi * 4] // [1,a7] + vpunpckldq xmm2, xmm2, xmm3 // [1,a7,1,a6] + vpunpcklqdq xmm3, xmm6, xmm7 // [1,a3,1,a2,1,a1,1,a0] + vpunpcklqdq xmm0, xmm0, xmm2 // [1,a7,1,a6,1,a5,1,a4] + vinserti128 ymm3, ymm3, xmm0, 1 // [1,a7,1,a6,1,a5,1,a4,1,a3,1,a2,1,a1,1,a0] + // end of VPGATHER + + vmovdqu ymm6, [eax] // read 8 pixels. + vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated. + vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated. + vpunpcklwd ymm2, ymm3, ymm3 // low 4 inverted alphas. mutated. 1, 1, a, a + vpunpckhwd ymm3, ymm3, ymm3 // high 4 inverted alphas. mutated. + vpshufb ymm2, ymm2, ymm5 // replicate low 4 alphas. 1, a, a, a + vpshufb ymm3, ymm3, ymm5 // replicate high 4 alphas + vpmulhuw ymm0, ymm0, ymm2 // rgb * ia + vpmulhuw ymm1, ymm1, ymm3 // rgb * ia + vpackuswb ymm0, ymm0, ymm1 // unmutated. + vmovdqu [eax + edx], ymm0 + lea eax, [eax + 32] + sub ecx, 8 + jg convertloop + + pop edi + pop esi + pop ebx + vzeroupper + ret + } +} +#endif // USE_GATHER +#endif // HAS_ARGBATTENUATEROW_AVX2 + +#ifdef HAS_ARGBGRAYROW_SSSE3 +// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels. +__declspec(naked) void ARGBGrayRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_argb, + int width) { + __asm { + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_argb */ + mov ecx, [esp + 12] /* width */ + movdqa xmm4, xmmword ptr kARGBToYJ + movdqa xmm5, xmmword ptr kAddYJ64 + + convertloop: + movdqu xmm0, [eax] // G + movdqu xmm1, [eax + 16] + pmaddubsw xmm0, xmm4 + pmaddubsw xmm1, xmm4 + phaddw xmm0, xmm1 + paddw xmm0, xmm5 // Add .5 for rounding. 
+ psrlw xmm0, 7 + packuswb xmm0, xmm0 // 8 G bytes + movdqu xmm2, [eax] // A + movdqu xmm3, [eax + 16] + lea eax, [eax + 32] + psrld xmm2, 24 + psrld xmm3, 24 + packuswb xmm2, xmm3 + packuswb xmm2, xmm2 // 8 A bytes + movdqa xmm3, xmm0 // Weave into GG, GA, then GGGA + punpcklbw xmm0, xmm0 // 8 GG words + punpcklbw xmm3, xmm2 // 8 GA words + movdqa xmm1, xmm0 + punpcklwd xmm0, xmm3 // GGGA first 4 + punpckhwd xmm1, xmm3 // GGGA next 4 + movdqu [edx], xmm0 + movdqu [edx + 16], xmm1 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + ret + } +} +#endif // HAS_ARGBGRAYROW_SSSE3 + +#ifdef HAS_ARGBSEPIAROW_SSSE3 +// b = (r * 35 + g * 68 + b * 17) >> 7 +// g = (r * 45 + g * 88 + b * 22) >> 7 +// r = (r * 50 + g * 98 + b * 24) >> 7 +// Constant for ARGB color to sepia tone. +static const vec8 kARGBToSepiaB = {17, 68, 35, 0, 17, 68, 35, 0, + 17, 68, 35, 0, 17, 68, 35, 0}; + +static const vec8 kARGBToSepiaG = {22, 88, 45, 0, 22, 88, 45, 0, + 22, 88, 45, 0, 22, 88, 45, 0}; + +static const vec8 kARGBToSepiaR = {24, 98, 50, 0, 24, 98, 50, 0, + 24, 98, 50, 0, 24, 98, 50, 0}; + +// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. +__declspec(naked) void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width) { + __asm { + mov eax, [esp + 4] /* dst_argb */ + mov ecx, [esp + 8] /* width */ + movdqa xmm2, xmmword ptr kARGBToSepiaB + movdqa xmm3, xmmword ptr kARGBToSepiaG + movdqa xmm4, xmmword ptr kARGBToSepiaR + + convertloop: + movdqu xmm0, [eax] // B + movdqu xmm6, [eax + 16] + pmaddubsw xmm0, xmm2 + pmaddubsw xmm6, xmm2 + phaddw xmm0, xmm6 + psrlw xmm0, 7 + packuswb xmm0, xmm0 // 8 B values + movdqu xmm5, [eax] // G + movdqu xmm1, [eax + 16] + pmaddubsw xmm5, xmm3 + pmaddubsw xmm1, xmm3 + phaddw xmm5, xmm1 + psrlw xmm5, 7 + packuswb xmm5, xmm5 // 8 G values + punpcklbw xmm0, xmm5 // 8 BG values + movdqu xmm5, [eax] // R + movdqu xmm1, [eax + 16] + pmaddubsw xmm5, xmm4 + pmaddubsw xmm1, xmm4 + phaddw xmm5, xmm1 + psrlw xmm5, 7 + packuswb xmm5, xmm5 // 8 R values + movdqu xmm6, [eax] // A + movdqu xmm1, [eax + 16] + psrld xmm6, 24 + psrld xmm1, 24 + packuswb xmm6, xmm1 + packuswb xmm6, xmm6 // 8 A values + punpcklbw xmm5, xmm6 // 8 RA values + movdqa xmm1, xmm0 // Weave BG, RA together + punpcklwd xmm0, xmm5 // BGRA first 4 + punpckhwd xmm1, xmm5 // BGRA next 4 + movdqu [eax], xmm0 + movdqu [eax + 16], xmm1 + lea eax, [eax + 32] + sub ecx, 8 + jg convertloop + ret + } +} +#endif // HAS_ARGBSEPIAROW_SSSE3 + +#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3 +// Tranform 8 ARGB pixels (32 bytes) with color matrix. +// Same as Sepia except matrix is provided. +// TODO(fbarchard): packuswbs only use half of the reg. To make RGBA, combine R +// and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd. 
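The color-matrix kernel that follows multiplies each pixel's B, G, R and A bytes by a row of four signed 8-bit coefficients, sums the products, shifts right by 6 (psraw) and saturates to 0..255 (packuswb). A minimal scalar sketch of that mapping, added here as an editor illustration and not part of the patch (the helper name is hypothetical):

static void ARGBColorMatrixRow_Sketch(const uint8_t* src_argb,
                                      uint8_t* dst_argb,
                                      const int8_t* matrix_argb,
                                      int width) {
  // matrix_argb holds 4 rows of 4 signed coefficients: B row, G row, R row, A row.
  int i, c;
  for (i = 0; i < width; ++i) {
    const int b = src_argb[0], g = src_argb[1], r = src_argb[2], a = src_argb[3];
    for (c = 0; c < 4; ++c) {
      const int8_t* m = matrix_argb + c * 4;
      int v = (b * m[0] + g * m[1] + r * m[2] + a * m[3]) >> 6;  // psraw 6
      dst_argb[c] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));  // packuswb saturation
    }
    src_argb += 4;
    dst_argb += 4;
  }
}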
+__declspec(naked) void ARGBColorMatrixRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_argb, + const int8_t* matrix_argb, + int width) { + __asm { + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_argb */ + mov ecx, [esp + 12] /* matrix_argb */ + movdqu xmm5, [ecx] + pshufd xmm2, xmm5, 0x00 + pshufd xmm3, xmm5, 0x55 + pshufd xmm4, xmm5, 0xaa + pshufd xmm5, xmm5, 0xff + mov ecx, [esp + 16] /* width */ + + convertloop: + movdqu xmm0, [eax] // B + movdqu xmm7, [eax + 16] + pmaddubsw xmm0, xmm2 + pmaddubsw xmm7, xmm2 + movdqu xmm6, [eax] // G + movdqu xmm1, [eax + 16] + pmaddubsw xmm6, xmm3 + pmaddubsw xmm1, xmm3 + phaddsw xmm0, xmm7 // B + phaddsw xmm6, xmm1 // G + psraw xmm0, 6 // B + psraw xmm6, 6 // G + packuswb xmm0, xmm0 // 8 B values + packuswb xmm6, xmm6 // 8 G values + punpcklbw xmm0, xmm6 // 8 BG values + movdqu xmm1, [eax] // R + movdqu xmm7, [eax + 16] + pmaddubsw xmm1, xmm4 + pmaddubsw xmm7, xmm4 + phaddsw xmm1, xmm7 // R + movdqu xmm6, [eax] // A + movdqu xmm7, [eax + 16] + pmaddubsw xmm6, xmm5 + pmaddubsw xmm7, xmm5 + phaddsw xmm6, xmm7 // A + psraw xmm1, 6 // R + psraw xmm6, 6 // A + packuswb xmm1, xmm1 // 8 R values + packuswb xmm6, xmm6 // 8 A values + punpcklbw xmm1, xmm6 // 8 RA values + movdqa xmm6, xmm0 // Weave BG, RA together + punpcklwd xmm0, xmm1 // BGRA first 4 + punpckhwd xmm6, xmm1 // BGRA next 4 + movdqu [edx], xmm0 + movdqu [edx + 16], xmm6 + lea eax, [eax + 32] + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + ret + } +} +#endif // HAS_ARGBCOLORMATRIXROW_SSSE3 + +#ifdef HAS_ARGBQUANTIZEROW_SSE2 +// Quantize 4 ARGB pixels (16 bytes). +__declspec(naked) void ARGBQuantizeRow_SSE2(uint8_t* dst_argb, + int scale, + int interval_size, + int interval_offset, + int width) { + __asm { + mov eax, [esp + 4] /* dst_argb */ + movd xmm2, [esp + 8] /* scale */ + movd xmm3, [esp + 12] /* interval_size */ + movd xmm4, [esp + 16] /* interval_offset */ + mov ecx, [esp + 20] /* width */ + pshuflw xmm2, xmm2, 040h + pshufd xmm2, xmm2, 044h + pshuflw xmm3, xmm3, 040h + pshufd xmm3, xmm3, 044h + pshuflw xmm4, xmm4, 040h + pshufd xmm4, xmm4, 044h + pxor xmm5, xmm5 // constant 0 + pcmpeqb xmm6, xmm6 // generate mask 0xff000000 + pslld xmm6, 24 + + convertloop: + movdqu xmm0, [eax] // read 4 pixels + punpcklbw xmm0, xmm5 // first 2 pixels + pmulhuw xmm0, xmm2 // pixel * scale >> 16 + movdqu xmm1, [eax] // read 4 pixels + punpckhbw xmm1, xmm5 // next 2 pixels + pmulhuw xmm1, xmm2 + pmullw xmm0, xmm3 // * interval_size + movdqu xmm7, [eax] // read 4 pixels + pmullw xmm1, xmm3 + pand xmm7, xmm6 // mask alpha + paddw xmm0, xmm4 // + interval_size / 2 + paddw xmm1, xmm4 + packuswb xmm0, xmm1 + por xmm0, xmm7 + movdqu [eax], xmm0 + lea eax, [eax + 16] + sub ecx, 4 + jg convertloop + ret + } +} +#endif // HAS_ARGBQUANTIZEROW_SSE2 + +#ifdef HAS_ARGBSHADEROW_SSE2 +// Shade 4 pixels at a time by specified value. 
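ARGBShadeRow below scales every channel by the matching byte of value in fixed point: both operands are widened to 16 bits by byte duplication (x * 0x0101), multiplied with pmulhuw and then shifted right 8 more bits, which works out to roughly src * value / 255 per channel. A scalar sketch of the same arithmetic (editor illustration, not part of the patch; the helper name is hypothetical):

static void ARGBShadeRow_Sketch(const uint8_t* src_argb, uint8_t* dst_argb,
                                int width, uint32_t value) {
  int i;
  for (i = 0; i < width * 4; ++i) {
    uint32_t p = src_argb[i] * 0x0101u;                        // punpcklbw x, x
    uint32_t v = ((value >> ((i & 3) * 8)) & 0xffu) * 0x0101u; // duplicated value byte
    dst_argb[i] = (uint8_t)((p * v) >> 24);                    // pmulhuw, then psrlw 8
  }
}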
+__declspec(naked) void ARGBShadeRow_SSE2(const uint8_t* src_argb, + uint8_t* dst_argb, + int width, + uint32_t value) { + __asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // width + movd xmm2, [esp + 16] // value + punpcklbw xmm2, xmm2 + punpcklqdq xmm2, xmm2 + + convertloop: + movdqu xmm0, [eax] // read 4 pixels + lea eax, [eax + 16] + movdqa xmm1, xmm0 + punpcklbw xmm0, xmm0 // first 2 + punpckhbw xmm1, xmm1 // next 2 + pmulhuw xmm0, xmm2 // argb * value + pmulhuw xmm1, xmm2 // argb * value + psrlw xmm0, 8 + psrlw xmm1, 8 + packuswb xmm0, xmm1 + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 4 + jg convertloop + + ret + } +} +#endif // HAS_ARGBSHADEROW_SSE2 + +#ifdef HAS_ARGBMULTIPLYROW_SSE2 +// Multiply 2 rows of ARGB pixels together, 4 pixels at a time. +__declspec(naked) void ARGBMultiplyRow_SSE2(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_argb0 + mov esi, [esp + 4 + 8] // src_argb1 + mov edx, [esp + 4 + 12] // dst_argb + mov ecx, [esp + 4 + 16] // width + pxor xmm5, xmm5 // constant 0 + + convertloop: + movdqu xmm0, [eax] // read 4 pixels from src_argb0 + movdqu xmm2, [esi] // read 4 pixels from src_argb1 + movdqu xmm1, xmm0 + movdqu xmm3, xmm2 + punpcklbw xmm0, xmm0 // first 2 + punpckhbw xmm1, xmm1 // next 2 + punpcklbw xmm2, xmm5 // first 2 + punpckhbw xmm3, xmm5 // next 2 + pmulhuw xmm0, xmm2 // src_argb0 * src_argb1 first 2 + pmulhuw xmm1, xmm3 // src_argb0 * src_argb1 next 2 + lea eax, [eax + 16] + lea esi, [esi + 16] + packuswb xmm0, xmm1 + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 4 + jg convertloop + + pop esi + ret + } +} +#endif // HAS_ARGBMULTIPLYROW_SSE2 + +#ifdef HAS_ARGBADDROW_SSE2 +// Add 2 rows of ARGB pixels together, 4 pixels at a time. +// TODO(fbarchard): Port this to posix, neon and other math functions. +__declspec(naked) void ARGBAddRow_SSE2(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_argb0 + mov esi, [esp + 4 + 8] // src_argb1 + mov edx, [esp + 4 + 12] // dst_argb + mov ecx, [esp + 4 + 16] // width + + sub ecx, 4 + jl convertloop49 + + convertloop4: + movdqu xmm0, [eax] // read 4 pixels from src_argb0 + lea eax, [eax + 16] + movdqu xmm1, [esi] // read 4 pixels from src_argb1 + lea esi, [esi + 16] + paddusb xmm0, xmm1 // src_argb0 + src_argb1 + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 4 + jge convertloop4 + + convertloop49: + add ecx, 4 - 1 + jl convertloop19 + + convertloop1: + movd xmm0, [eax] // read 1 pixels from src_argb0 + lea eax, [eax + 4] + movd xmm1, [esi] // read 1 pixels from src_argb1 + lea esi, [esi + 4] + paddusb xmm0, xmm1 // src_argb0 + src_argb1 + movd [edx], xmm0 + lea edx, [edx + 4] + sub ecx, 1 + jge convertloop1 + + convertloop19: + pop esi + ret + } +} +#endif // HAS_ARGBADDROW_SSE2 + +#ifdef HAS_ARGBSUBTRACTROW_SSE2 +// Subtract 2 rows of ARGB pixels together, 4 pixels at a time. 
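The multiply and add kernels above and the subtract kernel below reduce to simple per-byte operations: multiply uses the same 8.8-style fixed point as the shade kernel, while add and subtract saturate in unsigned 8 bits. A combined scalar sketch (editor illustration, not part of the patch; the helper name and the op selector are hypothetical):

static void ARGBArithRow_Sketch(const uint8_t* src_argb0,
                                const uint8_t* src_argb1,
                                uint8_t* dst_argb, int width, char op) {
  int i;
  for (i = 0; i < width * 4; ++i) {
    int a = src_argb0[i], b = src_argb1[i], v;
    if (op == '*') {
      v = (int)(((uint32_t)a * 0x0101u * (uint32_t)b) >> 16);  // pmulhuw, ~ a * b / 255
    } else if (op == '+') {
      v = a + b;                                               // paddusb
    } else {
      v = a - b;                                               // psubusb
    }
    dst_argb[i] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
  }
}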
+__declspec(naked) void ARGBSubtractRow_SSE2(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_argb0 + mov esi, [esp + 4 + 8] // src_argb1 + mov edx, [esp + 4 + 12] // dst_argb + mov ecx, [esp + 4 + 16] // width + + convertloop: + movdqu xmm0, [eax] // read 4 pixels from src_argb0 + lea eax, [eax + 16] + movdqu xmm1, [esi] // read 4 pixels from src_argb1 + lea esi, [esi + 16] + psubusb xmm0, xmm1 // src_argb0 - src_argb1 + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 4 + jg convertloop + + pop esi + ret + } +} +#endif // HAS_ARGBSUBTRACTROW_SSE2 + +#ifdef HAS_ARGBMULTIPLYROW_AVX2 +// Multiply 2 rows of ARGB pixels together, 8 pixels at a time. +__declspec(naked) void ARGBMultiplyRow_AVX2(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_argb0 + mov esi, [esp + 4 + 8] // src_argb1 + mov edx, [esp + 4 + 12] // dst_argb + mov ecx, [esp + 4 + 16] // width + vpxor ymm5, ymm5, ymm5 // constant 0 + + convertloop: + vmovdqu ymm1, [eax] // read 8 pixels from src_argb0 + lea eax, [eax + 32] + vmovdqu ymm3, [esi] // read 8 pixels from src_argb1 + lea esi, [esi + 32] + vpunpcklbw ymm0, ymm1, ymm1 // low 4 + vpunpckhbw ymm1, ymm1, ymm1 // high 4 + vpunpcklbw ymm2, ymm3, ymm5 // low 4 + vpunpckhbw ymm3, ymm3, ymm5 // high 4 + vpmulhuw ymm0, ymm0, ymm2 // src_argb0 * src_argb1 low 4 + vpmulhuw ymm1, ymm1, ymm3 // src_argb0 * src_argb1 high 4 + vpackuswb ymm0, ymm0, ymm1 + vmovdqu [edx], ymm0 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + + pop esi + vzeroupper + ret + } +} +#endif // HAS_ARGBMULTIPLYROW_AVX2 + +#ifdef HAS_ARGBADDROW_AVX2 +// Add 2 rows of ARGB pixels together, 8 pixels at a time. +__declspec(naked) void ARGBAddRow_AVX2(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_argb0 + mov esi, [esp + 4 + 8] // src_argb1 + mov edx, [esp + 4 + 12] // dst_argb + mov ecx, [esp + 4 + 16] // width + + convertloop: + vmovdqu ymm0, [eax] // read 8 pixels from src_argb0 + lea eax, [eax + 32] + vpaddusb ymm0, ymm0, [esi] // add 8 pixels from src_argb1 + lea esi, [esi + 32] + vmovdqu [edx], ymm0 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + + pop esi + vzeroupper + ret + } +} +#endif // HAS_ARGBADDROW_AVX2 + +#ifdef HAS_ARGBSUBTRACTROW_AVX2 +// Subtract 2 rows of ARGB pixels together, 8 pixels at a time. 
+__declspec(naked) void ARGBSubtractRow_AVX2(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_argb0 + mov esi, [esp + 4 + 8] // src_argb1 + mov edx, [esp + 4 + 12] // dst_argb + mov ecx, [esp + 4 + 16] // width + + convertloop: + vmovdqu ymm0, [eax] // read 8 pixels from src_argb0 + lea eax, [eax + 32] + vpsubusb ymm0, ymm0, [esi] // src_argb0 - src_argb1 + lea esi, [esi + 32] + vmovdqu [edx], ymm0 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + + pop esi + vzeroupper + ret + } +} +#endif // HAS_ARGBSUBTRACTROW_AVX2 + +#ifdef HAS_SOBELXROW_SSE2 +// SobelX as a matrix is +// -1 0 1 +// -2 0 2 +// -1 0 1 +__declspec(naked) void SobelXRow_SSE2(const uint8_t* src_y0, + const uint8_t* src_y1, + const uint8_t* src_y2, + uint8_t* dst_sobelx, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_y0 + mov esi, [esp + 8 + 8] // src_y1 + mov edi, [esp + 8 + 12] // src_y2 + mov edx, [esp + 8 + 16] // dst_sobelx + mov ecx, [esp + 8 + 20] // width + sub esi, eax + sub edi, eax + sub edx, eax + pxor xmm5, xmm5 // constant 0 + + convertloop: + movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0] + movq xmm1, qword ptr [eax + 2] // read 8 pixels from src_y0[2] + punpcklbw xmm0, xmm5 + punpcklbw xmm1, xmm5 + psubw xmm0, xmm1 + movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0] + movq xmm2, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2] + punpcklbw xmm1, xmm5 + punpcklbw xmm2, xmm5 + psubw xmm1, xmm2 + movq xmm2, qword ptr [eax + edi] // read 8 pixels from src_y2[0] + movq xmm3, qword ptr [eax + edi + 2] // read 8 pixels from src_y2[2] + punpcklbw xmm2, xmm5 + punpcklbw xmm3, xmm5 + psubw xmm2, xmm3 + paddw xmm0, xmm2 + paddw xmm0, xmm1 + paddw xmm0, xmm1 + pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). SSSE3 could use pabsw + psubw xmm1, xmm0 + pmaxsw xmm0, xmm1 + packuswb xmm0, xmm0 + movq qword ptr [eax + edx], xmm0 + lea eax, [eax + 8] + sub ecx, 8 + jg convertloop + + pop edi + pop esi + ret + } +} +#endif // HAS_SOBELXROW_SSE2 + +#ifdef HAS_SOBELYROW_SSE2 +// SobelY as a matrix is +// -1 -2 -1 +// 0 0 0 +// 1 2 1 +__declspec(naked) void SobelYRow_SSE2(const uint8_t* src_y0, + const uint8_t* src_y1, + uint8_t* dst_sobely, + int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_y0 + mov esi, [esp + 4 + 8] // src_y1 + mov edx, [esp + 4 + 12] // dst_sobely + mov ecx, [esp + 4 + 16] // width + sub esi, eax + sub edx, eax + pxor xmm5, xmm5 // constant 0 + + convertloop: + movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0] + movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0] + punpcklbw xmm0, xmm5 + punpcklbw xmm1, xmm5 + psubw xmm0, xmm1 + movq xmm1, qword ptr [eax + 1] // read 8 pixels from src_y0[1] + movq xmm2, qword ptr [eax + esi + 1] // read 8 pixels from src_y1[1] + punpcklbw xmm1, xmm5 + punpcklbw xmm2, xmm5 + psubw xmm1, xmm2 + movq xmm2, qword ptr [eax + 2] // read 8 pixels from src_y0[2] + movq xmm3, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2] + punpcklbw xmm2, xmm5 + punpcklbw xmm3, xmm5 + psubw xmm2, xmm3 + paddw xmm0, xmm2 + paddw xmm0, xmm1 + paddw xmm0, xmm1 + pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). 
SSSE3 could use pabsw + psubw xmm1, xmm0 + pmaxsw xmm0, xmm1 + packuswb xmm0, xmm0 + movq qword ptr [eax + edx], xmm0 + lea eax, [eax + 8] + sub ecx, 8 + jg convertloop + + pop esi + ret + } +} +#endif // HAS_SOBELYROW_SSE2 + +#ifdef HAS_SOBELROW_SSE2 +// Adds Sobel X and Sobel Y and stores Sobel into ARGB. +// A = 255 +// R = Sobel +// G = Sobel +// B = Sobel +__declspec(naked) void SobelRow_SSE2(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, + int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_sobelx + mov esi, [esp + 4 + 8] // src_sobely + mov edx, [esp + 4 + 12] // dst_argb + mov ecx, [esp + 4 + 16] // width + sub esi, eax + pcmpeqb xmm5, xmm5 // alpha 255 + pslld xmm5, 24 // 0xff000000 + + convertloop: + movdqu xmm0, [eax] // read 16 pixels src_sobelx + movdqu xmm1, [eax + esi] // read 16 pixels src_sobely + lea eax, [eax + 16] + paddusb xmm0, xmm1 // sobel = sobelx + sobely + movdqa xmm2, xmm0 // GG + punpcklbw xmm2, xmm0 // First 8 + punpckhbw xmm0, xmm0 // Next 8 + movdqa xmm1, xmm2 // GGGG + punpcklwd xmm1, xmm2 // First 4 + punpckhwd xmm2, xmm2 // Next 4 + por xmm1, xmm5 // GGGA + por xmm2, xmm5 + movdqa xmm3, xmm0 // GGGG + punpcklwd xmm3, xmm0 // Next 4 + punpckhwd xmm0, xmm0 // Last 4 + por xmm3, xmm5 // GGGA + por xmm0, xmm5 + movdqu [edx], xmm1 + movdqu [edx + 16], xmm2 + movdqu [edx + 32], xmm3 + movdqu [edx + 48], xmm0 + lea edx, [edx + 64] + sub ecx, 16 + jg convertloop + + pop esi + ret + } +} +#endif // HAS_SOBELROW_SSE2 + +#ifdef HAS_SOBELTOPLANEROW_SSE2 +// Adds Sobel X and Sobel Y and stores Sobel into a plane. +__declspec(naked) void SobelToPlaneRow_SSE2(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_y, + int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_sobelx + mov esi, [esp + 4 + 8] // src_sobely + mov edx, [esp + 4 + 12] // dst_argb + mov ecx, [esp + 4 + 16] // width + sub esi, eax + + convertloop: + movdqu xmm0, [eax] // read 16 pixels src_sobelx + movdqu xmm1, [eax + esi] // read 16 pixels src_sobely + lea eax, [eax + 16] + paddusb xmm0, xmm1 // sobel = sobelx + sobely + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 16 + jg convertloop + + pop esi + ret + } +} +#endif // HAS_SOBELTOPLANEROW_SSE2 + +#ifdef HAS_SOBELXYROW_SSE2 +// Mixes Sobel X, Sobel Y and Sobel into ARGB. 
+// A = 255 +// R = Sobel X +// G = Sobel +// B = Sobel Y +__declspec(naked) void SobelXYRow_SSE2(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, + int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_sobelx + mov esi, [esp + 4 + 8] // src_sobely + mov edx, [esp + 4 + 12] // dst_argb + mov ecx, [esp + 4 + 16] // width + sub esi, eax + pcmpeqb xmm5, xmm5 // alpha 255 + + convertloop: + movdqu xmm0, [eax] // read 16 pixels src_sobelx + movdqu xmm1, [eax + esi] // read 16 pixels src_sobely + lea eax, [eax + 16] + movdqa xmm2, xmm0 + paddusb xmm2, xmm1 // sobel = sobelx + sobely + movdqa xmm3, xmm0 // XA + punpcklbw xmm3, xmm5 + punpckhbw xmm0, xmm5 + movdqa xmm4, xmm1 // YS + punpcklbw xmm4, xmm2 + punpckhbw xmm1, xmm2 + movdqa xmm6, xmm4 // YSXA + punpcklwd xmm6, xmm3 // First 4 + punpckhwd xmm4, xmm3 // Next 4 + movdqa xmm7, xmm1 // YSXA + punpcklwd xmm7, xmm0 // Next 4 + punpckhwd xmm1, xmm0 // Last 4 + movdqu [edx], xmm6 + movdqu [edx + 16], xmm4 + movdqu [edx + 32], xmm7 + movdqu [edx + 48], xmm1 + lea edx, [edx + 64] + sub ecx, 16 + jg convertloop + + pop esi + ret + } +} +#endif // HAS_SOBELXYROW_SSE2 + +#ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 +// Consider float CumulativeSum. +// Consider calling CumulativeSum one row at time as needed. +// Consider circular CumulativeSum buffer of radius * 2 + 1 height. +// Convert cumulative sum for an area to an average for 1 pixel. +// topleft is pointer to top left of CumulativeSum buffer for area. +// botleft is pointer to bottom left of CumulativeSum buffer. +// width is offset from left to right of area in CumulativeSum buffer measured +// in number of ints. +// area is the number of pixels in the area being averaged. +// dst points to pixel to store result to. +// count is number of averaged pixels to produce. +// Does 4 pixels at a time. +// This function requires alignment on accumulation buffer pointers. +void CumulativeSumToAverageRow_SSE2(const int32_t* topleft, + const int32_t* botleft, + int width, + int area, + uint8_t* dst, + int count) { + __asm { + mov eax, topleft // eax topleft + mov esi, botleft // esi botleft + mov edx, width + movd xmm5, area + mov edi, dst + mov ecx, count + cvtdq2ps xmm5, xmm5 + rcpss xmm4, xmm5 // 1.0f / area + pshufd xmm4, xmm4, 0 + sub ecx, 4 + jl l4b + + cmp area, 128 // 128 pixels will not overflow 15 bits. + ja l4 + + pshufd xmm5, xmm5, 0 // area + pcmpeqb xmm6, xmm6 // constant of 65536.0 - 1 = 65535.0 + psrld xmm6, 16 + cvtdq2ps xmm6, xmm6 + addps xmm5, xmm6 // (65536.0 + area - 1) + mulps xmm5, xmm4 // (65536.0 + area - 1) * 1 / area + cvtps2dq xmm5, xmm5 // 0.16 fixed point + packssdw xmm5, xmm5 // 16 bit shorts + + // 4 pixel loop small blocks. 
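  // [Editor's note, not part of the patch] In this small-block path the
  // reciprocal is folded into a 0.16 fixed-point scale:
  //   scale ~= (65536 + area - 1) / area
  // so pmulhuw(sum, scale) computes (sum * scale) >> 16 ~= sum / area.
  // The "area <= 128" guard keeps each box sum (at most 128 * 255 = 32640)
  // within 15 bits, so the packssdw of the sums inside the loop does not
  // saturate.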
+ s4: + // top left + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] + + // - top right + psubd xmm0, [eax + edx * 4] + psubd xmm1, [eax + edx * 4 + 16] + psubd xmm2, [eax + edx * 4 + 32] + psubd xmm3, [eax + edx * 4 + 48] + lea eax, [eax + 64] + + // - bottom left + psubd xmm0, [esi] + psubd xmm1, [esi + 16] + psubd xmm2, [esi + 32] + psubd xmm3, [esi + 48] + + // + bottom right + paddd xmm0, [esi + edx * 4] + paddd xmm1, [esi + edx * 4 + 16] + paddd xmm2, [esi + edx * 4 + 32] + paddd xmm3, [esi + edx * 4 + 48] + lea esi, [esi + 64] + + packssdw xmm0, xmm1 // pack 4 pixels into 2 registers + packssdw xmm2, xmm3 + + pmulhuw xmm0, xmm5 + pmulhuw xmm2, xmm5 + + packuswb xmm0, xmm2 + movdqu [edi], xmm0 + lea edi, [edi + 16] + sub ecx, 4 + jge s4 + + jmp l4b + + // 4 pixel loop + l4: + // top left + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] + + // - top right + psubd xmm0, [eax + edx * 4] + psubd xmm1, [eax + edx * 4 + 16] + psubd xmm2, [eax + edx * 4 + 32] + psubd xmm3, [eax + edx * 4 + 48] + lea eax, [eax + 64] + + // - bottom left + psubd xmm0, [esi] + psubd xmm1, [esi + 16] + psubd xmm2, [esi + 32] + psubd xmm3, [esi + 48] + + // + bottom right + paddd xmm0, [esi + edx * 4] + paddd xmm1, [esi + edx * 4 + 16] + paddd xmm2, [esi + edx * 4 + 32] + paddd xmm3, [esi + edx * 4 + 48] + lea esi, [esi + 64] + + cvtdq2ps xmm0, xmm0 // Average = Sum * 1 / Area + cvtdq2ps xmm1, xmm1 + mulps xmm0, xmm4 + mulps xmm1, xmm4 + cvtdq2ps xmm2, xmm2 + cvtdq2ps xmm3, xmm3 + mulps xmm2, xmm4 + mulps xmm3, xmm4 + cvtps2dq xmm0, xmm0 + cvtps2dq xmm1, xmm1 + cvtps2dq xmm2, xmm2 + cvtps2dq xmm3, xmm3 + packssdw xmm0, xmm1 + packssdw xmm2, xmm3 + packuswb xmm0, xmm2 + movdqu [edi], xmm0 + lea edi, [edi + 16] + sub ecx, 4 + jge l4 + + l4b: + add ecx, 4 - 1 + jl l1b + + // 1 pixel loop + l1: + movdqu xmm0, [eax] + psubd xmm0, [eax + edx * 4] + lea eax, [eax + 16] + psubd xmm0, [esi] + paddd xmm0, [esi + edx * 4] + lea esi, [esi + 16] + cvtdq2ps xmm0, xmm0 + mulps xmm0, xmm4 + cvtps2dq xmm0, xmm0 + packssdw xmm0, xmm0 + packuswb xmm0, xmm0 + movd dword ptr [edi], xmm0 + lea edi, [edi + 4] + sub ecx, 1 + jge l1 + l1b: + } +} +#endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 + +#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2 +// Creates a table of cumulative sums where each value is a sum of all values +// above and to the left of the value. +void ComputeCumulativeSumRow_SSE2(const uint8_t* row, + int32_t* cumsum, + const int32_t* previous_cumsum, + int width) { + __asm { + mov eax, row + mov edx, cumsum + mov esi, previous_cumsum + mov ecx, width + pxor xmm0, xmm0 + pxor xmm1, xmm1 + + sub ecx, 4 + jl l4b + test edx, 15 + jne l4b + + // 4 pixel loop + l4: + movdqu xmm2, [eax] // 4 argb pixels 16 bytes. + lea eax, [eax + 16] + movdqa xmm4, xmm2 + + punpcklbw xmm2, xmm1 + movdqa xmm3, xmm2 + punpcklwd xmm2, xmm1 + punpckhwd xmm3, xmm1 + + punpckhbw xmm4, xmm1 + movdqa xmm5, xmm4 + punpcklwd xmm4, xmm1 + punpckhwd xmm5, xmm1 + + paddd xmm0, xmm2 + movdqu xmm2, [esi] // previous row above. 
+ paddd xmm2, xmm0 + + paddd xmm0, xmm3 + movdqu xmm3, [esi + 16] + paddd xmm3, xmm0 + + paddd xmm0, xmm4 + movdqu xmm4, [esi + 32] + paddd xmm4, xmm0 + + paddd xmm0, xmm5 + movdqu xmm5, [esi + 48] + lea esi, [esi + 64] + paddd xmm5, xmm0 + + movdqu [edx], xmm2 + movdqu [edx + 16], xmm3 + movdqu [edx + 32], xmm4 + movdqu [edx + 48], xmm5 + + lea edx, [edx + 64] + sub ecx, 4 + jge l4 + + l4b: + add ecx, 4 - 1 + jl l1b + + // 1 pixel loop + l1: + movd xmm2, dword ptr [eax] // 1 argb pixel 4 bytes. + lea eax, [eax + 4] + punpcklbw xmm2, xmm1 + punpcklwd xmm2, xmm1 + paddd xmm0, xmm2 + movdqu xmm2, [esi] + lea esi, [esi + 16] + paddd xmm2, xmm0 + movdqu [edx], xmm2 + lea edx, [edx + 16] + sub ecx, 1 + jge l1 + + l1b: + } +} +#endif // HAS_COMPUTECUMULATIVESUMROW_SSE2 + +#ifdef HAS_ARGBAFFINEROW_SSE2 +// Copy ARGB pixels from source image with slope to a row of destination. +__declspec(naked) LIBYUV_API void ARGBAffineRow_SSE2(const uint8_t* src_argb, + int src_argb_stride, + uint8_t* dst_argb, + const float* uv_dudv, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 12] // src_argb + mov esi, [esp + 16] // stride + mov edx, [esp + 20] // dst_argb + mov ecx, [esp + 24] // pointer to uv_dudv + movq xmm2, qword ptr [ecx] // uv + movq xmm7, qword ptr [ecx + 8] // dudv + mov ecx, [esp + 28] // width + shl esi, 16 // 4, stride + add esi, 4 + movd xmm5, esi + sub ecx, 4 + jl l4b + + // setup for 4 pixel loop + pshufd xmm7, xmm7, 0x44 // dup dudv + pshufd xmm5, xmm5, 0 // dup 4, stride + movdqa xmm0, xmm2 // x0, y0, x1, y1 + addps xmm0, xmm7 + movlhps xmm2, xmm0 + movdqa xmm4, xmm7 + addps xmm4, xmm4 // dudv *= 2 + movdqa xmm3, xmm2 // x2, y2, x3, y3 + addps xmm3, xmm4 + addps xmm4, xmm4 // dudv *= 4 + + // 4 pixel loop + l4: + cvttps2dq xmm0, xmm2 // x, y float to int first 2 + cvttps2dq xmm1, xmm3 // x, y float to int next 2 + packssdw xmm0, xmm1 // x, y as 8 shorts + pmaddwd xmm0, xmm5 // offsets = x * 4 + y * stride. + movd esi, xmm0 + pshufd xmm0, xmm0, 0x39 // shift right + movd edi, xmm0 + pshufd xmm0, xmm0, 0x39 // shift right + movd xmm1, [eax + esi] // read pixel 0 + movd xmm6, [eax + edi] // read pixel 1 + punpckldq xmm1, xmm6 // combine pixel 0 and 1 + addps xmm2, xmm4 // x, y += dx, dy first 2 + movq qword ptr [edx], xmm1 + movd esi, xmm0 + pshufd xmm0, xmm0, 0x39 // shift right + movd edi, xmm0 + movd xmm6, [eax + esi] // read pixel 2 + movd xmm0, [eax + edi] // read pixel 3 + punpckldq xmm6, xmm0 // combine pixel 2 and 3 + addps xmm3, xmm4 // x, y += dx, dy next 2 + movq qword ptr 8[edx], xmm6 + lea edx, [edx + 16] + sub ecx, 4 + jge l4 + + l4b: + add ecx, 4 - 1 + jl l1b + + // 1 pixel loop + l1: + cvttps2dq xmm0, xmm2 // x, y float to int + packssdw xmm0, xmm0 // x, y as shorts + pmaddwd xmm0, xmm5 // offset = x * 4 + y * stride + addps xmm2, xmm7 // x, y += dx, dy + movd esi, xmm0 + movd xmm0, [eax + esi] // copy a pixel + movd [edx], xmm0 + lea edx, [edx + 4] + sub ecx, 1 + jge l1 + l1b: + pop edi + pop esi + ret + } +} +#endif // HAS_ARGBAFFINEROW_SSE2 + +#ifdef HAS_INTERPOLATEROW_AVX2 +// Bilinear filter 32x2 -> 32x1 +__declspec(naked) void InterpolateRow_AVX2(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride, + int dst_width, + int source_y_fraction) { + __asm { + push esi + push edi + mov edi, [esp + 8 + 4] // dst_ptr + mov esi, [esp + 8 + 8] // src_ptr + mov edx, [esp + 8 + 12] // src_stride + mov ecx, [esp + 8 + 16] // dst_width + mov eax, [esp + 8 + 20] // source_y_fraction (0..255) + // Dispatch to specialized filters if applicable. 
+ cmp eax, 0 + je xloop100 // 0 / 256. Blend 100 / 0. + sub edi, esi + cmp eax, 128 + je xloop50 // 128 /256 is 0.50. Blend 50 / 50. + + vmovd xmm0, eax // high fraction 0..255 + neg eax + add eax, 256 + vmovd xmm5, eax // low fraction 256..1 + vpunpcklbw xmm5, xmm5, xmm0 + vpunpcklwd xmm5, xmm5, xmm5 + vbroadcastss ymm5, xmm5 + + mov eax, 0x80808080 // 128b for bias and rounding. + vmovd xmm4, eax + vbroadcastss ymm4, xmm4 + + xloop: + vmovdqu ymm0, [esi] + vmovdqu ymm2, [esi + edx] + vpunpckhbw ymm1, ymm0, ymm2 // mutates + vpunpcklbw ymm0, ymm0, ymm2 + vpsubb ymm1, ymm1, ymm4 // bias to signed image + vpsubb ymm0, ymm0, ymm4 + vpmaddubsw ymm1, ymm5, ymm1 + vpmaddubsw ymm0, ymm5, ymm0 + vpaddw ymm1, ymm1, ymm4 // unbias and round + vpaddw ymm0, ymm0, ymm4 + vpsrlw ymm1, ymm1, 8 + vpsrlw ymm0, ymm0, 8 + vpackuswb ymm0, ymm0, ymm1 // unmutates + vmovdqu [esi + edi], ymm0 + lea esi, [esi + 32] + sub ecx, 32 + jg xloop + jmp xloop99 + + // Blend 50 / 50. + xloop50: + vmovdqu ymm0, [esi] + vpavgb ymm0, ymm0, [esi + edx] + vmovdqu [esi + edi], ymm0 + lea esi, [esi + 32] + sub ecx, 32 + jg xloop50 + jmp xloop99 + + // Blend 100 / 0 - Copy row unchanged. + xloop100: + rep movsb + + xloop99: + pop edi + pop esi + vzeroupper + ret + } +} +#endif // HAS_INTERPOLATEROW_AVX2 + +// Bilinear filter 16x2 -> 16x1 +// TODO(fbarchard): Consider allowing 256 using memcpy. +__declspec(naked) void InterpolateRow_SSSE3(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride, + int dst_width, + int source_y_fraction) { + __asm { + push esi + push edi + + mov edi, [esp + 8 + 4] // dst_ptr + mov esi, [esp + 8 + 8] // src_ptr + mov edx, [esp + 8 + 12] // src_stride + mov ecx, [esp + 8 + 16] // dst_width + mov eax, [esp + 8 + 20] // source_y_fraction (0..255) + sub edi, esi + // Dispatch to specialized filters if applicable. + cmp eax, 0 + je xloop100 // 0 /256. Blend 100 / 0. + cmp eax, 128 + je xloop50 // 128 / 256 is 0.50. Blend 50 / 50. + + movd xmm0, eax // high fraction 0..255 + neg eax + add eax, 256 + movd xmm5, eax // low fraction 255..1 + punpcklbw xmm5, xmm0 + punpcklwd xmm5, xmm5 + pshufd xmm5, xmm5, 0 + mov eax, 0x80808080 // 128 for biasing image to signed. + movd xmm4, eax + pshufd xmm4, xmm4, 0x00 + + xloop: + movdqu xmm0, [esi] + movdqu xmm2, [esi + edx] + movdqu xmm1, xmm0 + punpcklbw xmm0, xmm2 + punpckhbw xmm1, xmm2 + psubb xmm0, xmm4 // bias image by -128 + psubb xmm1, xmm4 + movdqa xmm2, xmm5 + movdqa xmm3, xmm5 + pmaddubsw xmm2, xmm0 + pmaddubsw xmm3, xmm1 + paddw xmm2, xmm4 + paddw xmm3, xmm4 + psrlw xmm2, 8 + psrlw xmm3, 8 + packuswb xmm2, xmm3 + movdqu [esi + edi], xmm2 + lea esi, [esi + 16] + sub ecx, 16 + jg xloop + jmp xloop99 + + // Blend 50 / 50. + xloop50: + movdqu xmm0, [esi] + movdqu xmm1, [esi + edx] + pavgb xmm0, xmm1 + movdqu [esi + edi], xmm0 + lea esi, [esi + 16] + sub ecx, 16 + jg xloop50 + jmp xloop99 + + // Blend 100 / 0 - Copy row unchanged. + xloop100: + movdqu xmm0, [esi] + movdqu [esi + edi], xmm0 + lea esi, [esi + 16] + sub ecx, 16 + jg xloop100 + + xloop99: + pop edi + pop esi + ret + } +} + +// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. 
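ARGBShuffleRow applies the caller's 16-byte pshufb table to every 16-byte (4 pixel) chunk, which is how the channel-order conversions listed above are expressed. A scalar sketch of the same mapping, assuming all table entries are in 0..15 (editor illustration, not part of the patch; the example shuffler value is hypothetical):

static void ARGBShuffleRow_Sketch(const uint8_t* src_argb, uint8_t* dst_argb,
                                  const uint8_t* shuffler, int width) {
  // e.g. an ABGR -> ARGB style shuffler that swaps bytes 0 and 2 of each pixel:
  //   {2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15}
  int i;
  for (i = 0; i < width * 4; ++i) {
    dst_argb[i] = src_argb[(i & ~15) + shuffler[i & 15]];  // pshufb within 16-byte blocks
  }
}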
+__declspec(naked) void ARGBShuffleRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_argb, + const uint8_t* shuffler, + int width) { + __asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // shuffler + movdqu xmm5, [ecx] + mov ecx, [esp + 16] // width + + wloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + lea eax, [eax + 32] + pshufb xmm0, xmm5 + pshufb xmm1, xmm5 + movdqu [edx], xmm0 + movdqu [edx + 16], xmm1 + lea edx, [edx + 32] + sub ecx, 8 + jg wloop + ret + } +} + +#ifdef HAS_ARGBSHUFFLEROW_AVX2 +__declspec(naked) void ARGBShuffleRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_argb, + const uint8_t* shuffler, + int width) { + __asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // shuffler + vbroadcastf128 ymm5, [ecx] // same shuffle in high as low. + mov ecx, [esp + 16] // width + + wloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + lea eax, [eax + 64] + vpshufb ymm0, ymm0, ymm5 + vpshufb ymm1, ymm1, ymm5 + vmovdqu [edx], ymm0 + vmovdqu [edx + 32], ymm1 + lea edx, [edx + 64] + sub ecx, 16 + jg wloop + + vzeroupper + ret + } +} +#endif // HAS_ARGBSHUFFLEROW_AVX2 + +// YUY2 - Macro-pixel = 2 image pixels +// Y0U0Y1V0....Y2U2Y3V2...Y4U4Y5V4.... + +// UYVY - Macro-pixel = 2 image pixels +// U0Y0V0Y1 + +__declspec(naked) void I422ToYUY2Row_SSE2(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_frame, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_y + mov esi, [esp + 8 + 8] // src_u + mov edx, [esp + 8 + 12] // src_v + mov edi, [esp + 8 + 16] // dst_frame + mov ecx, [esp + 8 + 20] // width + sub edx, esi + + convertloop: + movq xmm2, qword ptr [esi] // U + movq xmm3, qword ptr [esi + edx] // V + lea esi, [esi + 8] + punpcklbw xmm2, xmm3 // UV + movdqu xmm0, [eax] // Y + lea eax, [eax + 16] + movdqa xmm1, xmm0 + punpcklbw xmm0, xmm2 // YUYV + punpckhbw xmm1, xmm2 + movdqu [edi], xmm0 + movdqu [edi + 16], xmm1 + lea edi, [edi + 32] + sub ecx, 16 + jg convertloop + + pop edi + pop esi + ret + } +} + +__declspec(naked) void I422ToUYVYRow_SSE2(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_frame, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_y + mov esi, [esp + 8 + 8] // src_u + mov edx, [esp + 8 + 12] // src_v + mov edi, [esp + 8 + 16] // dst_frame + mov ecx, [esp + 8 + 20] // width + sub edx, esi + + convertloop: + movq xmm2, qword ptr [esi] // U + movq xmm3, qword ptr [esi + edx] // V + lea esi, [esi + 8] + punpcklbw xmm2, xmm3 // UV + movdqu xmm0, [eax] // Y + movdqa xmm1, xmm2 + lea eax, [eax + 16] + punpcklbw xmm1, xmm0 // UYVY + punpckhbw xmm2, xmm0 + movdqu [edi], xmm1 + movdqu [edi + 16], xmm2 + lea edi, [edi + 32] + sub ecx, 16 + jg convertloop + + pop edi + pop esi + ret + } +} + +#ifdef HAS_ARGBPOLYNOMIALROW_SSE2 +__declspec(naked) void ARGBPolynomialRow_SSE2(const uint8_t* src_argb, + uint8_t* dst_argb, + const float* poly, + int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] /* src_argb */ + mov edx, [esp + 4 + 8] /* dst_argb */ + mov esi, [esp + 4 + 12] /* poly */ + mov ecx, [esp + 4 + 16] /* width */ + pxor xmm3, xmm3 // 0 constant for zero extending bytes to ints. + + // 2 pixel loop. 
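  // [Editor's note, not part of the patch] poly holds four 4-float vectors
  // C0..C3, one coefficient per channel. Each source byte x is mapped to
  //   C0 + C1 * x + C2 * x * x + C3 * x * x * x
  // in float, then truncated (cvttps2dq) and saturated back to 0..255.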
+ convertloop: + // pmovzxbd xmm0, dword ptr [eax] // BGRA pixel + // pmovzxbd xmm4, dword ptr [eax + 4] // BGRA pixel + movq xmm0, qword ptr [eax] // BGRABGRA + lea eax, [eax + 8] + punpcklbw xmm0, xmm3 + movdqa xmm4, xmm0 + punpcklwd xmm0, xmm3 // pixel 0 + punpckhwd xmm4, xmm3 // pixel 1 + cvtdq2ps xmm0, xmm0 // 4 floats + cvtdq2ps xmm4, xmm4 + movdqa xmm1, xmm0 // X + movdqa xmm5, xmm4 + mulps xmm0, [esi + 16] // C1 * X + mulps xmm4, [esi + 16] + addps xmm0, [esi] // result = C0 + C1 * X + addps xmm4, [esi] + movdqa xmm2, xmm1 + movdqa xmm6, xmm5 + mulps xmm2, xmm1 // X * X + mulps xmm6, xmm5 + mulps xmm1, xmm2 // X * X * X + mulps xmm5, xmm6 + mulps xmm2, [esi + 32] // C2 * X * X + mulps xmm6, [esi + 32] + mulps xmm1, [esi + 48] // C3 * X * X * X + mulps xmm5, [esi + 48] + addps xmm0, xmm2 // result += C2 * X * X + addps xmm4, xmm6 + addps xmm0, xmm1 // result += C3 * X * X * X + addps xmm4, xmm5 + cvttps2dq xmm0, xmm0 + cvttps2dq xmm4, xmm4 + packuswb xmm0, xmm4 + packuswb xmm0, xmm0 + movq qword ptr [edx], xmm0 + lea edx, [edx + 8] + sub ecx, 2 + jg convertloop + pop esi + ret + } +} +#endif // HAS_ARGBPOLYNOMIALROW_SSE2 + +#ifdef HAS_ARGBPOLYNOMIALROW_AVX2 +__declspec(naked) void ARGBPolynomialRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_argb, + const float* poly, + int width) { + __asm { + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_argb */ + mov ecx, [esp + 12] /* poly */ + vbroadcastf128 ymm4, [ecx] // C0 + vbroadcastf128 ymm5, [ecx + 16] // C1 + vbroadcastf128 ymm6, [ecx + 32] // C2 + vbroadcastf128 ymm7, [ecx + 48] // C3 + mov ecx, [esp + 16] /* width */ + + // 2 pixel loop. + convertloop: + vpmovzxbd ymm0, qword ptr [eax] // 2 BGRA pixels + lea eax, [eax + 8] + vcvtdq2ps ymm0, ymm0 // X 8 floats + vmulps ymm2, ymm0, ymm0 // X * X + vmulps ymm3, ymm0, ymm7 // C3 * X + vfmadd132ps ymm0, ymm4, ymm5 // result = C0 + C1 * X + vfmadd231ps ymm0, ymm2, ymm6 // result += C2 * X * X + vfmadd231ps ymm0, ymm2, ymm3 // result += C3 * X * X * X + vcvttps2dq ymm0, ymm0 + vpackusdw ymm0, ymm0, ymm0 // b0g0r0a0_00000000_b0g0r0a0_00000000 + vpermq ymm0, ymm0, 0xd8 // b0g0r0a0_b0g0r0a0_00000000_00000000 + vpackuswb xmm0, xmm0, xmm0 // bgrabgra_00000000_00000000_00000000 + vmovq qword ptr [edx], xmm0 + lea edx, [edx + 8] + sub ecx, 2 + jg convertloop + vzeroupper + ret + } +} +#endif // HAS_ARGBPOLYNOMIALROW_AVX2 + +#ifdef HAS_HALFFLOATROW_SSE2 +static float kExpBias = 1.9259299444e-34f; +__declspec(naked) void HalfFloatRow_SSE2(const uint16_t* src, + uint16_t* dst, + float scale, + int width) { + __asm { + mov eax, [esp + 4] /* src */ + mov edx, [esp + 8] /* dst */ + movd xmm4, dword ptr [esp + 12] /* scale */ + mov ecx, [esp + 16] /* width */ + mulss xmm4, kExpBias + pshufd xmm4, xmm4, 0 + pxor xmm5, xmm5 + sub edx, eax + + // 8 pixel loop. 
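  // [Editor's note, not part of the patch] kExpBias is 2^-112, so multiplying
  // by scale * 2^-112 re-biases the single-precision exponent from 127 down to
  // 15 (the half-float bias); psrld by 13 then drops the low 13 mantissa bits,
  // leaving each lane's low 16 bits as a truncated IEEE half-float encoding.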
+ convertloop: + movdqu xmm2, xmmword ptr [eax] // 8 shorts + add eax, 16 + movdqa xmm3, xmm2 + punpcklwd xmm2, xmm5 + cvtdq2ps xmm2, xmm2 // convert 8 ints to floats + punpckhwd xmm3, xmm5 + cvtdq2ps xmm3, xmm3 + mulps xmm2, xmm4 + mulps xmm3, xmm4 + psrld xmm2, 13 + psrld xmm3, 13 + packssdw xmm2, xmm3 + movdqu [eax + edx - 16], xmm2 + sub ecx, 8 + jg convertloop + ret + } +} +#endif // HAS_HALFFLOATROW_SSE2 + +#ifdef HAS_HALFFLOATROW_AVX2 +__declspec(naked) void HalfFloatRow_AVX2(const uint16_t* src, + uint16_t* dst, + float scale, + int width) { + __asm { + mov eax, [esp + 4] /* src */ + mov edx, [esp + 8] /* dst */ + movd xmm4, dword ptr [esp + 12] /* scale */ + mov ecx, [esp + 16] /* width */ + + vmulss xmm4, xmm4, kExpBias + vbroadcastss ymm4, xmm4 + vpxor ymm5, ymm5, ymm5 + sub edx, eax + + // 16 pixel loop. + convertloop: + vmovdqu ymm2, [eax] // 16 shorts + add eax, 32 + vpunpckhwd ymm3, ymm2, ymm5 // convert 16 shorts to 16 ints + vpunpcklwd ymm2, ymm2, ymm5 + vcvtdq2ps ymm3, ymm3 // convert 16 ints to floats + vcvtdq2ps ymm2, ymm2 + vmulps ymm3, ymm3, ymm4 // scale to adjust exponent for 5 bit range. + vmulps ymm2, ymm2, ymm4 + vpsrld ymm3, ymm3, 13 // float convert to 8 half floats truncate + vpsrld ymm2, ymm2, 13 + vpackssdw ymm2, ymm2, ymm3 + vmovdqu [eax + edx - 32], ymm2 + sub ecx, 16 + jg convertloop + vzeroupper + ret + } +} +#endif // HAS_HALFFLOATROW_AVX2 + +#ifdef HAS_HALFFLOATROW_F16C +__declspec(naked) void HalfFloatRow_F16C(const uint16_t* src, + uint16_t* dst, + float scale, + int width) { + __asm { + mov eax, [esp + 4] /* src */ + mov edx, [esp + 8] /* dst */ + vbroadcastss ymm4, [esp + 12] /* scale */ + mov ecx, [esp + 16] /* width */ + sub edx, eax + + // 16 pixel loop. + convertloop: + vpmovzxwd ymm2, xmmword ptr [eax] // 8 shorts -> 8 ints + vpmovzxwd ymm3, xmmword ptr [eax + 16] // 8 more shorts + add eax, 32 + vcvtdq2ps ymm2, ymm2 // convert 8 ints to floats + vcvtdq2ps ymm3, ymm3 + vmulps ymm2, ymm2, ymm4 // scale to normalized range 0 to 1 + vmulps ymm3, ymm3, ymm4 + vcvtps2ph xmm2, ymm2, 3 // float convert to 8 half floats truncate + vcvtps2ph xmm3, ymm3, 3 + vmovdqu [eax + edx + 32], xmm2 + vmovdqu [eax + edx + 32 + 16], xmm3 + sub ecx, 16 + jg convertloop + vzeroupper + ret + } +} +#endif // HAS_HALFFLOATROW_F16C + +#ifdef HAS_ARGBCOLORTABLEROW_X86 +// Tranform ARGB pixels with color table. +__declspec(naked) void ARGBColorTableRow_X86(uint8_t* dst_argb, + const uint8_t* table_argb, + int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] /* dst_argb */ + mov esi, [esp + 4 + 8] /* table_argb */ + mov ecx, [esp + 4 + 12] /* width */ + + // 1 pixel loop. + convertloop: + movzx edx, byte ptr [eax] + lea eax, [eax + 4] + movzx edx, byte ptr [esi + edx * 4] + mov byte ptr [eax - 4], dl + movzx edx, byte ptr [eax - 4 + 1] + movzx edx, byte ptr [esi + edx * 4 + 1] + mov byte ptr [eax - 4 + 1], dl + movzx edx, byte ptr [eax - 4 + 2] + movzx edx, byte ptr [esi + edx * 4 + 2] + mov byte ptr [eax - 4 + 2], dl + movzx edx, byte ptr [eax - 4 + 3] + movzx edx, byte ptr [esi + edx * 4 + 3] + mov byte ptr [eax - 4 + 3], dl + dec ecx + jg convertloop + pop esi + ret + } +} +#endif // HAS_ARGBCOLORTABLEROW_X86 + +#ifdef HAS_RGBCOLORTABLEROW_X86 +// Tranform RGB pixels with color table. +__declspec(naked) void RGBColorTableRow_X86(uint8_t* dst_argb, + const uint8_t* table_argb, + int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] /* dst_argb */ + mov esi, [esp + 4 + 8] /* table_argb */ + mov ecx, [esp + 4 + 12] /* width */ + + // 1 pixel loop. 
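  // [Editor's note, not part of the patch] Same per-channel lookup as the ARGB
  // variant above, i.e. dst[c] = table_argb[dst[c] * 4 + c] for channels 0..2,
  // except the alpha byte at offset 3 is left untouched.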
+ convertloop: + movzx edx, byte ptr [eax] + lea eax, [eax + 4] + movzx edx, byte ptr [esi + edx * 4] + mov byte ptr [eax - 4], dl + movzx edx, byte ptr [eax - 4 + 1] + movzx edx, byte ptr [esi + edx * 4 + 1] + mov byte ptr [eax - 4 + 1], dl + movzx edx, byte ptr [eax - 4 + 2] + movzx edx, byte ptr [esi + edx * 4 + 2] + mov byte ptr [eax - 4 + 2], dl + dec ecx + jg convertloop + + pop esi + ret + } +} +#endif // HAS_RGBCOLORTABLEROW_X86 + +#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3 +// Tranform RGB pixels with luma table. +__declspec(naked) void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_argb, + int width, + const uint8_t* luma, + uint32_t lumacoeff) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] /* src_argb */ + mov edi, [esp + 8 + 8] /* dst_argb */ + mov ecx, [esp + 8 + 12] /* width */ + movd xmm2, dword ptr [esp + 8 + 16] // luma table + movd xmm3, dword ptr [esp + 8 + 20] // lumacoeff + pshufd xmm2, xmm2, 0 + pshufd xmm3, xmm3, 0 + pcmpeqb xmm4, xmm4 // generate mask 0xff00ff00 + psllw xmm4, 8 + pxor xmm5, xmm5 + + // 4 pixel loop. + convertloop: + movdqu xmm0, xmmword ptr [eax] // generate luma ptr + pmaddubsw xmm0, xmm3 + phaddw xmm0, xmm0 + pand xmm0, xmm4 // mask out low bits + punpcklwd xmm0, xmm5 + paddd xmm0, xmm2 // add table base + movd esi, xmm0 + pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32 + + movzx edx, byte ptr [eax] + movzx edx, byte ptr [esi + edx] + mov byte ptr [edi], dl + movzx edx, byte ptr [eax + 1] + movzx edx, byte ptr [esi + edx] + mov byte ptr [edi + 1], dl + movzx edx, byte ptr [eax + 2] + movzx edx, byte ptr [esi + edx] + mov byte ptr [edi + 2], dl + movzx edx, byte ptr [eax + 3] // copy alpha. + mov byte ptr [edi + 3], dl + + movd esi, xmm0 + pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32 + + movzx edx, byte ptr [eax + 4] + movzx edx, byte ptr [esi + edx] + mov byte ptr [edi + 4], dl + movzx edx, byte ptr [eax + 5] + movzx edx, byte ptr [esi + edx] + mov byte ptr [edi + 5], dl + movzx edx, byte ptr [eax + 6] + movzx edx, byte ptr [esi + edx] + mov byte ptr [edi + 6], dl + movzx edx, byte ptr [eax + 7] // copy alpha. + mov byte ptr [edi + 7], dl + + movd esi, xmm0 + pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32 + + movzx edx, byte ptr [eax + 8] + movzx edx, byte ptr [esi + edx] + mov byte ptr [edi + 8], dl + movzx edx, byte ptr [eax + 9] + movzx edx, byte ptr [esi + edx] + mov byte ptr [edi + 9], dl + movzx edx, byte ptr [eax + 10] + movzx edx, byte ptr [esi + edx] + mov byte ptr [edi + 10], dl + movzx edx, byte ptr [eax + 11] // copy alpha. + mov byte ptr [edi + 11], dl + + movd esi, xmm0 + + movzx edx, byte ptr [eax + 12] + movzx edx, byte ptr [esi + edx] + mov byte ptr [edi + 12], dl + movzx edx, byte ptr [eax + 13] + movzx edx, byte ptr [esi + edx] + mov byte ptr [edi + 13], dl + movzx edx, byte ptr [eax + 14] + movzx edx, byte ptr [esi + edx] + mov byte ptr [edi + 14], dl + movzx edx, byte ptr [eax + 15] // copy alpha. 
+ mov byte ptr [edi + 15], dl + + lea eax, [eax + 16] + lea edi, [edi + 16] + sub ecx, 4 + jg convertloop + + pop edi + pop esi + ret + } +} +#endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 + +#endif // defined(_M_X64) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64)) diff --git a/media/libyuv/libyuv/source/scale.cc b/media/libyuv/libyuv/source/scale.cc new file mode 100644 index 0000000000..2cfa1c6cb1 --- /dev/null +++ b/media/libyuv/libyuv/source/scale.cc @@ -0,0 +1,1741 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/scale.h" + +#include <assert.h> +#include <string.h> + +#include "libyuv/cpu_id.h" +#include "libyuv/planar_functions.h" // For CopyPlane +#include "libyuv/row.h" +#include "libyuv/scale_row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +static __inline int Abs(int v) { + return v >= 0 ? v : -v; +} + +#define SUBSAMPLE(v, a, s) (v < 0) ? (-((-v + a) >> s)) : ((v + a) >> s) + +// Scale plane, 1/2 +// This is an optimized version for scaling down a plane to 1/2 of +// its original size. + +static void ScalePlaneDown2(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_ptr, + uint8_t* dst_ptr, + enum FilterMode filtering) { + int y; + void (*ScaleRowDown2)(const uint8_t* src_ptr, ptrdiff_t src_stride, + uint8_t* dst_ptr, int dst_width) = + filtering == kFilterNone + ? ScaleRowDown2_C + : (filtering == kFilterLinear ? ScaleRowDown2Linear_C + : ScaleRowDown2Box_C); + int row_stride = src_stride << 1; + (void)src_width; + (void)src_height; + if (!filtering) { + src_ptr += src_stride; // Point to odd rows. + src_stride = 0; + } + +#if defined(HAS_SCALEROWDOWN2_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ScaleRowDown2 = + filtering == kFilterNone + ? ScaleRowDown2_Any_NEON + : (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_NEON + : ScaleRowDown2Box_Any_NEON); + if (IS_ALIGNED(dst_width, 16)) { + ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_NEON + : (filtering == kFilterLinear + ? ScaleRowDown2Linear_NEON + : ScaleRowDown2Box_NEON); + } + } +#endif +#if defined(HAS_SCALEROWDOWN2_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ScaleRowDown2 = + filtering == kFilterNone + ? ScaleRowDown2_Any_SSSE3 + : (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_SSSE3 + : ScaleRowDown2Box_Any_SSSE3); + if (IS_ALIGNED(dst_width, 16)) { + ScaleRowDown2 = + filtering == kFilterNone + ? ScaleRowDown2_SSSE3 + : (filtering == kFilterLinear ? ScaleRowDown2Linear_SSSE3 + : ScaleRowDown2Box_SSSE3); + } + } +#endif +#if defined(HAS_SCALEROWDOWN2_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ScaleRowDown2 = + filtering == kFilterNone + ? ScaleRowDown2_Any_AVX2 + : (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_AVX2 + : ScaleRowDown2Box_Any_AVX2); + if (IS_ALIGNED(dst_width, 32)) { + ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_AVX2 + : (filtering == kFilterLinear + ? 
ScaleRowDown2Linear_AVX2 + : ScaleRowDown2Box_AVX2); + } + } +#endif +#if defined(HAS_SCALEROWDOWN2_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ScaleRowDown2 = + filtering == kFilterNone + ? ScaleRowDown2_Any_MSA + : (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_MSA + : ScaleRowDown2Box_Any_MSA); + if (IS_ALIGNED(dst_width, 32)) { + ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_MSA + : (filtering == kFilterLinear + ? ScaleRowDown2Linear_MSA + : ScaleRowDown2Box_MSA); + } + } +#endif + + if (filtering == kFilterLinear) { + src_stride = 0; + } + // TODO(fbarchard): Loop through source height to allow odd height. + for (y = 0; y < dst_height; ++y) { + ScaleRowDown2(src_ptr, src_stride, dst_ptr, dst_width); + src_ptr += row_stride; + dst_ptr += dst_stride; + } +} + +static void ScalePlaneDown2_16(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16_t* src_ptr, + uint16_t* dst_ptr, + enum FilterMode filtering) { + int y; + void (*ScaleRowDown2)(const uint16_t* src_ptr, ptrdiff_t src_stride, + uint16_t* dst_ptr, int dst_width) = + filtering == kFilterNone + ? ScaleRowDown2_16_C + : (filtering == kFilterLinear ? ScaleRowDown2Linear_16_C + : ScaleRowDown2Box_16_C); + int row_stride = src_stride << 1; + (void)src_width; + (void)src_height; + if (!filtering) { + src_ptr += src_stride; // Point to odd rows. + src_stride = 0; + } + +#if defined(HAS_SCALEROWDOWN2_16_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 16)) { + ScaleRowDown2 = + filtering ? ScaleRowDown2Box_16_NEON : ScaleRowDown2_16_NEON; + } +#endif +#if defined(HAS_SCALEROWDOWN2_16_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 16)) { + ScaleRowDown2 = + filtering == kFilterNone + ? ScaleRowDown2_16_SSE2 + : (filtering == kFilterLinear ? ScaleRowDown2Linear_16_SSE2 + : ScaleRowDown2Box_16_SSE2); + } +#endif + + if (filtering == kFilterLinear) { + src_stride = 0; + } + // TODO(fbarchard): Loop through source height to allow odd height. + for (y = 0; y < dst_height; ++y) { + ScaleRowDown2(src_ptr, src_stride, dst_ptr, dst_width); + src_ptr += row_stride; + dst_ptr += dst_stride; + } +} + +// Scale plane, 1/4 +// This is an optimized version for scaling down a plane to 1/4 of +// its original size. + +static void ScalePlaneDown4(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_ptr, + uint8_t* dst_ptr, + enum FilterMode filtering) { + int y; + void (*ScaleRowDown4)(const uint8_t* src_ptr, ptrdiff_t src_stride, + uint8_t* dst_ptr, int dst_width) = + filtering ? ScaleRowDown4Box_C : ScaleRowDown4_C; + int row_stride = src_stride << 2; + (void)src_width; + (void)src_height; + if (!filtering) { + src_ptr += src_stride * 2; // Point to row 2. + src_stride = 0; + } +#if defined(HAS_SCALEROWDOWN4_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ScaleRowDown4 = + filtering ? ScaleRowDown4Box_Any_NEON : ScaleRowDown4_Any_NEON; + if (IS_ALIGNED(dst_width, 8)) { + ScaleRowDown4 = filtering ? ScaleRowDown4Box_NEON : ScaleRowDown4_NEON; + } + } +#endif +#if defined(HAS_SCALEROWDOWN4_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ScaleRowDown4 = + filtering ? ScaleRowDown4Box_Any_SSSE3 : ScaleRowDown4_Any_SSSE3; + if (IS_ALIGNED(dst_width, 8)) { + ScaleRowDown4 = filtering ? ScaleRowDown4Box_SSSE3 : ScaleRowDown4_SSSE3; + } + } +#endif +#if defined(HAS_SCALEROWDOWN4_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ScaleRowDown4 = + filtering ? 
ScaleRowDown4Box_Any_AVX2 : ScaleRowDown4_Any_AVX2; + if (IS_ALIGNED(dst_width, 16)) { + ScaleRowDown4 = filtering ? ScaleRowDown4Box_AVX2 : ScaleRowDown4_AVX2; + } + } +#endif +#if defined(HAS_SCALEROWDOWN4_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ScaleRowDown4 = + filtering ? ScaleRowDown4Box_Any_MSA : ScaleRowDown4_Any_MSA; + if (IS_ALIGNED(dst_width, 16)) { + ScaleRowDown4 = filtering ? ScaleRowDown4Box_MSA : ScaleRowDown4_MSA; + } + } +#endif + + if (filtering == kFilterLinear) { + src_stride = 0; + } + for (y = 0; y < dst_height; ++y) { + ScaleRowDown4(src_ptr, src_stride, dst_ptr, dst_width); + src_ptr += row_stride; + dst_ptr += dst_stride; + } +} + +static void ScalePlaneDown4_16(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16_t* src_ptr, + uint16_t* dst_ptr, + enum FilterMode filtering) { + int y; + void (*ScaleRowDown4)(const uint16_t* src_ptr, ptrdiff_t src_stride, + uint16_t* dst_ptr, int dst_width) = + filtering ? ScaleRowDown4Box_16_C : ScaleRowDown4_16_C; + int row_stride = src_stride << 2; + (void)src_width; + (void)src_height; + if (!filtering) { + src_ptr += src_stride * 2; // Point to row 2. + src_stride = 0; + } +#if defined(HAS_SCALEROWDOWN4_16_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8)) { + ScaleRowDown4 = + filtering ? ScaleRowDown4Box_16_NEON : ScaleRowDown4_16_NEON; + } +#endif +#if defined(HAS_SCALEROWDOWN4_16_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { + ScaleRowDown4 = + filtering ? ScaleRowDown4Box_16_SSE2 : ScaleRowDown4_16_SSE2; + } +#endif + + if (filtering == kFilterLinear) { + src_stride = 0; + } + for (y = 0; y < dst_height; ++y) { + ScaleRowDown4(src_ptr, src_stride, dst_ptr, dst_width); + src_ptr += row_stride; + dst_ptr += dst_stride; + } +} + +// Scale plane down, 3/4 +static void ScalePlaneDown34(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_ptr, + uint8_t* dst_ptr, + enum FilterMode filtering) { + int y; + void (*ScaleRowDown34_0)(const uint8_t* src_ptr, ptrdiff_t src_stride, + uint8_t* dst_ptr, int dst_width); + void (*ScaleRowDown34_1)(const uint8_t* src_ptr, ptrdiff_t src_stride, + uint8_t* dst_ptr, int dst_width); + const int filter_stride = (filtering == kFilterLinear) ? 
0 : src_stride; + (void)src_width; + (void)src_height; + assert(dst_width % 3 == 0); + if (!filtering) { + ScaleRowDown34_0 = ScaleRowDown34_C; + ScaleRowDown34_1 = ScaleRowDown34_C; + } else { + ScaleRowDown34_0 = ScaleRowDown34_0_Box_C; + ScaleRowDown34_1 = ScaleRowDown34_1_Box_C; + } +#if defined(HAS_SCALEROWDOWN34_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + if (!filtering) { + ScaleRowDown34_0 = ScaleRowDown34_Any_NEON; + ScaleRowDown34_1 = ScaleRowDown34_Any_NEON; + } else { + ScaleRowDown34_0 = ScaleRowDown34_0_Box_Any_NEON; + ScaleRowDown34_1 = ScaleRowDown34_1_Box_Any_NEON; + } + if (dst_width % 24 == 0) { + if (!filtering) { + ScaleRowDown34_0 = ScaleRowDown34_NEON; + ScaleRowDown34_1 = ScaleRowDown34_NEON; + } else { + ScaleRowDown34_0 = ScaleRowDown34_0_Box_NEON; + ScaleRowDown34_1 = ScaleRowDown34_1_Box_NEON; + } + } + } +#endif +#if defined(HAS_SCALEROWDOWN34_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + if (!filtering) { + ScaleRowDown34_0 = ScaleRowDown34_Any_MSA; + ScaleRowDown34_1 = ScaleRowDown34_Any_MSA; + } else { + ScaleRowDown34_0 = ScaleRowDown34_0_Box_Any_MSA; + ScaleRowDown34_1 = ScaleRowDown34_1_Box_Any_MSA; + } + if (dst_width % 48 == 0) { + if (!filtering) { + ScaleRowDown34_0 = ScaleRowDown34_MSA; + ScaleRowDown34_1 = ScaleRowDown34_MSA; + } else { + ScaleRowDown34_0 = ScaleRowDown34_0_Box_MSA; + ScaleRowDown34_1 = ScaleRowDown34_1_Box_MSA; + } + } + } +#endif +#if defined(HAS_SCALEROWDOWN34_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + if (!filtering) { + ScaleRowDown34_0 = ScaleRowDown34_Any_SSSE3; + ScaleRowDown34_1 = ScaleRowDown34_Any_SSSE3; + } else { + ScaleRowDown34_0 = ScaleRowDown34_0_Box_Any_SSSE3; + ScaleRowDown34_1 = ScaleRowDown34_1_Box_Any_SSSE3; + } + if (dst_width % 24 == 0) { + if (!filtering) { + ScaleRowDown34_0 = ScaleRowDown34_SSSE3; + ScaleRowDown34_1 = ScaleRowDown34_SSSE3; + } else { + ScaleRowDown34_0 = ScaleRowDown34_0_Box_SSSE3; + ScaleRowDown34_1 = ScaleRowDown34_1_Box_SSSE3; + } + } + } +#endif + + for (y = 0; y < dst_height - 2; y += 3) { + ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width); + src_ptr += src_stride; + dst_ptr += dst_stride; + ScaleRowDown34_1(src_ptr, filter_stride, dst_ptr, dst_width); + src_ptr += src_stride; + dst_ptr += dst_stride; + ScaleRowDown34_0(src_ptr + src_stride, -filter_stride, dst_ptr, dst_width); + src_ptr += src_stride * 2; + dst_ptr += dst_stride; + } + + // Remainder 1 or 2 rows with last row vertically unfiltered + if ((dst_height % 3) == 2) { + ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width); + src_ptr += src_stride; + dst_ptr += dst_stride; + ScaleRowDown34_1(src_ptr, 0, dst_ptr, dst_width); + } else if ((dst_height % 3) == 1) { + ScaleRowDown34_0(src_ptr, 0, dst_ptr, dst_width); + } +} + +static void ScalePlaneDown34_16(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16_t* src_ptr, + uint16_t* dst_ptr, + enum FilterMode filtering) { + int y; + void (*ScaleRowDown34_0)(const uint16_t* src_ptr, ptrdiff_t src_stride, + uint16_t* dst_ptr, int dst_width); + void (*ScaleRowDown34_1)(const uint16_t* src_ptr, ptrdiff_t src_stride, + uint16_t* dst_ptr, int dst_width); + const int filter_stride = (filtering == kFilterLinear) ? 
0 : src_stride; + (void)src_width; + (void)src_height; + assert(dst_width % 3 == 0); + if (!filtering) { + ScaleRowDown34_0 = ScaleRowDown34_16_C; + ScaleRowDown34_1 = ScaleRowDown34_16_C; + } else { + ScaleRowDown34_0 = ScaleRowDown34_0_Box_16_C; + ScaleRowDown34_1 = ScaleRowDown34_1_Box_16_C; + } +#if defined(HAS_SCALEROWDOWN34_16_NEON) + if (TestCpuFlag(kCpuHasNEON) && (dst_width % 24 == 0)) { + if (!filtering) { + ScaleRowDown34_0 = ScaleRowDown34_16_NEON; + ScaleRowDown34_1 = ScaleRowDown34_16_NEON; + } else { + ScaleRowDown34_0 = ScaleRowDown34_0_Box_16_NEON; + ScaleRowDown34_1 = ScaleRowDown34_1_Box_16_NEON; + } + } +#endif +#if defined(HAS_SCALEROWDOWN34_16_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0)) { + if (!filtering) { + ScaleRowDown34_0 = ScaleRowDown34_16_SSSE3; + ScaleRowDown34_1 = ScaleRowDown34_16_SSSE3; + } else { + ScaleRowDown34_0 = ScaleRowDown34_0_Box_16_SSSE3; + ScaleRowDown34_1 = ScaleRowDown34_1_Box_16_SSSE3; + } + } +#endif + + for (y = 0; y < dst_height - 2; y += 3) { + ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width); + src_ptr += src_stride; + dst_ptr += dst_stride; + ScaleRowDown34_1(src_ptr, filter_stride, dst_ptr, dst_width); + src_ptr += src_stride; + dst_ptr += dst_stride; + ScaleRowDown34_0(src_ptr + src_stride, -filter_stride, dst_ptr, dst_width); + src_ptr += src_stride * 2; + dst_ptr += dst_stride; + } + + // Remainder 1 or 2 rows with last row vertically unfiltered + if ((dst_height % 3) == 2) { + ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width); + src_ptr += src_stride; + dst_ptr += dst_stride; + ScaleRowDown34_1(src_ptr, 0, dst_ptr, dst_width); + } else if ((dst_height % 3) == 1) { + ScaleRowDown34_0(src_ptr, 0, dst_ptr, dst_width); + } +} + +// Scale plane, 3/8 +// This is an optimized version for scaling down a plane to 3/8 +// of its original size. +// +// Uses box filter arranges like this +// aaabbbcc -> abc +// aaabbbcc def +// aaabbbcc ghi +// dddeeeff +// dddeeeff +// dddeeeff +// ggghhhii +// ggghhhii +// Boxes are 3x3, 2x3, 3x2 and 2x2 + +static void ScalePlaneDown38(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_ptr, + uint8_t* dst_ptr, + enum FilterMode filtering) { + int y; + void (*ScaleRowDown38_3)(const uint8_t* src_ptr, ptrdiff_t src_stride, + uint8_t* dst_ptr, int dst_width); + void (*ScaleRowDown38_2)(const uint8_t* src_ptr, ptrdiff_t src_stride, + uint8_t* dst_ptr, int dst_width); + const int filter_stride = (filtering == kFilterLinear) ? 
0 : src_stride; + assert(dst_width % 3 == 0); + (void)src_width; + (void)src_height; + if (!filtering) { + ScaleRowDown38_3 = ScaleRowDown38_C; + ScaleRowDown38_2 = ScaleRowDown38_C; + } else { + ScaleRowDown38_3 = ScaleRowDown38_3_Box_C; + ScaleRowDown38_2 = ScaleRowDown38_2_Box_C; + } + +#if defined(HAS_SCALEROWDOWN38_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + if (!filtering) { + ScaleRowDown38_3 = ScaleRowDown38_Any_NEON; + ScaleRowDown38_2 = ScaleRowDown38_Any_NEON; + } else { + ScaleRowDown38_3 = ScaleRowDown38_3_Box_Any_NEON; + ScaleRowDown38_2 = ScaleRowDown38_2_Box_Any_NEON; + } + if (dst_width % 12 == 0) { + if (!filtering) { + ScaleRowDown38_3 = ScaleRowDown38_NEON; + ScaleRowDown38_2 = ScaleRowDown38_NEON; + } else { + ScaleRowDown38_3 = ScaleRowDown38_3_Box_NEON; + ScaleRowDown38_2 = ScaleRowDown38_2_Box_NEON; + } + } + } +#endif +#if defined(HAS_SCALEROWDOWN38_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + if (!filtering) { + ScaleRowDown38_3 = ScaleRowDown38_Any_SSSE3; + ScaleRowDown38_2 = ScaleRowDown38_Any_SSSE3; + } else { + ScaleRowDown38_3 = ScaleRowDown38_3_Box_Any_SSSE3; + ScaleRowDown38_2 = ScaleRowDown38_2_Box_Any_SSSE3; + } + if (dst_width % 12 == 0 && !filtering) { + ScaleRowDown38_3 = ScaleRowDown38_SSSE3; + ScaleRowDown38_2 = ScaleRowDown38_SSSE3; + } + if (dst_width % 6 == 0 && filtering) { + ScaleRowDown38_3 = ScaleRowDown38_3_Box_SSSE3; + ScaleRowDown38_2 = ScaleRowDown38_2_Box_SSSE3; + } + } +#endif +#if defined(HAS_SCALEROWDOWN38_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + if (!filtering) { + ScaleRowDown38_3 = ScaleRowDown38_Any_MSA; + ScaleRowDown38_2 = ScaleRowDown38_Any_MSA; + } else { + ScaleRowDown38_3 = ScaleRowDown38_3_Box_Any_MSA; + ScaleRowDown38_2 = ScaleRowDown38_2_Box_Any_MSA; + } + if (dst_width % 12 == 0) { + if (!filtering) { + ScaleRowDown38_3 = ScaleRowDown38_MSA; + ScaleRowDown38_2 = ScaleRowDown38_MSA; + } else { + ScaleRowDown38_3 = ScaleRowDown38_3_Box_MSA; + ScaleRowDown38_2 = ScaleRowDown38_2_Box_MSA; + } + } + } +#endif + + for (y = 0; y < dst_height - 2; y += 3) { + ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width); + src_ptr += src_stride * 3; + dst_ptr += dst_stride; + ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width); + src_ptr += src_stride * 3; + dst_ptr += dst_stride; + ScaleRowDown38_2(src_ptr, filter_stride, dst_ptr, dst_width); + src_ptr += src_stride * 2; + dst_ptr += dst_stride; + } + + // Remainder 1 or 2 rows with last row vertically unfiltered + if ((dst_height % 3) == 2) { + ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width); + src_ptr += src_stride * 3; + dst_ptr += dst_stride; + ScaleRowDown38_3(src_ptr, 0, dst_ptr, dst_width); + } else if ((dst_height % 3) == 1) { + ScaleRowDown38_3(src_ptr, 0, dst_ptr, dst_width); + } +} + +static void ScalePlaneDown38_16(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16_t* src_ptr, + uint16_t* dst_ptr, + enum FilterMode filtering) { + int y; + void (*ScaleRowDown38_3)(const uint16_t* src_ptr, ptrdiff_t src_stride, + uint16_t* dst_ptr, int dst_width); + void (*ScaleRowDown38_2)(const uint16_t* src_ptr, ptrdiff_t src_stride, + uint16_t* dst_ptr, int dst_width); + const int filter_stride = (filtering == kFilterLinear) ? 
0 : src_stride; + (void)src_width; + (void)src_height; + assert(dst_width % 3 == 0); + if (!filtering) { + ScaleRowDown38_3 = ScaleRowDown38_16_C; + ScaleRowDown38_2 = ScaleRowDown38_16_C; + } else { + ScaleRowDown38_3 = ScaleRowDown38_3_Box_16_C; + ScaleRowDown38_2 = ScaleRowDown38_2_Box_16_C; + } +#if defined(HAS_SCALEROWDOWN38_16_NEON) + if (TestCpuFlag(kCpuHasNEON) && (dst_width % 12 == 0)) { + if (!filtering) { + ScaleRowDown38_3 = ScaleRowDown38_16_NEON; + ScaleRowDown38_2 = ScaleRowDown38_16_NEON; + } else { + ScaleRowDown38_3 = ScaleRowDown38_3_Box_16_NEON; + ScaleRowDown38_2 = ScaleRowDown38_2_Box_16_NEON; + } + } +#endif +#if defined(HAS_SCALEROWDOWN38_16_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0)) { + if (!filtering) { + ScaleRowDown38_3 = ScaleRowDown38_16_SSSE3; + ScaleRowDown38_2 = ScaleRowDown38_16_SSSE3; + } else { + ScaleRowDown38_3 = ScaleRowDown38_3_Box_16_SSSE3; + ScaleRowDown38_2 = ScaleRowDown38_2_Box_16_SSSE3; + } + } +#endif + + for (y = 0; y < dst_height - 2; y += 3) { + ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width); + src_ptr += src_stride * 3; + dst_ptr += dst_stride; + ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width); + src_ptr += src_stride * 3; + dst_ptr += dst_stride; + ScaleRowDown38_2(src_ptr, filter_stride, dst_ptr, dst_width); + src_ptr += src_stride * 2; + dst_ptr += dst_stride; + } + + // Remainder 1 or 2 rows with last row vertically unfiltered + if ((dst_height % 3) == 2) { + ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width); + src_ptr += src_stride * 3; + dst_ptr += dst_stride; + ScaleRowDown38_3(src_ptr, 0, dst_ptr, dst_width); + } else if ((dst_height % 3) == 1) { + ScaleRowDown38_3(src_ptr, 0, dst_ptr, dst_width); + } +} + +#define MIN1(x) ((x) < 1 ? 
1 : (x)) + +static __inline uint32_t SumPixels(int iboxwidth, const uint16_t* src_ptr) { + uint32_t sum = 0u; + int x; + assert(iboxwidth > 0); + for (x = 0; x < iboxwidth; ++x) { + sum += src_ptr[x]; + } + return sum; +} + +static __inline uint32_t SumPixels_16(int iboxwidth, const uint32_t* src_ptr) { + uint32_t sum = 0u; + int x; + assert(iboxwidth > 0); + for (x = 0; x < iboxwidth; ++x) { + sum += src_ptr[x]; + } + return sum; +} + +static void ScaleAddCols2_C(int dst_width, + int boxheight, + int x, + int dx, + const uint16_t* src_ptr, + uint8_t* dst_ptr) { + int i; + int scaletbl[2]; + int minboxwidth = dx >> 16; + int boxwidth; + scaletbl[0] = 65536 / (MIN1(minboxwidth) * boxheight); + scaletbl[1] = 65536 / (MIN1(minboxwidth + 1) * boxheight); + for (i = 0; i < dst_width; ++i) { + int ix = x >> 16; + x += dx; + boxwidth = MIN1((x >> 16) - ix); + *dst_ptr++ = + SumPixels(boxwidth, src_ptr + ix) * scaletbl[boxwidth - minboxwidth] >> + 16; + } +} + +static void ScaleAddCols2_16_C(int dst_width, + int boxheight, + int x, + int dx, + const uint32_t* src_ptr, + uint16_t* dst_ptr) { + int i; + int scaletbl[2]; + int minboxwidth = dx >> 16; + int boxwidth; + scaletbl[0] = 65536 / (MIN1(minboxwidth) * boxheight); + scaletbl[1] = 65536 / (MIN1(minboxwidth + 1) * boxheight); + for (i = 0; i < dst_width; ++i) { + int ix = x >> 16; + x += dx; + boxwidth = MIN1((x >> 16) - ix); + *dst_ptr++ = SumPixels_16(boxwidth, src_ptr + ix) * + scaletbl[boxwidth - minboxwidth] >> + 16; + } +} + +static void ScaleAddCols0_C(int dst_width, + int boxheight, + int x, + int dx, + const uint16_t* src_ptr, + uint8_t* dst_ptr) { + int scaleval = 65536 / boxheight; + int i; + (void)dx; + src_ptr += (x >> 16); + for (i = 0; i < dst_width; ++i) { + *dst_ptr++ = src_ptr[i] * scaleval >> 16; + } +} + +static void ScaleAddCols1_C(int dst_width, + int boxheight, + int x, + int dx, + const uint16_t* src_ptr, + uint8_t* dst_ptr) { + int boxwidth = MIN1(dx >> 16); + int scaleval = 65536 / (boxwidth * boxheight); + int i; + x >>= 16; + for (i = 0; i < dst_width; ++i) { + *dst_ptr++ = SumPixels(boxwidth, src_ptr + x) * scaleval >> 16; + x += boxwidth; + } +} + +static void ScaleAddCols1_16_C(int dst_width, + int boxheight, + int x, + int dx, + const uint32_t* src_ptr, + uint16_t* dst_ptr) { + int boxwidth = MIN1(dx >> 16); + int scaleval = 65536 / (boxwidth * boxheight); + int i; + for (i = 0; i < dst_width; ++i) { + *dst_ptr++ = SumPixels_16(boxwidth, src_ptr + x) * scaleval >> 16; + x += boxwidth; + } +} + +// Scale plane down to any dimensions, with interpolation. +// (boxfilter). +// +// Same method as SimpleScale, which is fixed point, outputting +// one pixel of destination using fixed point (16.16) to step +// through source, sampling a box of pixel with simple +// averaging. +static void ScalePlaneBox(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_ptr, + uint8_t* dst_ptr) { + int j, k; + // Initial source x/y coordinate and step values as 16.16 fixed point. + int x = 0; + int y = 0; + int dx = 0; + int dy = 0; + const int max_y = (src_height << 16); + ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterBox, &x, &y, + &dx, &dy); + src_width = Abs(src_width); + { + // Allocate a row buffer of uint16_t. + align_buffer_64(row16, src_width * 2); + void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx, + const uint16_t* src_ptr, uint8_t* dst_ptr) = + (dx & 0xffff) ? ScaleAddCols2_C + : ((dx != 0x10000) ? 
ScaleAddCols1_C : ScaleAddCols0_C); + void (*ScaleAddRow)(const uint8_t* src_ptr, uint16_t* dst_ptr, + int src_width) = ScaleAddRow_C; +#if defined(HAS_SCALEADDROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ScaleAddRow = ScaleAddRow_Any_SSE2; + if (IS_ALIGNED(src_width, 16)) { + ScaleAddRow = ScaleAddRow_SSE2; + } + } +#endif +#if defined(HAS_SCALEADDROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ScaleAddRow = ScaleAddRow_Any_AVX2; + if (IS_ALIGNED(src_width, 32)) { + ScaleAddRow = ScaleAddRow_AVX2; + } + } +#endif +#if defined(HAS_SCALEADDROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ScaleAddRow = ScaleAddRow_Any_NEON; + if (IS_ALIGNED(src_width, 16)) { + ScaleAddRow = ScaleAddRow_NEON; + } + } +#endif +#if defined(HAS_SCALEADDROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ScaleAddRow = ScaleAddRow_Any_MSA; + if (IS_ALIGNED(src_width, 16)) { + ScaleAddRow = ScaleAddRow_MSA; + } + } +#endif + + for (j = 0; j < dst_height; ++j) { + int boxheight; + int iy = y >> 16; + const uint8_t* src = src_ptr + iy * src_stride; + y += dy; + if (y > max_y) { + y = max_y; + } + boxheight = MIN1((y >> 16) - iy); + memset(row16, 0, src_width * 2); + for (k = 0; k < boxheight; ++k) { + ScaleAddRow(src, (uint16_t*)(row16), src_width); + src += src_stride; + } + ScaleAddCols(dst_width, boxheight, x, dx, (uint16_t*)(row16), dst_ptr); + dst_ptr += dst_stride; + } + free_aligned_buffer_64(row16); + } +} + +static void ScalePlaneBox_16(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16_t* src_ptr, + uint16_t* dst_ptr) { + int j, k; + // Initial source x/y coordinate and step values as 16.16 fixed point. + int x = 0; + int y = 0; + int dx = 0; + int dy = 0; + const int max_y = (src_height << 16); + ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterBox, &x, &y, + &dx, &dy); + src_width = Abs(src_width); + { + // Allocate a row buffer of uint32_t. + align_buffer_64(row32, src_width * 4); + void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx, + const uint32_t* src_ptr, uint16_t* dst_ptr) = + (dx & 0xffff) ? ScaleAddCols2_16_C : ScaleAddCols1_16_C; + void (*ScaleAddRow)(const uint16_t* src_ptr, uint32_t* dst_ptr, + int src_width) = ScaleAddRow_16_C; + +#if defined(HAS_SCALEADDROW_16_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(src_width, 16)) { + ScaleAddRow = ScaleAddRow_16_SSE2; + } +#endif + + for (j = 0; j < dst_height; ++j) { + int boxheight; + int iy = y >> 16; + const uint16_t* src = src_ptr + iy * src_stride; + y += dy; + if (y > max_y) { + y = max_y; + } + boxheight = MIN1((y >> 16) - iy); + memset(row32, 0, src_width * 4); + for (k = 0; k < boxheight; ++k) { + ScaleAddRow(src, (uint32_t*)(row32), src_width); + src += src_stride; + } + ScaleAddCols(dst_width, boxheight, x, dx, (uint32_t*)(row32), dst_ptr); + dst_ptr += dst_stride; + } + free_aligned_buffer_64(row32); + } +} + +// Scale plane down with bilinear interpolation. +void ScalePlaneBilinearDown(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_ptr, + uint8_t* dst_ptr, + enum FilterMode filtering) { + // Initial source x/y coordinate and step values as 16.16 fixed point. + int x = 0; + int y = 0; + int dx = 0; + int dy = 0; + // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear. + // Allocate a row buffer. 
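+ // The row buffer holds one horizontally-unscaled source row, allocated with
+ // align_buffer_64 so the SIMD kernels see an aligned scratch area. For bilinear
+ // filtering, InterpolateRow blends two adjacent source rows into it before
+ // ScaleFilterCols samples it horizontally; kFilterLinear reads the source row
+ // directly and skips the buffer.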
+ align_buffer_64(row, src_width); + + const int max_y = (src_height - 1) << 16; + int j; + void (*ScaleFilterCols)(uint8_t * dst_ptr, const uint8_t* src_ptr, + int dst_width, int x, int dx) = + (src_width >= 32768) ? ScaleFilterCols64_C : ScaleFilterCols_C; + void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) = InterpolateRow_C; + ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y, + &dx, &dy); + src_width = Abs(src_width); + +#if defined(HAS_INTERPOLATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + InterpolateRow = InterpolateRow_Any_SSSE3; + if (IS_ALIGNED(src_width, 16)) { + InterpolateRow = InterpolateRow_SSSE3; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + InterpolateRow = InterpolateRow_Any_AVX2; + if (IS_ALIGNED(src_width, 32)) { + InterpolateRow = InterpolateRow_AVX2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + InterpolateRow = InterpolateRow_Any_NEON; + if (IS_ALIGNED(src_width, 16)) { + InterpolateRow = InterpolateRow_NEON; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + InterpolateRow = InterpolateRow_Any_MSA; + if (IS_ALIGNED(src_width, 32)) { + InterpolateRow = InterpolateRow_MSA; + } + } +#endif + +#if defined(HAS_SCALEFILTERCOLS_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { + ScaleFilterCols = ScaleFilterCols_SSSE3; + } +#endif +#if defined(HAS_SCALEFILTERCOLS_NEON) + if (TestCpuFlag(kCpuHasNEON) && src_width < 32768) { + ScaleFilterCols = ScaleFilterCols_Any_NEON; + if (IS_ALIGNED(dst_width, 8)) { + ScaleFilterCols = ScaleFilterCols_NEON; + } + } +#endif +#if defined(HAS_SCALEFILTERCOLS_MSA) + if (TestCpuFlag(kCpuHasMSA) && src_width < 32768) { + ScaleFilterCols = ScaleFilterCols_Any_MSA; + if (IS_ALIGNED(dst_width, 16)) { + ScaleFilterCols = ScaleFilterCols_MSA; + } + } +#endif + if (y > max_y) { + y = max_y; + } + + for (j = 0; j < dst_height; ++j) { + int yi = y >> 16; + const uint8_t* src = src_ptr + yi * src_stride; + if (filtering == kFilterLinear) { + ScaleFilterCols(dst_ptr, src, dst_width, x, dx); + } else { + int yf = (y >> 8) & 255; + InterpolateRow(row, src, src_stride, src_width, yf); + ScaleFilterCols(dst_ptr, row, dst_width, x, dx); + } + dst_ptr += dst_stride; + y += dy; + if (y > max_y) { + y = max_y; + } + } + free_aligned_buffer_64(row); +} + +void ScalePlaneBilinearDown_16(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16_t* src_ptr, + uint16_t* dst_ptr, + enum FilterMode filtering) { + // Initial source x/y coordinate and step values as 16.16 fixed point. + int x = 0; + int y = 0; + int dx = 0; + int dy = 0; + // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear. + // Allocate a row buffer. + align_buffer_64(row, src_width * 2); + + const int max_y = (src_height - 1) << 16; + int j; + void (*ScaleFilterCols)(uint16_t * dst_ptr, const uint16_t* src_ptr, + int dst_width, int x, int dx) = + (src_width >= 32768) ? 
ScaleFilterCols64_16_C : ScaleFilterCols_16_C; + void (*InterpolateRow)(uint16_t * dst_ptr, const uint16_t* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) = InterpolateRow_16_C; + ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y, + &dx, &dy); + src_width = Abs(src_width); + +#if defined(HAS_INTERPOLATEROW_16_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + InterpolateRow = InterpolateRow_Any_16_SSE2; + if (IS_ALIGNED(src_width, 16)) { + InterpolateRow = InterpolateRow_16_SSE2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_16_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + InterpolateRow = InterpolateRow_Any_16_SSSE3; + if (IS_ALIGNED(src_width, 16)) { + InterpolateRow = InterpolateRow_16_SSSE3; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_16_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + InterpolateRow = InterpolateRow_Any_16_AVX2; + if (IS_ALIGNED(src_width, 32)) { + InterpolateRow = InterpolateRow_16_AVX2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_16_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + InterpolateRow = InterpolateRow_Any_16_NEON; + if (IS_ALIGNED(src_width, 16)) { + InterpolateRow = InterpolateRow_16_NEON; + } + } +#endif + +#if defined(HAS_SCALEFILTERCOLS_16_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { + ScaleFilterCols = ScaleFilterCols_16_SSSE3; + } +#endif + if (y > max_y) { + y = max_y; + } + + for (j = 0; j < dst_height; ++j) { + int yi = y >> 16; + const uint16_t* src = src_ptr + yi * src_stride; + if (filtering == kFilterLinear) { + ScaleFilterCols(dst_ptr, src, dst_width, x, dx); + } else { + int yf = (y >> 8) & 255; + InterpolateRow((uint16_t*)row, src, src_stride, src_width, yf); + ScaleFilterCols(dst_ptr, (uint16_t*)row, dst_width, x, dx); + } + dst_ptr += dst_stride; + y += dy; + if (y > max_y) { + y = max_y; + } + } + free_aligned_buffer_64(row); +} + +// Scale up down with bilinear interpolation. +void ScalePlaneBilinearUp(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_ptr, + uint8_t* dst_ptr, + enum FilterMode filtering) { + int j; + // Initial source x/y coordinate and step values as 16.16 fixed point. + int x = 0; + int y = 0; + int dx = 0; + int dy = 0; + const int max_y = (src_height - 1) << 16; + void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) = InterpolateRow_C; + void (*ScaleFilterCols)(uint8_t * dst_ptr, const uint8_t* src_ptr, + int dst_width, int x, int dx) = + filtering ? 
ScaleFilterCols_C : ScaleCols_C; + ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y, + &dx, &dy); + src_width = Abs(src_width); + +#if defined(HAS_INTERPOLATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + InterpolateRow = InterpolateRow_Any_SSSE3; + if (IS_ALIGNED(dst_width, 16)) { + InterpolateRow = InterpolateRow_SSSE3; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + InterpolateRow = InterpolateRow_Any_AVX2; + if (IS_ALIGNED(dst_width, 32)) { + InterpolateRow = InterpolateRow_AVX2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + InterpolateRow = InterpolateRow_Any_NEON; + if (IS_ALIGNED(dst_width, 16)) { + InterpolateRow = InterpolateRow_NEON; + } + } +#endif + + if (filtering && src_width >= 32768) { + ScaleFilterCols = ScaleFilterCols64_C; + } +#if defined(HAS_SCALEFILTERCOLS_SSSE3) + if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { + ScaleFilterCols = ScaleFilterCols_SSSE3; + } +#endif +#if defined(HAS_SCALEFILTERCOLS_NEON) + if (filtering && TestCpuFlag(kCpuHasNEON) && src_width < 32768) { + ScaleFilterCols = ScaleFilterCols_Any_NEON; + if (IS_ALIGNED(dst_width, 8)) { + ScaleFilterCols = ScaleFilterCols_NEON; + } + } +#endif +#if defined(HAS_SCALEFILTERCOLS_MSA) + if (filtering && TestCpuFlag(kCpuHasMSA) && src_width < 32768) { + ScaleFilterCols = ScaleFilterCols_Any_MSA; + if (IS_ALIGNED(dst_width, 16)) { + ScaleFilterCols = ScaleFilterCols_MSA; + } + } +#endif + if (!filtering && src_width * 2 == dst_width && x < 0x8000) { + ScaleFilterCols = ScaleColsUp2_C; +#if defined(HAS_SCALECOLS_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { + ScaleFilterCols = ScaleColsUp2_SSE2; + } +#endif + } + + if (y > max_y) { + y = max_y; + } + { + int yi = y >> 16; + const uint8_t* src = src_ptr + yi * src_stride; + + // Allocate 2 row buffers. + const int kRowSize = (dst_width + 31) & ~31; + align_buffer_64(row, kRowSize * 2); + + uint8_t* rowptr = row; + int rowstride = kRowSize; + int lasty = yi; + + ScaleFilterCols(rowptr, src, dst_width, x, dx); + if (src_height > 1) { + src += src_stride; + } + ScaleFilterCols(rowptr + rowstride, src, dst_width, x, dx); + src += src_stride; + + for (j = 0; j < dst_height; ++j) { + yi = y >> 16; + if (yi != lasty) { + if (y > max_y) { + y = max_y; + yi = y >> 16; + src = src_ptr + yi * src_stride; + } + if (yi != lasty) { + ScaleFilterCols(rowptr, src, dst_width, x, dx); + rowptr += rowstride; + rowstride = -rowstride; + lasty = yi; + src += src_stride; + } + } + if (filtering == kFilterLinear) { + InterpolateRow(dst_ptr, rowptr, 0, dst_width, 0); + } else { + int yf = (y >> 8) & 255; + InterpolateRow(dst_ptr, rowptr, rowstride, dst_width, yf); + } + dst_ptr += dst_stride; + y += dy; + } + free_aligned_buffer_64(row); + } +} + +void ScalePlaneBilinearUp_16(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16_t* src_ptr, + uint16_t* dst_ptr, + enum FilterMode filtering) { + int j; + // Initial source x/y coordinate and step values as 16.16 fixed point. + int x = 0; + int y = 0; + int dx = 0; + int dy = 0; + const int max_y = (src_height - 1) << 16; + void (*InterpolateRow)(uint16_t * dst_ptr, const uint16_t* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) = InterpolateRow_16_C; + void (*ScaleFilterCols)(uint16_t * dst_ptr, const uint16_t* src_ptr, + int dst_width, int x, int dx) = + filtering ? 
ScaleFilterCols_16_C : ScaleCols_16_C; + ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y, + &dx, &dy); + src_width = Abs(src_width); + +#if defined(HAS_INTERPOLATEROW_16_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + InterpolateRow = InterpolateRow_Any_16_SSE2; + if (IS_ALIGNED(dst_width, 16)) { + InterpolateRow = InterpolateRow_16_SSE2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_16_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + InterpolateRow = InterpolateRow_Any_16_SSSE3; + if (IS_ALIGNED(dst_width, 16)) { + InterpolateRow = InterpolateRow_16_SSSE3; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_16_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + InterpolateRow = InterpolateRow_Any_16_AVX2; + if (IS_ALIGNED(dst_width, 32)) { + InterpolateRow = InterpolateRow_16_AVX2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_16_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + InterpolateRow = InterpolateRow_Any_16_NEON; + if (IS_ALIGNED(dst_width, 16)) { + InterpolateRow = InterpolateRow_16_NEON; + } + } +#endif + + if (filtering && src_width >= 32768) { + ScaleFilterCols = ScaleFilterCols64_16_C; + } +#if defined(HAS_SCALEFILTERCOLS_16_SSSE3) + if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { + ScaleFilterCols = ScaleFilterCols_16_SSSE3; + } +#endif + if (!filtering && src_width * 2 == dst_width && x < 0x8000) { + ScaleFilterCols = ScaleColsUp2_16_C; +#if defined(HAS_SCALECOLS_16_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { + ScaleFilterCols = ScaleColsUp2_16_SSE2; + } +#endif + } + + if (y > max_y) { + y = max_y; + } + { + int yi = y >> 16; + const uint16_t* src = src_ptr + yi * src_stride; + + // Allocate 2 row buffers. + const int kRowSize = (dst_width + 31) & ~31; + align_buffer_64(row, kRowSize * 4); + + uint16_t* rowptr = (uint16_t*)row; + int rowstride = kRowSize; + int lasty = yi; + + ScaleFilterCols(rowptr, src, dst_width, x, dx); + if (src_height > 1) { + src += src_stride; + } + ScaleFilterCols(rowptr + rowstride, src, dst_width, x, dx); + src += src_stride; + + for (j = 0; j < dst_height; ++j) { + yi = y >> 16; + if (yi != lasty) { + if (y > max_y) { + y = max_y; + yi = y >> 16; + src = src_ptr + yi * src_stride; + } + if (yi != lasty) { + ScaleFilterCols(rowptr, src, dst_width, x, dx); + rowptr += rowstride; + rowstride = -rowstride; + lasty = yi; + src += src_stride; + } + } + if (filtering == kFilterLinear) { + InterpolateRow(dst_ptr, rowptr, 0, dst_width, 0); + } else { + int yf = (y >> 8) & 255; + InterpolateRow(dst_ptr, rowptr, rowstride, dst_width, yf); + } + dst_ptr += dst_stride; + y += dy; + } + free_aligned_buffer_64(row); + } +} + +// Scale Plane to/from any dimensions, without interpolation. +// Fixed point math is used for performance: The upper 16 bits +// of x and dx is the integer part of the source position and +// the lower 16 bits are the fixed decimal part. + +static void ScalePlaneSimple(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_ptr, + uint8_t* dst_ptr) { + int i; + void (*ScaleCols)(uint8_t * dst_ptr, const uint8_t* src_ptr, int dst_width, + int x, int dx) = ScaleCols_C; + // Initial source x/y coordinate and step values as 16.16 fixed point. 
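+ // e.g. halving 640 columns to 320 makes dx roughly 0x20000 (2.0 in 16.16), so
+ // each output pixel steps two source pixels and only the integer part (x >> 16)
+ // selects the source sample.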
+ int x = 0; + int y = 0; + int dx = 0; + int dy = 0; + ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterNone, &x, &y, + &dx, &dy); + src_width = Abs(src_width); + + if (src_width * 2 == dst_width && x < 0x8000) { + ScaleCols = ScaleColsUp2_C; +#if defined(HAS_SCALECOLS_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { + ScaleCols = ScaleColsUp2_SSE2; + } +#endif + } + + for (i = 0; i < dst_height; ++i) { + ScaleCols(dst_ptr, src_ptr + (y >> 16) * src_stride, dst_width, x, dx); + dst_ptr += dst_stride; + y += dy; + } +} + +static void ScalePlaneSimple_16(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16_t* src_ptr, + uint16_t* dst_ptr) { + int i; + void (*ScaleCols)(uint16_t * dst_ptr, const uint16_t* src_ptr, int dst_width, + int x, int dx) = ScaleCols_16_C; + // Initial source x/y coordinate and step values as 16.16 fixed point. + int x = 0; + int y = 0; + int dx = 0; + int dy = 0; + ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterNone, &x, &y, + &dx, &dy); + src_width = Abs(src_width); + + if (src_width * 2 == dst_width && x < 0x8000) { + ScaleCols = ScaleColsUp2_16_C; +#if defined(HAS_SCALECOLS_16_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { + ScaleCols = ScaleColsUp2_16_SSE2; + } +#endif + } + + for (i = 0; i < dst_height; ++i) { + ScaleCols(dst_ptr, src_ptr + (y >> 16) * src_stride, dst_width, x, dx); + dst_ptr += dst_stride; + y += dy; + } +} + +// Scale a plane. +// This function dispatches to a specialized scaler based on scale factor. + +LIBYUV_API +void ScalePlane(const uint8_t* src, + int src_stride, + int src_width, + int src_height, + uint8_t* dst, + int dst_stride, + int dst_width, + int dst_height, + enum FilterMode filtering) { + // Simplify filtering when possible. + filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height, + filtering); + + // Negative height means invert the image. + if (src_height < 0) { + src_height = -src_height; + src = src + (src_height - 1) * src_stride; + src_stride = -src_stride; + } + + // Use specialized scales to improve performance for common resolutions. + // For example, all the 1/2 scalings will use ScalePlaneDown2() + if (dst_width == src_width && dst_height == src_height) { + // Straight copy. + CopyPlane(src, src_stride, dst, dst_stride, dst_width, dst_height); + return; + } + if (dst_width == src_width && filtering != kFilterBox) { + int dy = FixedDiv(src_height, dst_height); + // Arbitrary scale vertically, but unscaled horizontally. + ScalePlaneVertical(src_height, dst_width, dst_height, src_stride, + dst_stride, src, dst, 0, 0, dy, 1, filtering); + return; + } + if (dst_width <= Abs(src_width) && dst_height <= src_height) { + // Scale down. + if (4 * dst_width == 3 * src_width && 4 * dst_height == 3 * src_height) { + // optimized, 3/4 + ScalePlaneDown34(src_width, src_height, dst_width, dst_height, src_stride, + dst_stride, src, dst, filtering); + return; + } + if (2 * dst_width == src_width && 2 * dst_height == src_height) { + // optimized, 1/2 + ScalePlaneDown2(src_width, src_height, dst_width, dst_height, src_stride, + dst_stride, src, dst, filtering); + return; + } + // 3/8 rounded up for odd sized chroma height. 
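+ // e.g. 1280x720 -> 480x270 takes this path (8 * 480 == 3 * 1280 and
+ // 8 * 270 == 3 * 720).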
+ if (8 * dst_width == 3 * src_width && 8 * dst_height == 3 * src_height) { + // optimized, 3/8 + ScalePlaneDown38(src_width, src_height, dst_width, dst_height, src_stride, + dst_stride, src, dst, filtering); + return; + } + if (4 * dst_width == src_width && 4 * dst_height == src_height && + (filtering == kFilterBox || filtering == kFilterNone)) { + // optimized, 1/4 + ScalePlaneDown4(src_width, src_height, dst_width, dst_height, src_stride, + dst_stride, src, dst, filtering); + return; + } + } + if (filtering == kFilterBox && dst_height * 2 < src_height) { + ScalePlaneBox(src_width, src_height, dst_width, dst_height, src_stride, + dst_stride, src, dst); + return; + } + if (filtering && dst_height > src_height) { + ScalePlaneBilinearUp(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); + return; + } + if (filtering) { + ScalePlaneBilinearDown(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); + return; + } + ScalePlaneSimple(src_width, src_height, dst_width, dst_height, src_stride, + dst_stride, src, dst); +} + +LIBYUV_API +void ScalePlane_16(const uint16_t* src, + int src_stride, + int src_width, + int src_height, + uint16_t* dst, + int dst_stride, + int dst_width, + int dst_height, + enum FilterMode filtering) { + // Simplify filtering when possible. + filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height, + filtering); + + // Negative height means invert the image. + if (src_height < 0) { + src_height = -src_height; + src = src + (src_height - 1) * src_stride; + src_stride = -src_stride; + } + + // Use specialized scales to improve performance for common resolutions. + // For example, all the 1/2 scalings will use ScalePlaneDown2() + if (dst_width == src_width && dst_height == src_height) { + // Straight copy. + CopyPlane_16(src, src_stride, dst, dst_stride, dst_width, dst_height); + return; + } + if (dst_width == src_width && filtering != kFilterBox) { + int dy = FixedDiv(src_height, dst_height); + // Arbitrary scale vertically, but unscaled vertically. + ScalePlaneVertical_16(src_height, dst_width, dst_height, src_stride, + dst_stride, src, dst, 0, 0, dy, 1, filtering); + return; + } + if (dst_width <= Abs(src_width) && dst_height <= src_height) { + // Scale down. + if (4 * dst_width == 3 * src_width && 4 * dst_height == 3 * src_height) { + // optimized, 3/4 + ScalePlaneDown34_16(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); + return; + } + if (2 * dst_width == src_width && 2 * dst_height == src_height) { + // optimized, 1/2 + ScalePlaneDown2_16(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); + return; + } + // 3/8 rounded up for odd sized chroma height. 
+ if (8 * dst_width == 3 * src_width && 8 * dst_height == 3 * src_height) { + // optimized, 3/8 + ScalePlaneDown38_16(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); + return; + } + if (4 * dst_width == src_width && 4 * dst_height == src_height && + (filtering == kFilterBox || filtering == kFilterNone)) { + // optimized, 1/4 + ScalePlaneDown4_16(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); + return; + } + } + if (filtering == kFilterBox && dst_height * 2 < src_height) { + ScalePlaneBox_16(src_width, src_height, dst_width, dst_height, src_stride, + dst_stride, src, dst); + return; + } + if (filtering && dst_height > src_height) { + ScalePlaneBilinearUp_16(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); + return; + } + if (filtering) { + ScalePlaneBilinearDown_16(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); + return; + } + ScalePlaneSimple_16(src_width, src_height, dst_width, dst_height, src_stride, + dst_stride, src, dst); +} + +// Scale an I420 image. +// This function in turn calls a scaling function for each plane. + +LIBYUV_API +int I420Scale(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + int src_width, + int src_height, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int dst_width, + int dst_height, + enum FilterMode filtering) { + int src_halfwidth = SUBSAMPLE(src_width, 1, 1); + int src_halfheight = SUBSAMPLE(src_height, 1, 1); + int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1); + int dst_halfheight = SUBSAMPLE(dst_height, 1, 1); + if (!src_y || !src_u || !src_v || src_width == 0 || src_height == 0 || + src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v || + dst_width <= 0 || dst_height <= 0) { + return -1; + } + + ScalePlane(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y, + dst_width, dst_height, filtering); + ScalePlane(src_u, src_stride_u, src_halfwidth, src_halfheight, dst_u, + dst_stride_u, dst_halfwidth, dst_halfheight, filtering); + ScalePlane(src_v, src_stride_v, src_halfwidth, src_halfheight, dst_v, + dst_stride_v, dst_halfwidth, dst_halfheight, filtering); + return 0; +} + +LIBYUV_API +int I420Scale_16(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + int src_width, + int src_height, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int dst_width, + int dst_height, + enum FilterMode filtering) { + int src_halfwidth = SUBSAMPLE(src_width, 1, 1); + int src_halfheight = SUBSAMPLE(src_height, 1, 1); + int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1); + int dst_halfheight = SUBSAMPLE(dst_height, 1, 1); + if (!src_y || !src_u || !src_v || src_width == 0 || src_height == 0 || + src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v || + dst_width <= 0 || dst_height <= 0) { + return -1; + } + + ScalePlane_16(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y, + dst_width, dst_height, filtering); + ScalePlane_16(src_u, src_stride_u, src_halfwidth, src_halfheight, dst_u, + dst_stride_u, dst_halfwidth, dst_halfheight, filtering); + ScalePlane_16(src_v, src_stride_v, src_halfwidth, src_halfheight, dst_v, + dst_stride_v, dst_halfwidth, 
dst_halfheight, filtering); + return 0; +} + +// Deprecated api +LIBYUV_API +int Scale(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + int src_stride_y, + int src_stride_u, + int src_stride_v, + int src_width, + int src_height, + uint8_t* dst_y, + uint8_t* dst_u, + uint8_t* dst_v, + int dst_stride_y, + int dst_stride_u, + int dst_stride_v, + int dst_width, + int dst_height, + LIBYUV_BOOL interpolate) { + return I420Scale(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, src_width, src_height, dst_y, dst_stride_y, + dst_u, dst_stride_u, dst_v, dst_stride_v, dst_width, + dst_height, interpolate ? kFilterBox : kFilterNone); +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/media/libyuv/libyuv/source/scale_any.cc b/media/libyuv/libyuv/source/scale_any.cc new file mode 100644 index 0000000000..53ad136404 --- /dev/null +++ b/media/libyuv/libyuv/source/scale_any.cc @@ -0,0 +1,464 @@ +/* + * Copyright 2015 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/scale.h" +#include "libyuv/scale_row.h" + +#include "libyuv/basic_types.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// Definition for ScaleFilterCols, ScaleARGBCols and ScaleARGBFilterCols +#define CANY(NAMEANY, TERP_SIMD, TERP_C, BPP, MASK) \ + void NAMEANY(uint8_t* dst_ptr, const uint8_t* src_ptr, int dst_width, int x, \ + int dx) { \ + int r = dst_width & MASK; \ + int n = dst_width & ~MASK; \ + if (n > 0) { \ + TERP_SIMD(dst_ptr, src_ptr, n, x, dx); \ + } \ + TERP_C(dst_ptr + n * BPP, src_ptr, r, x + n * dx, dx); \ + } + +#ifdef HAS_SCALEFILTERCOLS_NEON +CANY(ScaleFilterCols_Any_NEON, ScaleFilterCols_NEON, ScaleFilterCols_C, 1, 7) +#endif +#ifdef HAS_SCALEFILTERCOLS_MSA +CANY(ScaleFilterCols_Any_MSA, ScaleFilterCols_MSA, ScaleFilterCols_C, 1, 15) +#endif +#ifdef HAS_SCALEARGBCOLS_NEON +CANY(ScaleARGBCols_Any_NEON, ScaleARGBCols_NEON, ScaleARGBCols_C, 4, 7) +#endif +#ifdef HAS_SCALEARGBCOLS_MSA +CANY(ScaleARGBCols_Any_MSA, ScaleARGBCols_MSA, ScaleARGBCols_C, 4, 3) +#endif +#ifdef HAS_SCALEARGBFILTERCOLS_NEON +CANY(ScaleARGBFilterCols_Any_NEON, + ScaleARGBFilterCols_NEON, + ScaleARGBFilterCols_C, + 4, + 3) +#endif +#ifdef HAS_SCALEARGBFILTERCOLS_MSA +CANY(ScaleARGBFilterCols_Any_MSA, + ScaleARGBFilterCols_MSA, + ScaleARGBFilterCols_C, + 4, + 7) +#endif +#undef CANY + +// Fixed scale down. +// Mask may be non-power of 2, so use MOD +#define SDANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK) \ + void NAMEANY(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, \ + int dst_width) { \ + int r = (int)((unsigned int)dst_width % (MASK + 1)); /* NOLINT */ \ + int n = dst_width - r; \ + if (n > 0) { \ + SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n); \ + } \ + SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride, \ + dst_ptr + n * BPP, r); \ + } + +// Fixed scale down for odd source width. Used by I420Blend subsampling. +// Since dst_width is (width + 1) / 2, this function scales one less pixel +// and copies the last pixel. 
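+// e.g. an odd source width of 641 gives dst_width (641 + 1) / 2 = 321: with MASK 15
+// the SIMD row function covers 320 output pixels and the C function produces the
+// final output pixel as described above.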
+#define SDODD(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK) \ + void NAMEANY(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, \ + int dst_width) { \ + int r = (int)((unsigned int)(dst_width - 1) % (MASK + 1)); /* NOLINT */ \ + int n = (dst_width - 1) - r; \ + if (n > 0) { \ + SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n); \ + } \ + SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride, \ + dst_ptr + n * BPP, r + 1); \ + } + +#ifdef HAS_SCALEROWDOWN2_SSSE3 +SDANY(ScaleRowDown2_Any_SSSE3, ScaleRowDown2_SSSE3, ScaleRowDown2_C, 2, 1, 15) +SDANY(ScaleRowDown2Linear_Any_SSSE3, + ScaleRowDown2Linear_SSSE3, + ScaleRowDown2Linear_C, + 2, + 1, + 15) +SDANY(ScaleRowDown2Box_Any_SSSE3, + ScaleRowDown2Box_SSSE3, + ScaleRowDown2Box_C, + 2, + 1, + 15) +SDODD(ScaleRowDown2Box_Odd_SSSE3, + ScaleRowDown2Box_SSSE3, + ScaleRowDown2Box_Odd_C, + 2, + 1, + 15) +#endif +#ifdef HAS_SCALEROWDOWN2_AVX2 +SDANY(ScaleRowDown2_Any_AVX2, ScaleRowDown2_AVX2, ScaleRowDown2_C, 2, 1, 31) +SDANY(ScaleRowDown2Linear_Any_AVX2, + ScaleRowDown2Linear_AVX2, + ScaleRowDown2Linear_C, + 2, + 1, + 31) +SDANY(ScaleRowDown2Box_Any_AVX2, + ScaleRowDown2Box_AVX2, + ScaleRowDown2Box_C, + 2, + 1, + 31) +SDODD(ScaleRowDown2Box_Odd_AVX2, + ScaleRowDown2Box_AVX2, + ScaleRowDown2Box_Odd_C, + 2, + 1, + 31) +#endif +#ifdef HAS_SCALEROWDOWN2_NEON +SDANY(ScaleRowDown2_Any_NEON, ScaleRowDown2_NEON, ScaleRowDown2_C, 2, 1, 15) +SDANY(ScaleRowDown2Linear_Any_NEON, + ScaleRowDown2Linear_NEON, + ScaleRowDown2Linear_C, + 2, + 1, + 15) +SDANY(ScaleRowDown2Box_Any_NEON, + ScaleRowDown2Box_NEON, + ScaleRowDown2Box_C, + 2, + 1, + 15) +SDODD(ScaleRowDown2Box_Odd_NEON, + ScaleRowDown2Box_NEON, + ScaleRowDown2Box_Odd_C, + 2, + 1, + 15) +#endif +#ifdef HAS_SCALEROWDOWN2_MSA +SDANY(ScaleRowDown2_Any_MSA, ScaleRowDown2_MSA, ScaleRowDown2_C, 2, 1, 31) +SDANY(ScaleRowDown2Linear_Any_MSA, + ScaleRowDown2Linear_MSA, + ScaleRowDown2Linear_C, + 2, + 1, + 31) +SDANY(ScaleRowDown2Box_Any_MSA, + ScaleRowDown2Box_MSA, + ScaleRowDown2Box_C, + 2, + 1, + 31) +#endif +#ifdef HAS_SCALEROWDOWN4_SSSE3 +SDANY(ScaleRowDown4_Any_SSSE3, ScaleRowDown4_SSSE3, ScaleRowDown4_C, 4, 1, 7) +SDANY(ScaleRowDown4Box_Any_SSSE3, + ScaleRowDown4Box_SSSE3, + ScaleRowDown4Box_C, + 4, + 1, + 7) +#endif +#ifdef HAS_SCALEROWDOWN4_AVX2 +SDANY(ScaleRowDown4_Any_AVX2, ScaleRowDown4_AVX2, ScaleRowDown4_C, 4, 1, 15) +SDANY(ScaleRowDown4Box_Any_AVX2, + ScaleRowDown4Box_AVX2, + ScaleRowDown4Box_C, + 4, + 1, + 15) +#endif +#ifdef HAS_SCALEROWDOWN4_NEON +SDANY(ScaleRowDown4_Any_NEON, ScaleRowDown4_NEON, ScaleRowDown4_C, 4, 1, 7) +SDANY(ScaleRowDown4Box_Any_NEON, + ScaleRowDown4Box_NEON, + ScaleRowDown4Box_C, + 4, + 1, + 7) +#endif +#ifdef HAS_SCALEROWDOWN4_MSA +SDANY(ScaleRowDown4_Any_MSA, ScaleRowDown4_MSA, ScaleRowDown4_C, 4, 1, 15) +SDANY(ScaleRowDown4Box_Any_MSA, + ScaleRowDown4Box_MSA, + ScaleRowDown4Box_C, + 4, + 1, + 15) +#endif +#ifdef HAS_SCALEROWDOWN34_SSSE3 +SDANY(ScaleRowDown34_Any_SSSE3, + ScaleRowDown34_SSSE3, + ScaleRowDown34_C, + 4 / 3, + 1, + 23) +SDANY(ScaleRowDown34_0_Box_Any_SSSE3, + ScaleRowDown34_0_Box_SSSE3, + ScaleRowDown34_0_Box_C, + 4 / 3, + 1, + 23) +SDANY(ScaleRowDown34_1_Box_Any_SSSE3, + ScaleRowDown34_1_Box_SSSE3, + ScaleRowDown34_1_Box_C, + 4 / 3, + 1, + 23) +#endif +#ifdef HAS_SCALEROWDOWN34_NEON +SDANY(ScaleRowDown34_Any_NEON, + ScaleRowDown34_NEON, + ScaleRowDown34_C, + 4 / 3, + 1, + 23) +SDANY(ScaleRowDown34_0_Box_Any_NEON, + ScaleRowDown34_0_Box_NEON, + ScaleRowDown34_0_Box_C, + 4 / 3, + 1, + 23) +SDANY(ScaleRowDown34_1_Box_Any_NEON, + 
ScaleRowDown34_1_Box_NEON, + ScaleRowDown34_1_Box_C, + 4 / 3, + 1, + 23) +#endif +#ifdef HAS_SCALEROWDOWN34_MSA +SDANY(ScaleRowDown34_Any_MSA, + ScaleRowDown34_MSA, + ScaleRowDown34_C, + 4 / 3, + 1, + 47) +SDANY(ScaleRowDown34_0_Box_Any_MSA, + ScaleRowDown34_0_Box_MSA, + ScaleRowDown34_0_Box_C, + 4 / 3, + 1, + 47) +SDANY(ScaleRowDown34_1_Box_Any_MSA, + ScaleRowDown34_1_Box_MSA, + ScaleRowDown34_1_Box_C, + 4 / 3, + 1, + 47) +#endif +#ifdef HAS_SCALEROWDOWN38_SSSE3 +SDANY(ScaleRowDown38_Any_SSSE3, + ScaleRowDown38_SSSE3, + ScaleRowDown38_C, + 8 / 3, + 1, + 11) +SDANY(ScaleRowDown38_3_Box_Any_SSSE3, + ScaleRowDown38_3_Box_SSSE3, + ScaleRowDown38_3_Box_C, + 8 / 3, + 1, + 5) +SDANY(ScaleRowDown38_2_Box_Any_SSSE3, + ScaleRowDown38_2_Box_SSSE3, + ScaleRowDown38_2_Box_C, + 8 / 3, + 1, + 5) +#endif +#ifdef HAS_SCALEROWDOWN38_NEON +SDANY(ScaleRowDown38_Any_NEON, + ScaleRowDown38_NEON, + ScaleRowDown38_C, + 8 / 3, + 1, + 11) +SDANY(ScaleRowDown38_3_Box_Any_NEON, + ScaleRowDown38_3_Box_NEON, + ScaleRowDown38_3_Box_C, + 8 / 3, + 1, + 11) +SDANY(ScaleRowDown38_2_Box_Any_NEON, + ScaleRowDown38_2_Box_NEON, + ScaleRowDown38_2_Box_C, + 8 / 3, + 1, + 11) +#endif +#ifdef HAS_SCALEROWDOWN38_MSA +SDANY(ScaleRowDown38_Any_MSA, + ScaleRowDown38_MSA, + ScaleRowDown38_C, + 8 / 3, + 1, + 11) +SDANY(ScaleRowDown38_3_Box_Any_MSA, + ScaleRowDown38_3_Box_MSA, + ScaleRowDown38_3_Box_C, + 8 / 3, + 1, + 11) +SDANY(ScaleRowDown38_2_Box_Any_MSA, + ScaleRowDown38_2_Box_MSA, + ScaleRowDown38_2_Box_C, + 8 / 3, + 1, + 11) +#endif + +#ifdef HAS_SCALEARGBROWDOWN2_SSE2 +SDANY(ScaleARGBRowDown2_Any_SSE2, + ScaleARGBRowDown2_SSE2, + ScaleARGBRowDown2_C, + 2, + 4, + 3) +SDANY(ScaleARGBRowDown2Linear_Any_SSE2, + ScaleARGBRowDown2Linear_SSE2, + ScaleARGBRowDown2Linear_C, + 2, + 4, + 3) +SDANY(ScaleARGBRowDown2Box_Any_SSE2, + ScaleARGBRowDown2Box_SSE2, + ScaleARGBRowDown2Box_C, + 2, + 4, + 3) +#endif +#ifdef HAS_SCALEARGBROWDOWN2_NEON +SDANY(ScaleARGBRowDown2_Any_NEON, + ScaleARGBRowDown2_NEON, + ScaleARGBRowDown2_C, + 2, + 4, + 7) +SDANY(ScaleARGBRowDown2Linear_Any_NEON, + ScaleARGBRowDown2Linear_NEON, + ScaleARGBRowDown2Linear_C, + 2, + 4, + 7) +SDANY(ScaleARGBRowDown2Box_Any_NEON, + ScaleARGBRowDown2Box_NEON, + ScaleARGBRowDown2Box_C, + 2, + 4, + 7) +#endif +#ifdef HAS_SCALEARGBROWDOWN2_MSA +SDANY(ScaleARGBRowDown2_Any_MSA, + ScaleARGBRowDown2_MSA, + ScaleARGBRowDown2_C, + 2, + 4, + 3) +SDANY(ScaleARGBRowDown2Linear_Any_MSA, + ScaleARGBRowDown2Linear_MSA, + ScaleARGBRowDown2Linear_C, + 2, + 4, + 3) +SDANY(ScaleARGBRowDown2Box_Any_MSA, + ScaleARGBRowDown2Box_MSA, + ScaleARGBRowDown2Box_C, + 2, + 4, + 3) +#endif +#undef SDANY + +// Scale down by even scale factor. 
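+// src_stepx is the horizontal source pixel step (e.g. 2 for a 1/2 reduction, 4 for
+// 1/4), so one wrapper serves every even scale factor; dst_width is split into a
+// SIMD multiple and a C remainder just as SDANY does above.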
+#define SDAANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, BPP, MASK) \ + void NAMEANY(const uint8_t* src_ptr, ptrdiff_t src_stride, int src_stepx, \ + uint8_t* dst_ptr, int dst_width) { \ + int r = dst_width & MASK; \ + int n = dst_width & ~MASK; \ + if (n > 0) { \ + SCALEROWDOWN_SIMD(src_ptr, src_stride, src_stepx, dst_ptr, n); \ + } \ + SCALEROWDOWN_C(src_ptr + (n * src_stepx) * BPP, src_stride, src_stepx, \ + dst_ptr + n * BPP, r); \ + } + +#ifdef HAS_SCALEARGBROWDOWNEVEN_SSE2 +SDAANY(ScaleARGBRowDownEven_Any_SSE2, + ScaleARGBRowDownEven_SSE2, + ScaleARGBRowDownEven_C, + 4, + 3) +SDAANY(ScaleARGBRowDownEvenBox_Any_SSE2, + ScaleARGBRowDownEvenBox_SSE2, + ScaleARGBRowDownEvenBox_C, + 4, + 3) +#endif +#ifdef HAS_SCALEARGBROWDOWNEVEN_NEON +SDAANY(ScaleARGBRowDownEven_Any_NEON, + ScaleARGBRowDownEven_NEON, + ScaleARGBRowDownEven_C, + 4, + 3) +SDAANY(ScaleARGBRowDownEvenBox_Any_NEON, + ScaleARGBRowDownEvenBox_NEON, + ScaleARGBRowDownEvenBox_C, + 4, + 3) +#endif +#ifdef HAS_SCALEARGBROWDOWNEVEN_MSA +SDAANY(ScaleARGBRowDownEven_Any_MSA, + ScaleARGBRowDownEven_MSA, + ScaleARGBRowDownEven_C, + 4, + 3) +SDAANY(ScaleARGBRowDownEvenBox_Any_MSA, + ScaleARGBRowDownEvenBox_MSA, + ScaleARGBRowDownEvenBox_C, + 4, + 3) +#endif + +// Add rows box filter scale down. +#define SAANY(NAMEANY, SCALEADDROW_SIMD, SCALEADDROW_C, MASK) \ + void NAMEANY(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) { \ + int n = src_width & ~MASK; \ + if (n > 0) { \ + SCALEADDROW_SIMD(src_ptr, dst_ptr, n); \ + } \ + SCALEADDROW_C(src_ptr + n, dst_ptr + n, src_width & MASK); \ + } + +#ifdef HAS_SCALEADDROW_SSE2 +SAANY(ScaleAddRow_Any_SSE2, ScaleAddRow_SSE2, ScaleAddRow_C, 15) +#endif +#ifdef HAS_SCALEADDROW_AVX2 +SAANY(ScaleAddRow_Any_AVX2, ScaleAddRow_AVX2, ScaleAddRow_C, 31) +#endif +#ifdef HAS_SCALEADDROW_NEON +SAANY(ScaleAddRow_Any_NEON, ScaleAddRow_NEON, ScaleAddRow_C, 15) +#endif +#ifdef HAS_SCALEADDROW_MSA +SAANY(ScaleAddRow_Any_MSA, ScaleAddRow_MSA, ScaleAddRow_C, 15) +#endif +#undef SAANY + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/media/libyuv/libyuv/source/scale_argb.cc b/media/libyuv/libyuv/source/scale_argb.cc new file mode 100644 index 0000000000..53a22e8b41 --- /dev/null +++ b/media/libyuv/libyuv/source/scale_argb.cc @@ -0,0 +1,1010 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/scale.h" + +#include <assert.h> +#include <string.h> + +#include "libyuv/cpu_id.h" +#include "libyuv/planar_functions.h" // For CopyARGB +#include "libyuv/row.h" +#include "libyuv/scale_row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +static __inline int Abs(int v) { + return v >= 0 ? v : -v; +} + +// ScaleARGB ARGB, 1/2 +// This is an optimized version for scaling down a ARGB to 1/2 of +// its original size. 
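+// x/dx and y/dy are 16.16 fixed-point source positions and steps; this path is only
+// taken for an exact 2x reduction, which the asserts below check (dx == 0x20000 and
+// dy a multiple of 0x20000).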
+static void ScaleARGBDown2(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_argb, + uint8_t* dst_argb, + int x, + int dx, + int y, + int dy, + enum FilterMode filtering) { + int j; + int row_stride = src_stride * (dy >> 16); + void (*ScaleARGBRowDown2)(const uint8_t* src_argb, ptrdiff_t src_stride, + uint8_t* dst_argb, int dst_width) = + filtering == kFilterNone + ? ScaleARGBRowDown2_C + : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_C + : ScaleARGBRowDown2Box_C); + (void)src_width; + (void)src_height; + (void)dx; + assert(dx == 65536 * 2); // Test scale factor of 2. + assert((dy & 0x1ffff) == 0); // Test vertical scale is multiple of 2. + // Advance to odd row, even column. + if (filtering == kFilterBilinear) { + src_argb += (y >> 16) * src_stride + (x >> 16) * 4; + } else { + src_argb += (y >> 16) * src_stride + ((x >> 16) - 1) * 4; + } + +#if defined(HAS_SCALEARGBROWDOWN2_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ScaleARGBRowDown2 = + filtering == kFilterNone + ? ScaleARGBRowDown2_Any_SSE2 + : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_SSE2 + : ScaleARGBRowDown2Box_Any_SSE2); + if (IS_ALIGNED(dst_width, 4)) { + ScaleARGBRowDown2 = + filtering == kFilterNone + ? ScaleARGBRowDown2_SSE2 + : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_SSE2 + : ScaleARGBRowDown2Box_SSE2); + } + } +#endif +#if defined(HAS_SCALEARGBROWDOWN2_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ScaleARGBRowDown2 = + filtering == kFilterNone + ? ScaleARGBRowDown2_Any_NEON + : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_NEON + : ScaleARGBRowDown2Box_Any_NEON); + if (IS_ALIGNED(dst_width, 8)) { + ScaleARGBRowDown2 = + filtering == kFilterNone + ? ScaleARGBRowDown2_NEON + : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_NEON + : ScaleARGBRowDown2Box_NEON); + } + } +#endif +#if defined(HAS_SCALEARGBROWDOWN2_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ScaleARGBRowDown2 = + filtering == kFilterNone + ? ScaleARGBRowDown2_Any_MSA + : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_MSA + : ScaleARGBRowDown2Box_Any_MSA); + if (IS_ALIGNED(dst_width, 4)) { + ScaleARGBRowDown2 = + filtering == kFilterNone + ? ScaleARGBRowDown2_MSA + : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_MSA + : ScaleARGBRowDown2Box_MSA); + } + } +#endif + + if (filtering == kFilterLinear) { + src_stride = 0; + } + for (j = 0; j < dst_height; ++j) { + ScaleARGBRowDown2(src_argb, src_stride, dst_argb, dst_width); + src_argb += row_stride; + dst_argb += dst_stride; + } +} + +// ScaleARGB ARGB, 1/4 +// This is an optimized version for scaling down a ARGB to 1/4 of +// its original size. +static void ScaleARGBDown4Box(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_argb, + uint8_t* dst_argb, + int x, + int dx, + int y, + int dy) { + int j; + // Allocate 2 rows of ARGB. + const int kRowSize = (dst_width * 2 * 4 + 31) & ~31; + align_buffer_64(row, kRowSize * 2); + int row_stride = src_stride * (dy >> 16); + void (*ScaleARGBRowDown2)(const uint8_t* src_argb, ptrdiff_t src_stride, + uint8_t* dst_argb, int dst_width) = + ScaleARGBRowDown2Box_C; + // Advance to odd row, even column. + src_argb += (y >> 16) * src_stride + (x >> 16) * 4; + (void)src_width; + (void)src_height; + (void)dx; + assert(dx == 65536 * 4); // Test scale factor of 4. + assert((dy & 0x3ffff) == 0); // Test vertical scale is multiple of 4. 
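+ // The 1/4 box reduction runs as two 1/2 box passes: each iteration of the loop
+ // below halves two pairs of source rows into the temporary rows allocated above,
+ // then halves those again into the destination, averaging a 4x4 box per output
+ // pixel.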
+#if defined(HAS_SCALEARGBROWDOWN2_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ScaleARGBRowDown2 = ScaleARGBRowDown2Box_Any_SSE2; + if (IS_ALIGNED(dst_width, 4)) { + ScaleARGBRowDown2 = ScaleARGBRowDown2Box_SSE2; + } + } +#endif +#if defined(HAS_SCALEARGBROWDOWN2_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ScaleARGBRowDown2 = ScaleARGBRowDown2Box_Any_NEON; + if (IS_ALIGNED(dst_width, 8)) { + ScaleARGBRowDown2 = ScaleARGBRowDown2Box_NEON; + } + } +#endif + + for (j = 0; j < dst_height; ++j) { + ScaleARGBRowDown2(src_argb, src_stride, row, dst_width * 2); + ScaleARGBRowDown2(src_argb + src_stride * 2, src_stride, row + kRowSize, + dst_width * 2); + ScaleARGBRowDown2(row, kRowSize, dst_argb, dst_width); + src_argb += row_stride; + dst_argb += dst_stride; + } + free_aligned_buffer_64(row); +} + +// ScaleARGB ARGB Even +// This is an optimized version for scaling down a ARGB to even +// multiple of its original size. +static void ScaleARGBDownEven(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_argb, + uint8_t* dst_argb, + int x, + int dx, + int y, + int dy, + enum FilterMode filtering) { + int j; + int col_step = dx >> 16; + int row_stride = (dy >> 16) * src_stride; + void (*ScaleARGBRowDownEven)(const uint8_t* src_argb, ptrdiff_t src_stride, + int src_step, uint8_t* dst_argb, int dst_width) = + filtering ? ScaleARGBRowDownEvenBox_C : ScaleARGBRowDownEven_C; + (void)src_width; + (void)src_height; + assert(IS_ALIGNED(src_width, 2)); + assert(IS_ALIGNED(src_height, 2)); + src_argb += (y >> 16) * src_stride + (x >> 16) * 4; +#if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_SSE2 + : ScaleARGBRowDownEven_Any_SSE2; + if (IS_ALIGNED(dst_width, 4)) { + ScaleARGBRowDownEven = + filtering ? ScaleARGBRowDownEvenBox_SSE2 : ScaleARGBRowDownEven_SSE2; + } + } +#endif +#if defined(HAS_SCALEARGBROWDOWNEVEN_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_NEON + : ScaleARGBRowDownEven_Any_NEON; + if (IS_ALIGNED(dst_width, 4)) { + ScaleARGBRowDownEven = + filtering ? ScaleARGBRowDownEvenBox_NEON : ScaleARGBRowDownEven_NEON; + } + } +#endif +#if defined(HAS_SCALEARGBROWDOWNEVEN_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_MSA + : ScaleARGBRowDownEven_Any_MSA; + if (IS_ALIGNED(dst_width, 4)) { + ScaleARGBRowDownEven = + filtering ? ScaleARGBRowDownEvenBox_MSA : ScaleARGBRowDownEven_MSA; + } + } +#endif + + if (filtering == kFilterLinear) { + src_stride = 0; + } + for (j = 0; j < dst_height; ++j) { + ScaleARGBRowDownEven(src_argb, src_stride, col_step, dst_argb, dst_width); + src_argb += row_stride; + dst_argb += dst_stride; + } +} + +// Scale ARGB down with bilinear interpolation. +static void ScaleARGBBilinearDown(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_argb, + uint8_t* dst_argb, + int x, + int dx, + int y, + int dy, + enum FilterMode filtering) { + int j; + void (*InterpolateRow)(uint8_t * dst_argb, const uint8_t* src_argb, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) = InterpolateRow_C; + void (*ScaleARGBFilterCols)(uint8_t * dst_argb, const uint8_t* src_argb, + int dst_width, int x, int dx) = + (src_width >= 32768) ? 
ScaleARGBFilterCols64_C : ScaleARGBFilterCols_C; + int64_t xlast = x + (int64_t)(dst_width - 1) * dx; + int64_t xl = (dx >= 0) ? x : xlast; + int64_t xr = (dx >= 0) ? xlast : x; + int clip_src_width; + xl = (xl >> 16) & ~3; // Left edge aligned. + xr = (xr >> 16) + 1; // Right most pixel used. Bilinear uses 2 pixels. + xr = (xr + 1 + 3) & ~3; // 1 beyond 4 pixel aligned right most pixel. + if (xr > src_width) { + xr = src_width; + } + clip_src_width = (int)(xr - xl) * 4; // Width aligned to 4. + src_argb += xl * 4; + x -= (int)(xl << 16); +#if defined(HAS_INTERPOLATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + InterpolateRow = InterpolateRow_Any_SSSE3; + if (IS_ALIGNED(clip_src_width, 16)) { + InterpolateRow = InterpolateRow_SSSE3; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + InterpolateRow = InterpolateRow_Any_AVX2; + if (IS_ALIGNED(clip_src_width, 32)) { + InterpolateRow = InterpolateRow_AVX2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + InterpolateRow = InterpolateRow_Any_NEON; + if (IS_ALIGNED(clip_src_width, 16)) { + InterpolateRow = InterpolateRow_NEON; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + InterpolateRow = InterpolateRow_Any_MSA; + if (IS_ALIGNED(clip_src_width, 32)) { + InterpolateRow = InterpolateRow_MSA; + } + } +#endif +#if defined(HAS_SCALEARGBFILTERCOLS_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { + ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3; + } +#endif +#if defined(HAS_SCALEARGBFILTERCOLS_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ScaleARGBFilterCols = ScaleARGBFilterCols_Any_NEON; + if (IS_ALIGNED(dst_width, 4)) { + ScaleARGBFilterCols = ScaleARGBFilterCols_NEON; + } + } +#endif +#if defined(HAS_SCALEARGBFILTERCOLS_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ScaleARGBFilterCols = ScaleARGBFilterCols_Any_MSA; + if (IS_ALIGNED(dst_width, 8)) { + ScaleARGBFilterCols = ScaleARGBFilterCols_MSA; + } + } +#endif + // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear. + // Allocate a row of ARGB. + { + align_buffer_64(row, clip_src_width * 4); + + const int max_y = (src_height - 1) << 16; + if (y > max_y) { + y = max_y; + } + for (j = 0; j < dst_height; ++j) { + int yi = y >> 16; + const uint8_t* src = src_argb + yi * src_stride; + if (filtering == kFilterLinear) { + ScaleARGBFilterCols(dst_argb, src, dst_width, x, dx); + } else { + int yf = (y >> 8) & 255; + InterpolateRow(row, src, src_stride, clip_src_width, yf); + ScaleARGBFilterCols(dst_argb, row, dst_width, x, dx); + } + dst_argb += dst_stride; + y += dy; + if (y > max_y) { + y = max_y; + } + } + free_aligned_buffer_64(row); + } +} + +// Scale ARGB up with bilinear interpolation. +static void ScaleARGBBilinearUp(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_argb, + uint8_t* dst_argb, + int x, + int dx, + int y, + int dy, + enum FilterMode filtering) { + int j; + void (*InterpolateRow)(uint8_t * dst_argb, const uint8_t* src_argb, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) = InterpolateRow_C; + void (*ScaleARGBFilterCols)(uint8_t * dst_argb, const uint8_t* src_argb, + int dst_width, int x, int dx) = + filtering ? 
ScaleARGBFilterCols_C : ScaleARGBCols_C; + const int max_y = (src_height - 1) << 16; +#if defined(HAS_INTERPOLATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + InterpolateRow = InterpolateRow_Any_SSSE3; + if (IS_ALIGNED(dst_width, 4)) { + InterpolateRow = InterpolateRow_SSSE3; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + InterpolateRow = InterpolateRow_Any_AVX2; + if (IS_ALIGNED(dst_width, 8)) { + InterpolateRow = InterpolateRow_AVX2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + InterpolateRow = InterpolateRow_Any_NEON; + if (IS_ALIGNED(dst_width, 4)) { + InterpolateRow = InterpolateRow_NEON; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + InterpolateRow = InterpolateRow_Any_MSA; + if (IS_ALIGNED(dst_width, 8)) { + InterpolateRow = InterpolateRow_MSA; + } + } +#endif + if (src_width >= 32768) { + ScaleARGBFilterCols = + filtering ? ScaleARGBFilterCols64_C : ScaleARGBCols64_C; + } +#if defined(HAS_SCALEARGBFILTERCOLS_SSSE3) + if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { + ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3; + } +#endif +#if defined(HAS_SCALEARGBFILTERCOLS_NEON) + if (filtering && TestCpuFlag(kCpuHasNEON)) { + ScaleARGBFilterCols = ScaleARGBFilterCols_Any_NEON; + if (IS_ALIGNED(dst_width, 4)) { + ScaleARGBFilterCols = ScaleARGBFilterCols_NEON; + } + } +#endif +#if defined(HAS_SCALEARGBFILTERCOLS_MSA) + if (filtering && TestCpuFlag(kCpuHasMSA)) { + ScaleARGBFilterCols = ScaleARGBFilterCols_Any_MSA; + if (IS_ALIGNED(dst_width, 8)) { + ScaleARGBFilterCols = ScaleARGBFilterCols_MSA; + } + } +#endif +#if defined(HAS_SCALEARGBCOLS_SSE2) + if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) { + ScaleARGBFilterCols = ScaleARGBCols_SSE2; + } +#endif +#if defined(HAS_SCALEARGBCOLS_NEON) + if (!filtering && TestCpuFlag(kCpuHasNEON)) { + ScaleARGBFilterCols = ScaleARGBCols_Any_NEON; + if (IS_ALIGNED(dst_width, 8)) { + ScaleARGBFilterCols = ScaleARGBCols_NEON; + } + } +#endif +#if defined(HAS_SCALEARGBCOLS_MSA) + if (!filtering && TestCpuFlag(kCpuHasMSA)) { + ScaleARGBFilterCols = ScaleARGBCols_Any_MSA; + if (IS_ALIGNED(dst_width, 4)) { + ScaleARGBFilterCols = ScaleARGBCols_MSA; + } + } +#endif + if (!filtering && src_width * 2 == dst_width && x < 0x8000) { + ScaleARGBFilterCols = ScaleARGBColsUp2_C; +#if defined(HAS_SCALEARGBCOLSUP2_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { + ScaleARGBFilterCols = ScaleARGBColsUp2_SSE2; + } +#endif + } + + if (y > max_y) { + y = max_y; + } + + { + int yi = y >> 16; + const uint8_t* src = src_argb + yi * src_stride; + + // Allocate 2 rows of ARGB. 
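+    // The scaler keeps the two most recently sampled source rows in one
+    // buffer and ping-pongs between its halves: kRowSize below rounds one
+    // ARGB output row (dst_width * 4 bytes) up to a multiple of 32 so each
+    // half stays SIMD-aligned, rowptr addresses the row for the current yi,
+    // and rowptr + rowstride addresses the following row. When y crosses
+    // into a new source row, the older half is overwritten with the next
+    // row and rowstride is negated so the two roles swap without copying.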
+ const int kRowSize = (dst_width * 4 + 31) & ~31; + align_buffer_64(row, kRowSize * 2); + + uint8_t* rowptr = row; + int rowstride = kRowSize; + int lasty = yi; + + ScaleARGBFilterCols(rowptr, src, dst_width, x, dx); + if (src_height > 1) { + src += src_stride; + } + ScaleARGBFilterCols(rowptr + rowstride, src, dst_width, x, dx); + src += src_stride; + + for (j = 0; j < dst_height; ++j) { + yi = y >> 16; + if (yi != lasty) { + if (y > max_y) { + y = max_y; + yi = y >> 16; + src = src_argb + yi * src_stride; + } + if (yi != lasty) { + ScaleARGBFilterCols(rowptr, src, dst_width, x, dx); + rowptr += rowstride; + rowstride = -rowstride; + lasty = yi; + src += src_stride; + } + } + if (filtering == kFilterLinear) { + InterpolateRow(dst_argb, rowptr, 0, dst_width * 4, 0); + } else { + int yf = (y >> 8) & 255; + InterpolateRow(dst_argb, rowptr, rowstride, dst_width * 4, yf); + } + dst_argb += dst_stride; + y += dy; + } + free_aligned_buffer_64(row); + } +} + +#ifdef YUVSCALEUP +// Scale YUV to ARGB up with bilinear interpolation. +static void ScaleYUVToARGBBilinearUp(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride_y, + int src_stride_u, + int src_stride_v, + int dst_stride_argb, + const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + int x, + int dx, + int y, + int dy, + enum FilterMode filtering) { + int j; + void (*I422ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, int width) = + I422ToARGBRow_C; +#if defined(HAS_I422TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I422ToARGBRow = I422ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(src_width, 8)) { + I422ToARGBRow = I422ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_I422TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToARGBRow = I422ToARGBRow_Any_AVX2; + if (IS_ALIGNED(src_width, 16)) { + I422ToARGBRow = I422ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_I422TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToARGBRow = I422ToARGBRow_Any_NEON; + if (IS_ALIGNED(src_width, 8)) { + I422ToARGBRow = I422ToARGBRow_NEON; + } + } +#endif +#if defined(HAS_I422TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToARGBRow = I422ToARGBRow_Any_MSA; + if (IS_ALIGNED(src_width, 8)) { + I422ToARGBRow = I422ToARGBRow_MSA; + } + } +#endif + + void (*InterpolateRow)(uint8_t * dst_argb, const uint8_t* src_argb, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) = InterpolateRow_C; +#if defined(HAS_INTERPOLATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + InterpolateRow = InterpolateRow_Any_SSSE3; + if (IS_ALIGNED(dst_width, 4)) { + InterpolateRow = InterpolateRow_SSSE3; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + InterpolateRow = InterpolateRow_Any_AVX2; + if (IS_ALIGNED(dst_width, 8)) { + InterpolateRow = InterpolateRow_AVX2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + InterpolateRow = InterpolateRow_Any_NEON; + if (IS_ALIGNED(dst_width, 4)) { + InterpolateRow = InterpolateRow_NEON; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + InterpolateRow = InterpolateRow_Any_MSA; + if (IS_ALIGNED(dst_width, 8)) { + InterpolateRow = InterpolateRow_MSA; + } + } +#endif + + void (*ScaleARGBFilterCols)(uint8_t * dst_argb, const uint8_t* src_argb, + int dst_width, int x, int dx) = + filtering ? 
ScaleARGBFilterCols_C : ScaleARGBCols_C; + if (src_width >= 32768) { + ScaleARGBFilterCols = + filtering ? ScaleARGBFilterCols64_C : ScaleARGBCols64_C; + } +#if defined(HAS_SCALEARGBFILTERCOLS_SSSE3) + if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { + ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3; + } +#endif +#if defined(HAS_SCALEARGBFILTERCOLS_NEON) + if (filtering && TestCpuFlag(kCpuHasNEON)) { + ScaleARGBFilterCols = ScaleARGBFilterCols_Any_NEON; + if (IS_ALIGNED(dst_width, 4)) { + ScaleARGBFilterCols = ScaleARGBFilterCols_NEON; + } + } +#endif +#if defined(HAS_SCALEARGBFILTERCOLS_MSA) + if (filtering && TestCpuFlag(kCpuHasMSA)) { + ScaleARGBFilterCols = ScaleARGBFilterCols_Any_MSA; + if (IS_ALIGNED(dst_width, 8)) { + ScaleARGBFilterCols = ScaleARGBFilterCols_MSA; + } + } +#endif +#if defined(HAS_SCALEARGBCOLS_SSE2) + if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) { + ScaleARGBFilterCols = ScaleARGBCols_SSE2; + } +#endif +#if defined(HAS_SCALEARGBCOLS_NEON) + if (!filtering && TestCpuFlag(kCpuHasNEON)) { + ScaleARGBFilterCols = ScaleARGBCols_Any_NEON; + if (IS_ALIGNED(dst_width, 8)) { + ScaleARGBFilterCols = ScaleARGBCols_NEON; + } + } +#endif +#if defined(HAS_SCALEARGBCOLS_MSA) + if (!filtering && TestCpuFlag(kCpuHasMSA)) { + ScaleARGBFilterCols = ScaleARGBCols_Any_MSA; + if (IS_ALIGNED(dst_width, 4)) { + ScaleARGBFilterCols = ScaleARGBCols_MSA; + } + } +#endif + if (!filtering && src_width * 2 == dst_width && x < 0x8000) { + ScaleARGBFilterCols = ScaleARGBColsUp2_C; +#if defined(HAS_SCALEARGBCOLSUP2_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { + ScaleARGBFilterCols = ScaleARGBColsUp2_SSE2; + } +#endif + } + + const int max_y = (src_height - 1) << 16; + if (y > max_y) { + y = max_y; + } + const int kYShift = 1; // Shift Y by 1 to convert Y plane to UV coordinate. + int yi = y >> 16; + int uv_yi = yi >> kYShift; + const uint8_t* src_row_y = src_y + yi * src_stride_y; + const uint8_t* src_row_u = src_u + uv_yi * src_stride_u; + const uint8_t* src_row_v = src_v + uv_yi * src_stride_v; + + // Allocate 2 rows of ARGB. + const int kRowSize = (dst_width * 4 + 31) & ~31; + align_buffer_64(row, kRowSize * 2); + + // Allocate 1 row of ARGB for source conversion. + align_buffer_64(argb_row, src_width * 4); + + uint8_t* rowptr = row; + int rowstride = kRowSize; + int lasty = yi; + + // TODO(fbarchard): Convert first 2 rows of YUV to ARGB. + ScaleARGBFilterCols(rowptr, src_row_y, dst_width, x, dx); + if (src_height > 1) { + src_row_y += src_stride_y; + if (yi & 1) { + src_row_u += src_stride_u; + src_row_v += src_stride_v; + } + } + ScaleARGBFilterCols(rowptr + rowstride, src_row_y, dst_width, x, dx); + if (src_height > 2) { + src_row_y += src_stride_y; + if (!(yi & 1)) { + src_row_u += src_stride_u; + src_row_v += src_stride_v; + } + } + + for (j = 0; j < dst_height; ++j) { + yi = y >> 16; + if (yi != lasty) { + if (y > max_y) { + y = max_y; + yi = y >> 16; + uv_yi = yi >> kYShift; + src_row_y = src_y + yi * src_stride_y; + src_row_u = src_u + uv_yi * src_stride_u; + src_row_v = src_v + uv_yi * src_stride_v; + } + if (yi != lasty) { + // TODO(fbarchard): Convert the clipped region of row. 
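+        // The whole source row is first converted from YUV to ARGB into
+        // argb_row, then filtered horizontally into the ping-pong buffer.
+        // For I420 the chroma rows advance at half the luma rate
+        // (uv_yi = yi >> kYShift), so src_row_u / src_row_v only step on
+        // every other luma row.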
+        I422ToARGBRow(src_row_y, src_row_u, src_row_v, argb_row, src_width);
+        ScaleARGBFilterCols(rowptr, argb_row, dst_width, x, dx);
+        rowptr += rowstride;
+        rowstride = -rowstride;
+        lasty = yi;
+        src_row_y += src_stride_y;
+        if (yi & 1) {
+          src_row_u += src_stride_u;
+          src_row_v += src_stride_v;
+        }
+      }
+    }
+    if (filtering == kFilterLinear) {
+      InterpolateRow(dst_argb, rowptr, 0, dst_width * 4, 0);
+    } else {
+      int yf = (y >> 8) & 255;
+      InterpolateRow(dst_argb, rowptr, rowstride, dst_width * 4, yf);
+    }
+    dst_argb += dst_stride_argb;
+    y += dy;
+  }
+  free_aligned_buffer_64(row);
+  free_aligned_buffer_64(argb_row);
+}
+#endif
+
+// Scale ARGB to/from any dimensions, without interpolation.
+// Fixed point math is used for performance: The upper 16 bits
+// of x and dx are the integer part of the source position and
+// the lower 16 bits are the fixed decimal part.
+
+static void ScaleARGBSimple(int src_width,
+                            int src_height,
+                            int dst_width,
+                            int dst_height,
+                            int src_stride,
+                            int dst_stride,
+                            const uint8_t* src_argb,
+                            uint8_t* dst_argb,
+                            int x,
+                            int dx,
+                            int y,
+                            int dy) {
+  int j;
+  void (*ScaleARGBCols)(uint8_t * dst_argb, const uint8_t* src_argb,
+                        int dst_width, int x, int dx) =
+      (src_width >= 32768) ? ScaleARGBCols64_C : ScaleARGBCols_C;
+  (void)src_height;
+#if defined(HAS_SCALEARGBCOLS_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
+    ScaleARGBCols = ScaleARGBCols_SSE2;
+  }
+#endif
+#if defined(HAS_SCALEARGBCOLS_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ScaleARGBCols = ScaleARGBCols_Any_NEON;
+    if (IS_ALIGNED(dst_width, 8)) {
+      ScaleARGBCols = ScaleARGBCols_NEON;
+    }
+  }
+#endif
+#if defined(HAS_SCALEARGBCOLS_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ScaleARGBCols = ScaleARGBCols_Any_MSA;
+    if (IS_ALIGNED(dst_width, 4)) {
+      ScaleARGBCols = ScaleARGBCols_MSA;
+    }
+  }
+#endif
+  if (src_width * 2 == dst_width && x < 0x8000) {
+    ScaleARGBCols = ScaleARGBColsUp2_C;
+#if defined(HAS_SCALEARGBCOLSUP2_SSE2)
+    if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
+      ScaleARGBCols = ScaleARGBColsUp2_SSE2;
+    }
+#endif
+  }
+
+  for (j = 0; j < dst_height; ++j) {
+    ScaleARGBCols(dst_argb, src_argb + (y >> 16) * src_stride, dst_width, x,
+                  dx);
+    dst_argb += dst_stride;
+    y += dy;
+  }
+}
+
+// Scale an ARGB image.
+// This function in turn calls a scaling function
+// suitable for handling the desired resolutions.
+static void ScaleARGB(const uint8_t* src,
+                      int src_stride,
+                      int src_width,
+                      int src_height,
+                      uint8_t* dst,
+                      int dst_stride,
+                      int dst_width,
+                      int dst_height,
+                      int clip_x,
+                      int clip_y,
+                      int clip_width,
+                      int clip_height,
+                      enum FilterMode filtering) {
+  // Initial source x/y coordinate and step values as 16.16 fixed point.
+  int x = 0;
+  int y = 0;
+  int dx = 0;
+  int dy = 0;
+  // ARGB does not support box filter yet, but allow the user to pass it.
+  // Simplify filtering when possible.
+  filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height,
+                                filtering);
+
+  // Negative src_height means invert the image.
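+  // Pointing src at the last row and negating src_stride makes the rest of
+  // the function read the source bottom-up, so the output comes out
+  // vertically flipped without any further special-casing.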
+ if (src_height < 0) { + src_height = -src_height; + src = src + (src_height - 1) * src_stride; + src_stride = -src_stride; + } + ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y, + &dx, &dy); + src_width = Abs(src_width); + if (clip_x) { + int64_t clipf = (int64_t)(clip_x)*dx; + x += (clipf & 0xffff); + src += (clipf >> 16) * 4; + dst += clip_x * 4; + } + if (clip_y) { + int64_t clipf = (int64_t)(clip_y)*dy; + y += (clipf & 0xffff); + src += (clipf >> 16) * src_stride; + dst += clip_y * dst_stride; + } + + // Special case for integer step values. + if (((dx | dy) & 0xffff) == 0) { + if (!dx || !dy) { // 1 pixel wide and/or tall. + filtering = kFilterNone; + } else { + // Optimized even scale down. ie 2, 4, 6, 8, 10x. + if (!(dx & 0x10000) && !(dy & 0x10000)) { + if (dx == 0x20000) { + // Optimized 1/2 downsample. + ScaleARGBDown2(src_width, src_height, clip_width, clip_height, + src_stride, dst_stride, src, dst, x, dx, y, dy, + filtering); + return; + } + if (dx == 0x40000 && filtering == kFilterBox) { + // Optimized 1/4 box downsample. + ScaleARGBDown4Box(src_width, src_height, clip_width, clip_height, + src_stride, dst_stride, src, dst, x, dx, y, dy); + return; + } + ScaleARGBDownEven(src_width, src_height, clip_width, clip_height, + src_stride, dst_stride, src, dst, x, dx, y, dy, + filtering); + return; + } + // Optimized odd scale down. ie 3, 5, 7, 9x. + if ((dx & 0x10000) && (dy & 0x10000)) { + filtering = kFilterNone; + if (dx == 0x10000 && dy == 0x10000) { + // Straight copy. + ARGBCopy(src + (y >> 16) * src_stride + (x >> 16) * 4, src_stride, + dst, dst_stride, clip_width, clip_height); + return; + } + } + } + } + if (dx == 0x10000 && (x & 0xffff) == 0) { + // Arbitrary scale vertically, but unscaled vertically. + ScalePlaneVertical(src_height, clip_width, clip_height, src_stride, + dst_stride, src, dst, x, y, dy, 4, filtering); + return; + } + if (filtering && dy < 65536) { + ScaleARGBBilinearUp(src_width, src_height, clip_width, clip_height, + src_stride, dst_stride, src, dst, x, dx, y, dy, + filtering); + return; + } + if (filtering) { + ScaleARGBBilinearDown(src_width, src_height, clip_width, clip_height, + src_stride, dst_stride, src, dst, x, dx, y, dy, + filtering); + return; + } + ScaleARGBSimple(src_width, src_height, clip_width, clip_height, src_stride, + dst_stride, src, dst, x, dx, y, dy); +} + +LIBYUV_API +int ARGBScaleClip(const uint8_t* src_argb, + int src_stride_argb, + int src_width, + int src_height, + uint8_t* dst_argb, + int dst_stride_argb, + int dst_width, + int dst_height, + int clip_x, + int clip_y, + int clip_width, + int clip_height, + enum FilterMode filtering) { + if (!src_argb || src_width == 0 || src_height == 0 || !dst_argb || + dst_width <= 0 || dst_height <= 0 || clip_x < 0 || clip_y < 0 || + clip_width > 32768 || clip_height > 32768 || + (clip_x + clip_width) > dst_width || + (clip_y + clip_height) > dst_height) { + return -1; + } + ScaleARGB(src_argb, src_stride_argb, src_width, src_height, dst_argb, + dst_stride_argb, dst_width, dst_height, clip_x, clip_y, clip_width, + clip_height, filtering); + return 0; +} + +// Scale an ARGB image. 
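As a usage sketch only (buffer pointers, strides and dimensions here are hypothetical, and the calls assume the public libyuv/scale_argb.h and libyuv/scale.h headers), the two entry points, ARGBScaleClip() above and ARGBScale() below, might be driven like this:

#include <stdint.h>

#include "libyuv/scale.h"
#include "libyuv/scale_argb.h"

// Shrink a 640x360 ARGB frame to 320x180, then rescale only a 160x90 clip
// rectangle of the same destination. Returns 0 on success, -1 on bad args.
static int ExampleScaleArgb(const uint8_t* src_argb, uint8_t* dst_argb) {
  int r = libyuv::ARGBScale(src_argb, 640 * 4, 640, 360,
                            dst_argb, 320 * 4, 320, 180,
                            libyuv::kFilterBilinear);
  if (r != 0) {
    return r;
  }
  // Only pixels inside the clip rectangle (0, 0, 160x90) are written.
  return libyuv::ARGBScaleClip(src_argb, 640 * 4, 640, 360,
                               dst_argb, 320 * 4, 320, 180,
                               0, 0, 160, 90, libyuv::kFilterBilinear);
}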
+LIBYUV_API +int ARGBScale(const uint8_t* src_argb, + int src_stride_argb, + int src_width, + int src_height, + uint8_t* dst_argb, + int dst_stride_argb, + int dst_width, + int dst_height, + enum FilterMode filtering) { + if (!src_argb || src_width == 0 || src_height == 0 || src_width > 32768 || + src_height > 32768 || !dst_argb || dst_width <= 0 || dst_height <= 0) { + return -1; + } + ScaleARGB(src_argb, src_stride_argb, src_width, src_height, dst_argb, + dst_stride_argb, dst_width, dst_height, 0, 0, dst_width, dst_height, + filtering); + return 0; +} + +// Scale with YUV conversion to ARGB and clipping. +LIBYUV_API +int YUVToARGBScaleClip(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint32_t src_fourcc, + int src_width, + int src_height, + uint8_t* dst_argb, + int dst_stride_argb, + uint32_t dst_fourcc, + int dst_width, + int dst_height, + int clip_x, + int clip_y, + int clip_width, + int clip_height, + enum FilterMode filtering) { + uint8_t* argb_buffer = (uint8_t*)malloc(src_width * src_height * 4); + int r; + (void)src_fourcc; // TODO(fbarchard): implement and/or assert. + (void)dst_fourcc; + I420ToARGB(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, + argb_buffer, src_width * 4, src_width, src_height); + + r = ARGBScaleClip(argb_buffer, src_width * 4, src_width, src_height, dst_argb, + dst_stride_argb, dst_width, dst_height, clip_x, clip_y, + clip_width, clip_height, filtering); + free(argb_buffer); + return r; +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/media/libyuv/libyuv/source/scale_common.cc b/media/libyuv/libyuv/source/scale_common.cc new file mode 100644 index 0000000000..b28d7da41f --- /dev/null +++ b/media/libyuv/libyuv/source/scale_common.cc @@ -0,0 +1,1323 @@ +/* + * Copyright 2013 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/scale.h" + +#include <assert.h> +#include <string.h> + +#include "libyuv/cpu_id.h" +#include "libyuv/planar_functions.h" // For CopyARGB +#include "libyuv/row.h" +#include "libyuv/scale_row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +static __inline int Abs(int v) { + return v >= 0 ? 
v : -v; +} + +// CPU agnostic row functions +void ScaleRowDown2_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + int x; + (void)src_stride; + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = src_ptr[1]; + dst[1] = src_ptr[3]; + dst += 2; + src_ptr += 4; + } + if (dst_width & 1) { + dst[0] = src_ptr[1]; + } +} + +void ScaleRowDown2_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width) { + int x; + (void)src_stride; + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = src_ptr[1]; + dst[1] = src_ptr[3]; + dst += 2; + src_ptr += 4; + } + if (dst_width & 1) { + dst[0] = src_ptr[1]; + } +} + +void ScaleRowDown2Linear_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + const uint8_t* s = src_ptr; + int x; + (void)src_stride; + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = (s[0] + s[1] + 1) >> 1; + dst[1] = (s[2] + s[3] + 1) >> 1; + dst += 2; + s += 4; + } + if (dst_width & 1) { + dst[0] = (s[0] + s[1] + 1) >> 1; + } +} + +void ScaleRowDown2Linear_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width) { + const uint16_t* s = src_ptr; + int x; + (void)src_stride; + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = (s[0] + s[1] + 1) >> 1; + dst[1] = (s[2] + s[3] + 1) >> 1; + dst += 2; + s += 4; + } + if (dst_width & 1) { + dst[0] = (s[0] + s[1] + 1) >> 1; + } +} + +void ScaleRowDown2Box_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + const uint8_t* s = src_ptr; + const uint8_t* t = src_ptr + src_stride; + int x; + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2; + dst[1] = (s[2] + s[3] + t[2] + t[3] + 2) >> 2; + dst += 2; + s += 4; + t += 4; + } + if (dst_width & 1) { + dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2; + } +} + +void ScaleRowDown2Box_Odd_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + const uint8_t* s = src_ptr; + const uint8_t* t = src_ptr + src_stride; + int x; + dst_width -= 1; + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2; + dst[1] = (s[2] + s[3] + t[2] + t[3] + 2) >> 2; + dst += 2; + s += 4; + t += 4; + } + if (dst_width & 1) { + dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2; + dst += 1; + s += 2; + t += 2; + } + dst[0] = (s[0] + t[0] + 1) >> 1; +} + +void ScaleRowDown2Box_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width) { + const uint16_t* s = src_ptr; + const uint16_t* t = src_ptr + src_stride; + int x; + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2; + dst[1] = (s[2] + s[3] + t[2] + t[3] + 2) >> 2; + dst += 2; + s += 4; + t += 4; + } + if (dst_width & 1) { + dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2; + } +} + +void ScaleRowDown4_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + int x; + (void)src_stride; + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = src_ptr[2]; + dst[1] = src_ptr[6]; + dst += 2; + src_ptr += 8; + } + if (dst_width & 1) { + dst[0] = src_ptr[2]; + } +} + +void ScaleRowDown4_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width) { + int x; + (void)src_stride; + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = src_ptr[2]; + dst[1] = src_ptr[6]; + dst += 2; + src_ptr += 8; + } + if (dst_width & 1) { + dst[0] = src_ptr[2]; + } +} + +void ScaleRowDown4Box_C(const uint8_t* src_ptr, + ptrdiff_t 
src_stride, + uint8_t* dst, + int dst_width) { + intptr_t stride = src_stride; + int x; + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] + + src_ptr[stride + 0] + src_ptr[stride + 1] + src_ptr[stride + 2] + + src_ptr[stride + 3] + src_ptr[stride * 2 + 0] + + src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2] + + src_ptr[stride * 2 + 3] + src_ptr[stride * 3 + 0] + + src_ptr[stride * 3 + 1] + src_ptr[stride * 3 + 2] + + src_ptr[stride * 3 + 3] + 8) >> + 4; + dst[1] = (src_ptr[4] + src_ptr[5] + src_ptr[6] + src_ptr[7] + + src_ptr[stride + 4] + src_ptr[stride + 5] + src_ptr[stride + 6] + + src_ptr[stride + 7] + src_ptr[stride * 2 + 4] + + src_ptr[stride * 2 + 5] + src_ptr[stride * 2 + 6] + + src_ptr[stride * 2 + 7] + src_ptr[stride * 3 + 4] + + src_ptr[stride * 3 + 5] + src_ptr[stride * 3 + 6] + + src_ptr[stride * 3 + 7] + 8) >> + 4; + dst += 2; + src_ptr += 8; + } + if (dst_width & 1) { + dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] + + src_ptr[stride + 0] + src_ptr[stride + 1] + src_ptr[stride + 2] + + src_ptr[stride + 3] + src_ptr[stride * 2 + 0] + + src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2] + + src_ptr[stride * 2 + 3] + src_ptr[stride * 3 + 0] + + src_ptr[stride * 3 + 1] + src_ptr[stride * 3 + 2] + + src_ptr[stride * 3 + 3] + 8) >> + 4; + } +} + +void ScaleRowDown4Box_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width) { + intptr_t stride = src_stride; + int x; + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] + + src_ptr[stride + 0] + src_ptr[stride + 1] + src_ptr[stride + 2] + + src_ptr[stride + 3] + src_ptr[stride * 2 + 0] + + src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2] + + src_ptr[stride * 2 + 3] + src_ptr[stride * 3 + 0] + + src_ptr[stride * 3 + 1] + src_ptr[stride * 3 + 2] + + src_ptr[stride * 3 + 3] + 8) >> + 4; + dst[1] = (src_ptr[4] + src_ptr[5] + src_ptr[6] + src_ptr[7] + + src_ptr[stride + 4] + src_ptr[stride + 5] + src_ptr[stride + 6] + + src_ptr[stride + 7] + src_ptr[stride * 2 + 4] + + src_ptr[stride * 2 + 5] + src_ptr[stride * 2 + 6] + + src_ptr[stride * 2 + 7] + src_ptr[stride * 3 + 4] + + src_ptr[stride * 3 + 5] + src_ptr[stride * 3 + 6] + + src_ptr[stride * 3 + 7] + 8) >> + 4; + dst += 2; + src_ptr += 8; + } + if (dst_width & 1) { + dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] + + src_ptr[stride + 0] + src_ptr[stride + 1] + src_ptr[stride + 2] + + src_ptr[stride + 3] + src_ptr[stride * 2 + 0] + + src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2] + + src_ptr[stride * 2 + 3] + src_ptr[stride * 3 + 0] + + src_ptr[stride * 3 + 1] + src_ptr[stride * 3 + 2] + + src_ptr[stride * 3 + 3] + 8) >> + 4; + } +} + +void ScaleRowDown34_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + int x; + (void)src_stride; + assert((dst_width % 3 == 0) && (dst_width > 0)); + for (x = 0; x < dst_width; x += 3) { + dst[0] = src_ptr[0]; + dst[1] = src_ptr[1]; + dst[2] = src_ptr[3]; + dst += 3; + src_ptr += 4; + } +} + +void ScaleRowDown34_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width) { + int x; + (void)src_stride; + assert((dst_width % 3 == 0) && (dst_width > 0)); + for (x = 0; x < dst_width; x += 3) { + dst[0] = src_ptr[0]; + dst[1] = src_ptr[1]; + dst[2] = src_ptr[3]; + dst += 3; + src_ptr += 4; + } +} + +// Filter rows 0 and 1 together, 3 : 1 +void ScaleRowDown34_0_Box_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* d, + 
int dst_width) { + const uint8_t* s = src_ptr; + const uint8_t* t = src_ptr + src_stride; + int x; + assert((dst_width % 3 == 0) && (dst_width > 0)); + for (x = 0; x < dst_width; x += 3) { + uint8_t a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; + uint8_t a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; + uint8_t a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; + uint8_t b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2; + uint8_t b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; + uint8_t b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; + d[0] = (a0 * 3 + b0 + 2) >> 2; + d[1] = (a1 * 3 + b1 + 2) >> 2; + d[2] = (a2 * 3 + b2 + 2) >> 2; + d += 3; + s += 4; + t += 4; + } +} + +void ScaleRowDown34_0_Box_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* d, + int dst_width) { + const uint16_t* s = src_ptr; + const uint16_t* t = src_ptr + src_stride; + int x; + assert((dst_width % 3 == 0) && (dst_width > 0)); + for (x = 0; x < dst_width; x += 3) { + uint16_t a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; + uint16_t a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; + uint16_t a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; + uint16_t b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2; + uint16_t b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; + uint16_t b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; + d[0] = (a0 * 3 + b0 + 2) >> 2; + d[1] = (a1 * 3 + b1 + 2) >> 2; + d[2] = (a2 * 3 + b2 + 2) >> 2; + d += 3; + s += 4; + t += 4; + } +} + +// Filter rows 1 and 2 together, 1 : 1 +void ScaleRowDown34_1_Box_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* d, + int dst_width) { + const uint8_t* s = src_ptr; + const uint8_t* t = src_ptr + src_stride; + int x; + assert((dst_width % 3 == 0) && (dst_width > 0)); + for (x = 0; x < dst_width; x += 3) { + uint8_t a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; + uint8_t a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; + uint8_t a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; + uint8_t b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2; + uint8_t b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; + uint8_t b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; + d[0] = (a0 + b0 + 1) >> 1; + d[1] = (a1 + b1 + 1) >> 1; + d[2] = (a2 + b2 + 1) >> 1; + d += 3; + s += 4; + t += 4; + } +} + +void ScaleRowDown34_1_Box_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* d, + int dst_width) { + const uint16_t* s = src_ptr; + const uint16_t* t = src_ptr + src_stride; + int x; + assert((dst_width % 3 == 0) && (dst_width > 0)); + for (x = 0; x < dst_width; x += 3) { + uint16_t a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; + uint16_t a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; + uint16_t a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; + uint16_t b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2; + uint16_t b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; + uint16_t b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; + d[0] = (a0 + b0 + 1) >> 1; + d[1] = (a1 + b1 + 1) >> 1; + d[2] = (a2 + b2 + 1) >> 1; + d += 3; + s += 4; + t += 4; + } +} + +// Scales a single row of pixels using point sampling. +void ScaleCols_C(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx) { + int j; + for (j = 0; j < dst_width - 1; j += 2) { + dst_ptr[0] = src_ptr[x >> 16]; + x += dx; + dst_ptr[1] = src_ptr[x >> 16]; + x += dx; + dst_ptr += 2; + } + if (dst_width & 1) { + dst_ptr[0] = src_ptr[x >> 16]; + } +} + +void ScaleCols_16_C(uint16_t* dst_ptr, + const uint16_t* src_ptr, + int dst_width, + int x, + int dx) { + int j; + for (j = 0; j < dst_width - 1; j += 2) { + dst_ptr[0] = src_ptr[x >> 16]; + x += dx; + dst_ptr[1] = src_ptr[x >> 16]; + x += dx; + dst_ptr += 2; + } + if (dst_width & 1) { + dst_ptr[0] = src_ptr[x >> 16]; + } +} + +// Scales a single row of pixels up by 2x using point sampling. 
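The column scalers above step through the source with the same 16.16 fixed-point scheme used throughout this file. A minimal standalone sketch of that stepping for a point-sampled row (illustration only, not libyuv code, names are made up) looks like this:

#include <stdint.h>

// dx carries src_width / dst_width in 16.16 fixed point; x >> 16 is the
// source index and the low 16 bits accumulate the fractional position.
static void PointSampleRow(const uint8_t* src, int src_width,
                           uint8_t* dst, int dst_width) {
  int dx = (int)(((int64_t)src_width << 16) / dst_width);  // like FixedDiv()
  int x = 0;  // the real code centers this via ScaleSlope()
  for (int j = 0; j < dst_width; ++j) {
    dst[j] = src[x >> 16];
    x += dx;
  }
}

The ScaleColsUp2_C and ScaleColsUp2_16_C routines below special-case the exact 2x upsample, where no stepping is needed and each source pixel is simply written twice.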
+void ScaleColsUp2_C(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx) { + int j; + (void)x; + (void)dx; + for (j = 0; j < dst_width - 1; j += 2) { + dst_ptr[1] = dst_ptr[0] = src_ptr[0]; + src_ptr += 1; + dst_ptr += 2; + } + if (dst_width & 1) { + dst_ptr[0] = src_ptr[0]; + } +} + +void ScaleColsUp2_16_C(uint16_t* dst_ptr, + const uint16_t* src_ptr, + int dst_width, + int x, + int dx) { + int j; + (void)x; + (void)dx; + for (j = 0; j < dst_width - 1; j += 2) { + dst_ptr[1] = dst_ptr[0] = src_ptr[0]; + src_ptr += 1; + dst_ptr += 2; + } + if (dst_width & 1) { + dst_ptr[0] = src_ptr[0]; + } +} + +// (1-f)a + fb can be replaced with a + f(b-a) +#if defined(__arm__) || defined(__aarch64__) +#define BLENDER(a, b, f) \ + (uint8_t)((int)(a) + ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16)) +#else +// Intel uses 7 bit math with rounding. +#define BLENDER(a, b, f) \ + (uint8_t)((int)(a) + (((int)((f) >> 9) * ((int)(b) - (int)(a)) + 0x40) >> 7)) +#endif + +void ScaleFilterCols_C(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx) { + int j; + for (j = 0; j < dst_width - 1; j += 2) { + int xi = x >> 16; + int a = src_ptr[xi]; + int b = src_ptr[xi + 1]; + dst_ptr[0] = BLENDER(a, b, x & 0xffff); + x += dx; + xi = x >> 16; + a = src_ptr[xi]; + b = src_ptr[xi + 1]; + dst_ptr[1] = BLENDER(a, b, x & 0xffff); + x += dx; + dst_ptr += 2; + } + if (dst_width & 1) { + int xi = x >> 16; + int a = src_ptr[xi]; + int b = src_ptr[xi + 1]; + dst_ptr[0] = BLENDER(a, b, x & 0xffff); + } +} + +void ScaleFilterCols64_C(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x32, + int dx) { + int64_t x = (int64_t)(x32); + int j; + for (j = 0; j < dst_width - 1; j += 2) { + int64_t xi = x >> 16; + int a = src_ptr[xi]; + int b = src_ptr[xi + 1]; + dst_ptr[0] = BLENDER(a, b, x & 0xffff); + x += dx; + xi = x >> 16; + a = src_ptr[xi]; + b = src_ptr[xi + 1]; + dst_ptr[1] = BLENDER(a, b, x & 0xffff); + x += dx; + dst_ptr += 2; + } + if (dst_width & 1) { + int64_t xi = x >> 16; + int a = src_ptr[xi]; + int b = src_ptr[xi + 1]; + dst_ptr[0] = BLENDER(a, b, x & 0xffff); + } +} +#undef BLENDER + +// Same as 8 bit arm blender but return is cast to uint16_t +#define BLENDER(a, b, f) \ + (uint16_t)((int)(a) + ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16)) + +void ScaleFilterCols_16_C(uint16_t* dst_ptr, + const uint16_t* src_ptr, + int dst_width, + int x, + int dx) { + int j; + for (j = 0; j < dst_width - 1; j += 2) { + int xi = x >> 16; + int a = src_ptr[xi]; + int b = src_ptr[xi + 1]; + dst_ptr[0] = BLENDER(a, b, x & 0xffff); + x += dx; + xi = x >> 16; + a = src_ptr[xi]; + b = src_ptr[xi + 1]; + dst_ptr[1] = BLENDER(a, b, x & 0xffff); + x += dx; + dst_ptr += 2; + } + if (dst_width & 1) { + int xi = x >> 16; + int a = src_ptr[xi]; + int b = src_ptr[xi + 1]; + dst_ptr[0] = BLENDER(a, b, x & 0xffff); + } +} + +void ScaleFilterCols64_16_C(uint16_t* dst_ptr, + const uint16_t* src_ptr, + int dst_width, + int x32, + int dx) { + int64_t x = (int64_t)(x32); + int j; + for (j = 0; j < dst_width - 1; j += 2) { + int64_t xi = x >> 16; + int a = src_ptr[xi]; + int b = src_ptr[xi + 1]; + dst_ptr[0] = BLENDER(a, b, x & 0xffff); + x += dx; + xi = x >> 16; + a = src_ptr[xi]; + b = src_ptr[xi + 1]; + dst_ptr[1] = BLENDER(a, b, x & 0xffff); + x += dx; + dst_ptr += 2; + } + if (dst_width & 1) { + int64_t xi = x >> 16; + int a = src_ptr[xi]; + int b = src_ptr[xi + 1]; + dst_ptr[0] = BLENDER(a, b, x & 0xffff); + } +} +#undef BLENDER + +void ScaleRowDown38_C(const 
uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + int x; + (void)src_stride; + assert(dst_width % 3 == 0); + for (x = 0; x < dst_width; x += 3) { + dst[0] = src_ptr[0]; + dst[1] = src_ptr[3]; + dst[2] = src_ptr[6]; + dst += 3; + src_ptr += 8; + } +} + +void ScaleRowDown38_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width) { + int x; + (void)src_stride; + assert(dst_width % 3 == 0); + for (x = 0; x < dst_width; x += 3) { + dst[0] = src_ptr[0]; + dst[1] = src_ptr[3]; + dst[2] = src_ptr[6]; + dst += 3; + src_ptr += 8; + } +} + +// 8x3 -> 3x1 +void ScaleRowDown38_3_Box_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + intptr_t stride = src_stride; + int i; + assert((dst_width % 3 == 0) && (dst_width > 0)); + for (i = 0; i < dst_width; i += 3) { + dst_ptr[0] = + (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[stride + 0] + + src_ptr[stride + 1] + src_ptr[stride + 2] + src_ptr[stride * 2 + 0] + + src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) * + (65536 / 9) >> + 16; + dst_ptr[1] = + (src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[stride + 3] + + src_ptr[stride + 4] + src_ptr[stride + 5] + src_ptr[stride * 2 + 3] + + src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) * + (65536 / 9) >> + 16; + dst_ptr[2] = + (src_ptr[6] + src_ptr[7] + src_ptr[stride + 6] + src_ptr[stride + 7] + + src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) * + (65536 / 6) >> + 16; + src_ptr += 8; + dst_ptr += 3; + } +} + +void ScaleRowDown38_3_Box_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst_ptr, + int dst_width) { + intptr_t stride = src_stride; + int i; + assert((dst_width % 3 == 0) && (dst_width > 0)); + for (i = 0; i < dst_width; i += 3) { + dst_ptr[0] = + (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[stride + 0] + + src_ptr[stride + 1] + src_ptr[stride + 2] + src_ptr[stride * 2 + 0] + + src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) * + (65536 / 9) >> + 16; + dst_ptr[1] = + (src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[stride + 3] + + src_ptr[stride + 4] + src_ptr[stride + 5] + src_ptr[stride * 2 + 3] + + src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) * + (65536 / 9) >> + 16; + dst_ptr[2] = + (src_ptr[6] + src_ptr[7] + src_ptr[stride + 6] + src_ptr[stride + 7] + + src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) * + (65536 / 6) >> + 16; + src_ptr += 8; + dst_ptr += 3; + } +} + +// 8x2 -> 3x1 +void ScaleRowDown38_2_Box_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + intptr_t stride = src_stride; + int i; + assert((dst_width % 3 == 0) && (dst_width > 0)); + for (i = 0; i < dst_width; i += 3) { + dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[stride + 0] + + src_ptr[stride + 1] + src_ptr[stride + 2]) * + (65536 / 6) >> + 16; + dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[stride + 3] + + src_ptr[stride + 4] + src_ptr[stride + 5]) * + (65536 / 6) >> + 16; + dst_ptr[2] = + (src_ptr[6] + src_ptr[7] + src_ptr[stride + 6] + src_ptr[stride + 7]) * + (65536 / 4) >> + 16; + src_ptr += 8; + dst_ptr += 3; + } +} + +void ScaleRowDown38_2_Box_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst_ptr, + int dst_width) { + intptr_t stride = src_stride; + int i; + assert((dst_width % 3 == 0) && (dst_width > 0)); + for (i = 0; i < dst_width; i += 3) { + dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[stride + 0] + + src_ptr[stride + 1] + src_ptr[stride + 2]) * + (65536 / 6) >> + 16; + 
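+    // Multiplying the sum by (65536 / N) and shifting right by 16 is a
+    // fixed-point average: e.g. six samples times 10922 (65536 / 6), shifted
+    // down 16 bits, approximates sum / 6 without an integer divide.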
dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[stride + 3] + + src_ptr[stride + 4] + src_ptr[stride + 5]) * + (65536 / 6) >> + 16; + dst_ptr[2] = + (src_ptr[6] + src_ptr[7] + src_ptr[stride + 6] + src_ptr[stride + 7]) * + (65536 / 4) >> + 16; + src_ptr += 8; + dst_ptr += 3; + } +} + +void ScaleAddRow_C(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) { + int x; + assert(src_width > 0); + for (x = 0; x < src_width - 1; x += 2) { + dst_ptr[0] += src_ptr[0]; + dst_ptr[1] += src_ptr[1]; + src_ptr += 2; + dst_ptr += 2; + } + if (src_width & 1) { + dst_ptr[0] += src_ptr[0]; + } +} + +void ScaleAddRow_16_C(const uint16_t* src_ptr, + uint32_t* dst_ptr, + int src_width) { + int x; + assert(src_width > 0); + for (x = 0; x < src_width - 1; x += 2) { + dst_ptr[0] += src_ptr[0]; + dst_ptr[1] += src_ptr[1]; + src_ptr += 2; + dst_ptr += 2; + } + if (src_width & 1) { + dst_ptr[0] += src_ptr[0]; + } +} + +void ScaleARGBRowDown2_C(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width) { + const uint32_t* src = (const uint32_t*)(src_argb); + uint32_t* dst = (uint32_t*)(dst_argb); + int x; + (void)src_stride; + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = src[1]; + dst[1] = src[3]; + src += 4; + dst += 2; + } + if (dst_width & 1) { + dst[0] = src[1]; + } +} + +void ScaleARGBRowDown2Linear_C(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width) { + int x; + (void)src_stride; + for (x = 0; x < dst_width; ++x) { + dst_argb[0] = (src_argb[0] + src_argb[4] + 1) >> 1; + dst_argb[1] = (src_argb[1] + src_argb[5] + 1) >> 1; + dst_argb[2] = (src_argb[2] + src_argb[6] + 1) >> 1; + dst_argb[3] = (src_argb[3] + src_argb[7] + 1) >> 1; + src_argb += 8; + dst_argb += 4; + } +} + +void ScaleARGBRowDown2Box_C(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width) { + int x; + for (x = 0; x < dst_width; ++x) { + dst_argb[0] = (src_argb[0] + src_argb[4] + src_argb[src_stride] + + src_argb[src_stride + 4] + 2) >> + 2; + dst_argb[1] = (src_argb[1] + src_argb[5] + src_argb[src_stride + 1] + + src_argb[src_stride + 5] + 2) >> + 2; + dst_argb[2] = (src_argb[2] + src_argb[6] + src_argb[src_stride + 2] + + src_argb[src_stride + 6] + 2) >> + 2; + dst_argb[3] = (src_argb[3] + src_argb[7] + src_argb[src_stride + 3] + + src_argb[src_stride + 7] + 2) >> + 2; + src_argb += 8; + dst_argb += 4; + } +} + +void ScaleARGBRowDownEven_C(const uint8_t* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_argb, + int dst_width) { + const uint32_t* src = (const uint32_t*)(src_argb); + uint32_t* dst = (uint32_t*)(dst_argb); + (void)src_stride; + int x; + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = src[0]; + dst[1] = src[src_stepx]; + src += src_stepx * 2; + dst += 2; + } + if (dst_width & 1) { + dst[0] = src[0]; + } +} + +void ScaleARGBRowDownEvenBox_C(const uint8_t* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_argb, + int dst_width) { + int x; + for (x = 0; x < dst_width; ++x) { + dst_argb[0] = (src_argb[0] + src_argb[4] + src_argb[src_stride] + + src_argb[src_stride + 4] + 2) >> + 2; + dst_argb[1] = (src_argb[1] + src_argb[5] + src_argb[src_stride + 1] + + src_argb[src_stride + 5] + 2) >> + 2; + dst_argb[2] = (src_argb[2] + src_argb[6] + src_argb[src_stride + 2] + + src_argb[src_stride + 6] + 2) >> + 2; + dst_argb[3] = (src_argb[3] + src_argb[7] + src_argb[src_stride + 3] + + src_argb[src_stride + 7] + 2) >> + 2; + src_argb += src_stepx * 4; + dst_argb += 4; + } +} + +// Scales a 
single row of pixels using point sampling. +void ScaleARGBCols_C(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx) { + const uint32_t* src = (const uint32_t*)(src_argb); + uint32_t* dst = (uint32_t*)(dst_argb); + int j; + for (j = 0; j < dst_width - 1; j += 2) { + dst[0] = src[x >> 16]; + x += dx; + dst[1] = src[x >> 16]; + x += dx; + dst += 2; + } + if (dst_width & 1) { + dst[0] = src[x >> 16]; + } +} + +void ScaleARGBCols64_C(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x32, + int dx) { + int64_t x = (int64_t)(x32); + const uint32_t* src = (const uint32_t*)(src_argb); + uint32_t* dst = (uint32_t*)(dst_argb); + int j; + for (j = 0; j < dst_width - 1; j += 2) { + dst[0] = src[x >> 16]; + x += dx; + dst[1] = src[x >> 16]; + x += dx; + dst += 2; + } + if (dst_width & 1) { + dst[0] = src[x >> 16]; + } +} + +// Scales a single row of pixels up by 2x using point sampling. +void ScaleARGBColsUp2_C(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx) { + const uint32_t* src = (const uint32_t*)(src_argb); + uint32_t* dst = (uint32_t*)(dst_argb); + int j; + (void)x; + (void)dx; + for (j = 0; j < dst_width - 1; j += 2) { + dst[1] = dst[0] = src[0]; + src += 1; + dst += 2; + } + if (dst_width & 1) { + dst[0] = src[0]; + } +} + +// TODO(fbarchard): Replace 0x7f ^ f with 128-f. bug=607. +// Mimics SSSE3 blender +#define BLENDER1(a, b, f) ((a) * (0x7f ^ f) + (b)*f) >> 7 +#define BLENDERC(a, b, f, s) \ + (uint32_t)(BLENDER1(((a) >> s) & 255, ((b) >> s) & 255, f) << s) +#define BLENDER(a, b, f) \ + BLENDERC(a, b, f, 24) | BLENDERC(a, b, f, 16) | BLENDERC(a, b, f, 8) | \ + BLENDERC(a, b, f, 0) + +void ScaleARGBFilterCols_C(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx) { + const uint32_t* src = (const uint32_t*)(src_argb); + uint32_t* dst = (uint32_t*)(dst_argb); + int j; + for (j = 0; j < dst_width - 1; j += 2) { + int xi = x >> 16; + int xf = (x >> 9) & 0x7f; + uint32_t a = src[xi]; + uint32_t b = src[xi + 1]; + dst[0] = BLENDER(a, b, xf); + x += dx; + xi = x >> 16; + xf = (x >> 9) & 0x7f; + a = src[xi]; + b = src[xi + 1]; + dst[1] = BLENDER(a, b, xf); + x += dx; + dst += 2; + } + if (dst_width & 1) { + int xi = x >> 16; + int xf = (x >> 9) & 0x7f; + uint32_t a = src[xi]; + uint32_t b = src[xi + 1]; + dst[0] = BLENDER(a, b, xf); + } +} + +void ScaleARGBFilterCols64_C(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x32, + int dx) { + int64_t x = (int64_t)(x32); + const uint32_t* src = (const uint32_t*)(src_argb); + uint32_t* dst = (uint32_t*)(dst_argb); + int j; + for (j = 0; j < dst_width - 1; j += 2) { + int64_t xi = x >> 16; + int xf = (x >> 9) & 0x7f; + uint32_t a = src[xi]; + uint32_t b = src[xi + 1]; + dst[0] = BLENDER(a, b, xf); + x += dx; + xi = x >> 16; + xf = (x >> 9) & 0x7f; + a = src[xi]; + b = src[xi + 1]; + dst[1] = BLENDER(a, b, xf); + x += dx; + dst += 2; + } + if (dst_width & 1) { + int64_t xi = x >> 16; + int xf = (x >> 9) & 0x7f; + uint32_t a = src[xi]; + uint32_t b = src[xi + 1]; + dst[0] = BLENDER(a, b, xf); + } +} +#undef BLENDER1 +#undef BLENDERC +#undef BLENDER + +// Scale plane vertically with bilinear interpolation. +void ScalePlaneVertical(int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_argb, + uint8_t* dst_argb, + int x, + int y, + int dy, + int bpp, + enum FilterMode filtering) { + // TODO(fbarchard): Allow higher bpp. 
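+  // y is a 16.16 source coordinate: yi = y >> 16 selects the upper of the
+  // two source rows and yf = (y >> 8) & 255 is the 8-bit fraction passed to
+  // InterpolateRow (0 copies row yi, larger values blend toward row yi + 1).
+  // For src_height > 1, max_y is ((src_height - 1) << 16) - 1, which keeps
+  // yi at or below src_height - 2 so the second row read stays in bounds.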
+  int dst_width_bytes = dst_width * bpp;
+  void (*InterpolateRow)(uint8_t * dst_argb, const uint8_t* src_argb,
+                         ptrdiff_t src_stride, int dst_width,
+                         int source_y_fraction) = InterpolateRow_C;
+  const int max_y = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0;
+  int j;
+  assert(bpp >= 1 && bpp <= 4);
+  assert(src_height != 0);
+  assert(dst_width > 0);
+  assert(dst_height > 0);
+  src_argb += (x >> 16) * bpp;
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    InterpolateRow = InterpolateRow_Any_SSSE3;
+    if (IS_ALIGNED(dst_width_bytes, 16)) {
+      InterpolateRow = InterpolateRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    InterpolateRow = InterpolateRow_Any_AVX2;
+    if (IS_ALIGNED(dst_width_bytes, 32)) {
+      InterpolateRow = InterpolateRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    InterpolateRow = InterpolateRow_Any_NEON;
+    if (IS_ALIGNED(dst_width_bytes, 16)) {
+      InterpolateRow = InterpolateRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    InterpolateRow = InterpolateRow_Any_MSA;
+    if (IS_ALIGNED(dst_width_bytes, 32)) {
+      InterpolateRow = InterpolateRow_MSA;
+    }
+  }
+#endif
+  for (j = 0; j < dst_height; ++j) {
+    int yi;
+    int yf;
+    if (y > max_y) {
+      y = max_y;
+    }
+    yi = y >> 16;
+    yf = filtering ? ((y >> 8) & 255) : 0;
+    InterpolateRow(dst_argb, src_argb + yi * src_stride, src_stride,
+                   dst_width_bytes, yf);
+    dst_argb += dst_stride;
+    y += dy;
+  }
+}
+void ScalePlaneVertical_16(int src_height,
+                           int dst_width,
+                           int dst_height,
+                           int src_stride,
+                           int dst_stride,
+                           const uint16_t* src_argb,
+                           uint16_t* dst_argb,
+                           int x,
+                           int y,
+                           int dy,
+                           int wpp,
+                           enum FilterMode filtering) {
+  // TODO(fbarchard): Allow higher wpp.
+  int dst_width_words = dst_width * wpp;
+  void (*InterpolateRow)(uint16_t * dst_argb, const uint16_t* src_argb,
+                         ptrdiff_t src_stride, int dst_width,
+                         int source_y_fraction) = InterpolateRow_16_C;
+  const int max_y = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0;
+  int j;
+  assert(wpp >= 1 && wpp <= 2);
+  assert(src_height != 0);
+  assert(dst_width > 0);
+  assert(dst_height > 0);
+  src_argb += (x >> 16) * wpp;
+#if defined(HAS_INTERPOLATEROW_16_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    InterpolateRow = InterpolateRow_Any_16_SSE2;
+    if (IS_ALIGNED(dst_width_words, 16)) {
+      InterpolateRow = InterpolateRow_16_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    InterpolateRow = InterpolateRow_Any_16_SSSE3;
+    if (IS_ALIGNED(dst_width_words, 16)) {
+      InterpolateRow = InterpolateRow_16_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    InterpolateRow = InterpolateRow_Any_16_AVX2;
+    if (IS_ALIGNED(dst_width_words, 32)) {
+      InterpolateRow = InterpolateRow_16_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    InterpolateRow = InterpolateRow_Any_16_NEON;
+    if (IS_ALIGNED(dst_width_words, 16)) {
+      InterpolateRow = InterpolateRow_16_NEON;
+    }
+  }
+#endif
+  for (j = 0; j < dst_height; ++j) {
+    int yi;
+    int yf;
+    if (y > max_y) {
+      y = max_y;
+    }
+    yi = y >> 16;
+    yf = filtering ? ((y >> 8) & 255) : 0;
+    InterpolateRow(dst_argb, src_argb + yi * src_stride, src_stride,
+                   dst_width_words, yf);
+    dst_argb += dst_stride;
+    y += dy;
+  }
+}
+
+// Simplify the filtering based on scale factors.
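+// For example, ScaleFilterReduce() below turns:
+//   1920x1080 -> 960x540 with kFilterBox into kFilterBilinear
+//     (both axes are scaled to 1/2 or larger, so bilinear is sufficient);
+//   1280x720 -> 640x720 with kFilterBilinear into kFilterLinear
+//     (the height is unscaled, so no vertical filtering is needed);
+//   640x480 -> 640x120 with kFilterLinear into kFilterNone
+//     (the width is unscaled, so no horizontal filtering is needed).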
+enum FilterMode ScaleFilterReduce(int src_width, + int src_height, + int dst_width, + int dst_height, + enum FilterMode filtering) { + if (src_width < 0) { + src_width = -src_width; + } + if (src_height < 0) { + src_height = -src_height; + } + if (filtering == kFilterBox) { + // If scaling both axis to 0.5 or larger, switch from Box to Bilinear. + if (dst_width * 2 >= src_width && dst_height * 2 >= src_height) { + filtering = kFilterBilinear; + } + } + if (filtering == kFilterBilinear) { + if (src_height == 1) { + filtering = kFilterLinear; + } + // TODO(fbarchard): Detect any odd scale factor and reduce to Linear. + if (dst_height == src_height || dst_height * 3 == src_height) { + filtering = kFilterLinear; + } + // TODO(fbarchard): Remove 1 pixel wide filter restriction, which is to + // avoid reading 2 pixels horizontally that causes memory exception. + if (src_width == 1) { + filtering = kFilterNone; + } + } + if (filtering == kFilterLinear) { + if (src_width == 1) { + filtering = kFilterNone; + } + // TODO(fbarchard): Detect any odd scale factor and reduce to None. + if (dst_width == src_width || dst_width * 3 == src_width) { + filtering = kFilterNone; + } + } + return filtering; +} + +// Divide num by div and return as 16.16 fixed point result. +int FixedDiv_C(int num, int div) { + return (int)(((int64_t)(num) << 16) / div); +} + +// Divide num by div and return as 16.16 fixed point result. +int FixedDiv1_C(int num, int div) { + return (int)((((int64_t)(num) << 16) - 0x00010001) / (div - 1)); +} + +#define CENTERSTART(dx, s) (dx < 0) ? -((-dx >> 1) + s) : ((dx >> 1) + s) + +// Compute slope values for stepping. +void ScaleSlope(int src_width, + int src_height, + int dst_width, + int dst_height, + enum FilterMode filtering, + int* x, + int* y, + int* dx, + int* dy) { + assert(x != NULL); + assert(y != NULL); + assert(dx != NULL); + assert(dy != NULL); + assert(src_width != 0); + assert(src_height != 0); + assert(dst_width > 0); + assert(dst_height > 0); + // Check for 1 pixel and avoid FixedDiv overflow. + if (dst_width == 1 && src_width >= 32768) { + dst_width = src_width; + } + if (dst_height == 1 && src_height >= 32768) { + dst_height = src_height; + } + if (filtering == kFilterBox) { + // Scale step for point sampling duplicates all pixels equally. + *dx = FixedDiv(Abs(src_width), dst_width); + *dy = FixedDiv(src_height, dst_height); + *x = 0; + *y = 0; + } else if (filtering == kFilterBilinear) { + // Scale step for bilinear sampling renders last pixel once for upsample. + if (dst_width <= Abs(src_width)) { + *dx = FixedDiv(Abs(src_width), dst_width); + *x = CENTERSTART(*dx, -32768); // Subtract 0.5 (32768) to center filter. + } else if (dst_width > 1) { + *dx = FixedDiv1(Abs(src_width), dst_width); + *x = 0; + } + if (dst_height <= src_height) { + *dy = FixedDiv(src_height, dst_height); + *y = CENTERSTART(*dy, -32768); // Subtract 0.5 (32768) to center filter. + } else if (dst_height > 1) { + *dy = FixedDiv1(src_height, dst_height); + *y = 0; + } + } else if (filtering == kFilterLinear) { + // Scale step for bilinear sampling renders last pixel once for upsample. + if (dst_width <= Abs(src_width)) { + *dx = FixedDiv(Abs(src_width), dst_width); + *x = CENTERSTART(*dx, -32768); // Subtract 0.5 (32768) to center filter. + } else if (dst_width > 1) { + *dx = FixedDiv1(Abs(src_width), dst_width); + *x = 0; + } + *dy = FixedDiv(src_height, dst_height); + *y = *dy >> 1; + } else { + // Scale step for point sampling duplicates all pixels equally. 
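+    // e.g. 640 -> 320 gives dx = 0x20000 (2.0) and a start of x = 0x10000
+    // (1.0), so the sampled source columns are 1, 3, 5, ..., i.e. the same
+    // odd pixels that ScaleRowDown2_C reads directly.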
+ *dx = FixedDiv(Abs(src_width), dst_width); + *dy = FixedDiv(src_height, dst_height); + *x = CENTERSTART(*dx, 0); + *y = CENTERSTART(*dy, 0); + } + // Negative src_width means horizontally mirror. + if (src_width < 0) { + *x += (dst_width - 1) * *dx; + *dx = -*dx; + // src_width = -src_width; // Caller must do this. + } +} +#undef CENTERSTART + +// Read 8x2 upsample with filtering and write 16x1. +// actually reads an extra pixel, so 9x2. +void ScaleRowUp2_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width) { + const uint16_t* src2 = src_ptr + src_stride; + + int x; + for (x = 0; x < dst_width - 1; x += 2) { + uint16_t p0 = src_ptr[0]; + uint16_t p1 = src_ptr[1]; + uint16_t p2 = src2[0]; + uint16_t p3 = src2[1]; + dst[0] = (p0 * 9 + p1 * 3 + p2 * 3 + p3 + 8) >> 4; + dst[1] = (p0 * 3 + p1 * 9 + p2 + p3 * 3 + 8) >> 4; + ++src_ptr; + ++src2; + dst += 2; + } + if (dst_width & 1) { + uint16_t p0 = src_ptr[0]; + uint16_t p1 = src_ptr[1]; + uint16_t p2 = src2[0]; + uint16_t p3 = src2[1]; + dst[0] = (p0 * 9 + p1 * 3 + p2 * 3 + p3 + 8) >> 4; + } +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/media/libyuv/libyuv/source/scale_gcc.cc b/media/libyuv/libyuv/source/scale_gcc.cc new file mode 100644 index 0000000000..312236d2df --- /dev/null +++ b/media/libyuv/libyuv/source/scale_gcc.cc @@ -0,0 +1,1374 @@ +/* + * Copyright 2013 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/row.h" +#include "libyuv/scale_row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// This module is for GCC x86 and x64. +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) + +// Offsets for source bytes 0 to 9 +static const uvec8 kShuf0 = {0, 1, 3, 4, 5, 7, 8, 9, + 128, 128, 128, 128, 128, 128, 128, 128}; + +// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12. +static const uvec8 kShuf1 = {3, 4, 5, 7, 8, 9, 11, 12, + 128, 128, 128, 128, 128, 128, 128, 128}; + +// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. +static const uvec8 kShuf2 = {5, 7, 8, 9, 11, 12, 13, 15, + 128, 128, 128, 128, 128, 128, 128, 128}; + +// Offsets for source bytes 0 to 10 +static const uvec8 kShuf01 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10}; + +// Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13. +static const uvec8 kShuf11 = {2, 3, 4, 5, 5, 6, 6, 7, + 8, 9, 9, 10, 10, 11, 12, 13}; + +// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. 
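To read the kShuf* tables above (and kShuf21 just below), it helps to keep the pshufb byte-shuffle semantics in mind. A scalar model, written here purely for illustration and not part of the patch, is:

#include <stdint.h>

// Scalar model of the SSSE3 pshufb shuffle that consumes these masks: a mask
// byte with the high bit set (128) zeroes the output byte, otherwise its low
// 4 bits select a byte from the 16-byte source register.
static void PshufbModel(const uint8_t src[16], const uint8_t mask[16],
                        uint8_t dst[16]) {
  for (int i = 0; i < 16; ++i) {
    dst[i] = (mask[i] & 0x80) ? 0 : src[mask[i] & 0x0f];
  }
}

That is why the tails of the kShuf* tables are padded with 128: the unused upper bytes of the result are forced to zero. The kMadd* tables that follow are pmaddubsw weights carrying the same 3:1, 2:2 and 1:3 taps as the ScaleRowDown34_*_Box_C filters earlier in this change.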
+static const uvec8 kShuf21 = {5, 6, 6, 7, 8, 9, 9, 10, + 10, 11, 12, 13, 13, 14, 14, 15}; + +// Coefficients for source bytes 0 to 10 +static const uvec8 kMadd01 = {3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2}; + +// Coefficients for source bytes 10 to 21 +static const uvec8 kMadd11 = {1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1}; + +// Coefficients for source bytes 21 to 31 +static const uvec8 kMadd21 = {2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3}; + +// Coefficients for source bytes 21 to 31 +static const vec16 kRound34 = {2, 2, 2, 2, 2, 2, 2, 2}; + +static const uvec8 kShuf38a = {0, 3, 6, 8, 11, 14, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128}; + +static const uvec8 kShuf38b = {128, 128, 128, 128, 128, 128, 0, 3, + 6, 8, 11, 14, 128, 128, 128, 128}; + +// Arrange words 0,3,6 into 0,1,2 +static const uvec8 kShufAc = {0, 1, 6, 7, 12, 13, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128}; + +// Arrange words 0,3,6 into 3,4,5 +static const uvec8 kShufAc3 = {128, 128, 128, 128, 128, 128, 0, 1, + 6, 7, 12, 13, 128, 128, 128, 128}; + +// Scaling values for boxes of 3x3 and 2x3 +static const uvec16 kScaleAc33 = {65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, + 65536 / 9, 65536 / 6, 0, 0}; + +// Arrange first value for pixels 0,1,2,3,4,5 +static const uvec8 kShufAb0 = {0, 128, 3, 128, 6, 128, 8, 128, + 11, 128, 14, 128, 128, 128, 128, 128}; + +// Arrange second value for pixels 0,1,2,3,4,5 +static const uvec8 kShufAb1 = {1, 128, 4, 128, 7, 128, 9, 128, + 12, 128, 15, 128, 128, 128, 128, 128}; + +// Arrange third value for pixels 0,1,2,3,4,5 +static const uvec8 kShufAb2 = {2, 128, 5, 128, 128, 128, 10, 128, + 13, 128, 128, 128, 128, 128, 128, 128}; + +// Scaling values for boxes of 3x2 and 2x2 +static const uvec16 kScaleAb2 = {65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, + 65536 / 3, 65536 / 2, 0, 0}; + +// GCC versions of row functions are verbatim conversions from Visual C. +// Generated using gcc disassembly on Visual C object file: +// objdump -D yuvscaler.obj >yuvscaler.txt + +void ScaleRowDown2_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + (void)src_stride; + asm volatile( + // 16 pixel loop. 
+ LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1"); +} + +void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + (void)src_stride; + asm volatile( + "pcmpeqb %%xmm4,%%xmm4 \n" + "psrlw $0xf,%%xmm4 \n" + "packuswb %%xmm4,%%xmm4 \n" + "pxor %%xmm5,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pavgw %%xmm5,%%xmm0 \n" + "pavgw %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1", "xmm4", "xmm5"); +} + +void ScaleRowDown2Box_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + asm volatile( + "pcmpeqb %%xmm4,%%xmm4 \n" + "psrlw $0xf,%%xmm4 \n" + "packuswb %%xmm4,%%xmm4 \n" + "pxor %%xmm5,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x00(%0,%3,1),%%xmm2 \n" + "movdqu 0x10(%0,%3,1),%%xmm3 \n" + "lea 0x20(%0),%0 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "paddw %%xmm2,%%xmm0 \n" + "paddw %%xmm3,%%xmm1 \n" + "psrlw $0x1,%%xmm0 \n" + "psrlw $0x1,%%xmm1 \n" + "pavgw %%xmm5,%%xmm0 \n" + "pavgw %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); +} + +#ifdef HAS_SCALEROWDOWN2_AVX2 +void ScaleRowDown2_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + (void)src_stride; + asm volatile( + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1"); +} + +void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + (void)src_stride; + asm volatile( + "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" + "vpsrlw $0xf,%%ymm4,%%ymm4 \n" + "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" + "vpxor %%ymm5,%%ymm5,%%ymm5 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" + "vpavgw %%ymm5,%%ymm0,%%ymm0 \n" + "vpavgw %%ymm5,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", 
"xmm0", "xmm1", "xmm4", "xmm5"); +} + +void ScaleRowDown2Box_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + asm volatile( + "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" + "vpsrlw $0xf,%%ymm4,%%ymm4 \n" + "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" + "vpxor %%ymm5,%%ymm5,%%ymm5 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vmovdqu 0x00(%0,%3,1),%%ymm2 \n" + "vmovdqu 0x20(%0,%3,1),%%ymm3 \n" + "lea 0x40(%0),%0 \n" + "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" + "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" + "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" + "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" + "vpsrlw $0x1,%%ymm0,%%ymm0 \n" + "vpsrlw $0x1,%%ymm1,%%ymm1 \n" + "vpavgw %%ymm5,%%ymm0,%%ymm0 \n" + "vpavgw %%ymm5,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); +} +#endif // HAS_SCALEROWDOWN2_AVX2 + +void ScaleRowDown4_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + (void)src_stride; + asm volatile( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrld $0x18,%%xmm5 \n" + "pslld $0x10,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "psrlw $0x8,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1", "xmm5"); +} + +void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + intptr_t stridex3; + asm volatile( + "pcmpeqb %%xmm4,%%xmm4 \n" + "psrlw $0xf,%%xmm4 \n" + "movdqa %%xmm4,%%xmm5 \n" + "packuswb %%xmm4,%%xmm4 \n" + "psllw $0x3,%%xmm5 \n" + "lea 0x00(%4,%4,2),%3 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x00(%0,%4,1),%%xmm2 \n" + "movdqu 0x10(%0,%4,1),%%xmm3 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "paddw %%xmm2,%%xmm0 \n" + "paddw %%xmm3,%%xmm1 \n" + "movdqu 0x00(%0,%4,2),%%xmm2 \n" + "movdqu 0x10(%0,%4,2),%%xmm3 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "paddw %%xmm2,%%xmm0 \n" + "paddw %%xmm3,%%xmm1 \n" + "movdqu 0x00(%0,%3,1),%%xmm2 \n" + "movdqu 0x10(%0,%3,1),%%xmm3 \n" + "lea 0x20(%0),%0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "paddw %%xmm2,%%xmm0 \n" + "paddw %%xmm3,%%xmm1 \n" + "phaddw %%xmm1,%%xmm0 \n" + "paddw %%xmm5,%%xmm0 \n" + "psrlw $0x4,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "=&r"(stridex3) // %3 + : "r"((intptr_t)(src_stride)) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} + +#ifdef HAS_SCALEROWDOWN4_AVX2 +void ScaleRowDown4_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + (void)src_stride; + asm volatile( + "vpcmpeqb 
%%ymm5,%%ymm5,%%ymm5 \n" + "vpsrld $0x18,%%ymm5,%%ymm5 \n" + "vpslld $0x10,%%ymm5,%%ymm5 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpand %%ymm5,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1", "xmm5"); +} + +void ScaleRowDown4Box_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + asm volatile( + "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" + "vpsrlw $0xf,%%ymm4,%%ymm4 \n" + "vpsllw $0x3,%%ymm4,%%ymm5 \n" + "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vmovdqu 0x00(%0,%3,1),%%ymm2 \n" + "vmovdqu 0x20(%0,%3,1),%%ymm3 \n" + "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" + "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" + "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" + "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" + "vmovdqu 0x00(%0,%3,2),%%ymm2 \n" + "vmovdqu 0x20(%0,%3,2),%%ymm3 \n" + "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" + "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" + "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" + "vmovdqu 0x00(%0,%4,1),%%ymm2 \n" + "vmovdqu 0x20(%0,%4,1),%%ymm3 \n" + "lea 0x40(%0),%0 \n" + "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" + "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" + "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" + "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" + "vpsrlw $0x4,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"((intptr_t)(src_stride * 3)) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} +#endif // HAS_SCALEROWDOWN4_AVX2 + +void ScaleRowDown34_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + (void)src_stride; + asm volatile( + "movdqa %0,%%xmm3 \n" + "movdqa %1,%%xmm4 \n" + "movdqa %2,%%xmm5 \n" + : + : "m"(kShuf0), // %0 + "m"(kShuf1), // %1 + "m"(kShuf2) // %2 + ); + asm volatile( + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm2 \n" + "lea 0x20(%0),%0 \n" + "movdqa %%xmm2,%%xmm1 \n" + "palignr $0x8,%%xmm0,%%xmm1 \n" + "pshufb %%xmm3,%%xmm0 \n" + "pshufb %%xmm4,%%xmm1 \n" + "pshufb %%xmm5,%%xmm2 \n" + "movq %%xmm0,(%1) \n" + "movq %%xmm1,0x8(%1) \n" + "movq %%xmm2,0x10(%1) \n" + "lea 0x18(%1),%1 \n" + "sub $0x18,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} + +void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + asm volatile( + "movdqa %0,%%xmm2 \n" // kShuf01 + "movdqa %1,%%xmm3 \n" // kShuf11 + "movdqa %2,%%xmm4 \n" // kShuf21 + : + : "m"(kShuf01), // %0 + "m"(kShuf11), // %1 + "m"(kShuf21) // %2 + ); + asm volatile( + "movdqa %0,%%xmm5 
\n" // kMadd01 + "movdqa %1,%%xmm0 \n" // kMadd11 + "movdqa %2,%%xmm1 \n" // kRound34 + : + : "m"(kMadd01), // %0 + "m"(kMadd11), // %1 + "m"(kRound34) // %2 + ); + asm volatile( + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm6 \n" + "movdqu 0x00(%0,%3,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm5,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6,(%1) \n" + "movdqu 0x8(%0),%%xmm6 \n" + "movdqu 0x8(%0,%3,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm3,%%xmm6 \n" + "pmaddubsw %%xmm0,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6,0x8(%1) \n" + "movdqu 0x10(%0),%%xmm6 \n" + "movdqu 0x10(%0,%3,1),%%xmm7 \n" + "lea 0x20(%0),%0 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm4,%%xmm6 \n" + "pmaddubsw %4,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6,0x10(%1) \n" + "lea 0x18(%1),%1 \n" + "sub $0x18,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "m"(kMadd21) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} + +void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + asm volatile( + "movdqa %0,%%xmm2 \n" // kShuf01 + "movdqa %1,%%xmm3 \n" // kShuf11 + "movdqa %2,%%xmm4 \n" // kShuf21 + : + : "m"(kShuf01), // %0 + "m"(kShuf11), // %1 + "m"(kShuf21) // %2 + ); + asm volatile( + "movdqa %0,%%xmm5 \n" // kMadd01 + "movdqa %1,%%xmm0 \n" // kMadd11 + "movdqa %2,%%xmm1 \n" // kRound34 + : + : "m"(kMadd01), // %0 + "m"(kMadd11), // %1 + "m"(kRound34) // %2 + ); + + asm volatile( + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm6 \n" + "movdqu 0x00(%0,%3,1),%%xmm7 \n" + "pavgb %%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm5,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6,(%1) \n" + "movdqu 0x8(%0),%%xmm6 \n" + "movdqu 0x8(%0,%3,1),%%xmm7 \n" + "pavgb %%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm3,%%xmm6 \n" + "pmaddubsw %%xmm0,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6,0x8(%1) \n" + "movdqu 0x10(%0),%%xmm6 \n" + "movdqu 0x10(%0,%3,1),%%xmm7 \n" + "lea 0x20(%0),%0 \n" + "pavgb %%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm4,%%xmm6 \n" + "pmaddubsw %4,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6,0x10(%1) \n" + "lea 0x18(%1),%1 \n" + "sub $0x18,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "m"(kMadd21) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} + +void ScaleRowDown38_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + (void)src_stride; + asm volatile( + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pshufb %%xmm4,%%xmm0 \n" + "pshufb %%xmm5,%%xmm1 \n" + "paddusb %%xmm1,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "movhlps %%xmm0,%%xmm1 \n" + "movd %%xmm1,0x8(%1) \n" + "lea 0xc(%1),%1 \n" + "sub $0xc,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), 
// %1 + "+r"(dst_width) // %2 + : "m"(kShuf38a), // %3 + "m"(kShuf38b) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5"); +} + +void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + asm volatile( + "movdqa %0,%%xmm2 \n" + "movdqa %1,%%xmm3 \n" + "movdqa %2,%%xmm4 \n" + "movdqa %3,%%xmm5 \n" + : + : "m"(kShufAb0), // %0 + "m"(kShufAb1), // %1 + "m"(kShufAb2), // %2 + "m"(kScaleAb2) // %3 + ); + asm volatile( + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%3,1),%%xmm1 \n" + "lea 0x10(%0),%0 \n" + "pavgb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pshufb %%xmm2,%%xmm1 \n" + "movdqa %%xmm0,%%xmm6 \n" + "pshufb %%xmm3,%%xmm6 \n" + "paddusw %%xmm6,%%xmm1 \n" + "pshufb %%xmm4,%%xmm0 \n" + "paddusw %%xmm0,%%xmm1 \n" + "pmulhuw %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movd %%xmm1,(%1) \n" + "psrlq $0x10,%%xmm1 \n" + "movd %%xmm1,0x2(%1) \n" + "lea 0x6(%1),%1 \n" + "sub $0x6,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); +} + +void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + asm volatile( + "movdqa %0,%%xmm2 \n" + "movdqa %1,%%xmm3 \n" + "movdqa %2,%%xmm4 \n" + "pxor %%xmm5,%%xmm5 \n" + : + : "m"(kShufAc), // %0 + "m"(kShufAc3), // %1 + "m"(kScaleAc33) // %2 + ); + asm volatile( + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%3,1),%%xmm6 \n" + "movhlps %%xmm0,%%xmm1 \n" + "movhlps %%xmm6,%%xmm7 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "punpcklbw %%xmm5,%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm6 \n" + "punpcklbw %%xmm5,%%xmm7 \n" + "paddusw %%xmm6,%%xmm0 \n" + "paddusw %%xmm7,%%xmm1 \n" + "movdqu 0x00(%0,%3,2),%%xmm6 \n" + "lea 0x10(%0),%0 \n" + "movhlps %%xmm6,%%xmm7 \n" + "punpcklbw %%xmm5,%%xmm6 \n" + "punpcklbw %%xmm5,%%xmm7 \n" + "paddusw %%xmm6,%%xmm0 \n" + "paddusw %%xmm7,%%xmm1 \n" + "movdqa %%xmm0,%%xmm6 \n" + "psrldq $0x2,%%xmm0 \n" + "paddusw %%xmm0,%%xmm6 \n" + "psrldq $0x2,%%xmm0 \n" + "paddusw %%xmm0,%%xmm6 \n" + "pshufb %%xmm2,%%xmm6 \n" + "movdqa %%xmm1,%%xmm7 \n" + "psrldq $0x2,%%xmm1 \n" + "paddusw %%xmm1,%%xmm7 \n" + "psrldq $0x2,%%xmm1 \n" + "paddusw %%xmm1,%%xmm7 \n" + "pshufb %%xmm3,%%xmm7 \n" + "paddusw %%xmm7,%%xmm6 \n" + "pmulhuw %%xmm4,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movd %%xmm6,(%1) \n" + "psrlq $0x10,%%xmm6 \n" + "movd %%xmm6,0x2(%1) \n" + "lea 0x6(%1),%1 \n" + "sub $0x6,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} + +// Reads 16xN bytes and produces 16 shorts at a time. +void ScaleAddRow_SSE2(const uint8_t* src_ptr, + uint16_t* dst_ptr, + int src_width) { + asm volatile( + + "pxor %%xmm5,%%xmm5 \n" + + // 16 pixel loop. 
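+ // Each iteration zero-extends 16 source bytes to shorts (punpcklbw and
+ // punpckhbw against zero) and accumulates them into 16 destination
+ // shorts with unsigned saturation (paddusw).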
+ LABELALIGN + "1: \n" + "movdqu (%0),%%xmm3 \n" + "lea 0x10(%0),%0 \n" // src_ptr += 16 + "movdqu (%1),%%xmm0 \n" + "movdqu 0x10(%1),%%xmm1 \n" + "movdqa %%xmm3,%%xmm2 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "punpckhbw %%xmm5,%%xmm3 \n" + "paddusw %%xmm2,%%xmm0 \n" + "paddusw %%xmm3,%%xmm1 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm1,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(src_width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); +} + +#ifdef HAS_SCALEADDROW_AVX2 +// Reads 32 bytes and accumulates to 32 shorts at a time. +void ScaleAddRow_AVX2(const uint8_t* src_ptr, + uint16_t* dst_ptr, + int src_width) { + asm volatile( + + "vpxor %%ymm5,%%ymm5,%%ymm5 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm3 \n" + "lea 0x20(%0),%0 \n" // src_ptr += 32 + "vpermq $0xd8,%%ymm3,%%ymm3 \n" + "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n" + "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n" + "vpaddusw (%1),%%ymm2,%%ymm0 \n" + "vpaddusw 0x20(%1),%%ymm3,%%ymm1 \n" + "vmovdqu %%ymm0,(%1) \n" + "vmovdqu %%ymm1,0x20(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(src_width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); +} +#endif // HAS_SCALEADDROW_AVX2 + +// Constant for making pixels signed to avoid pmaddubsw +// saturation. +static const uvec8 kFsub80 = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}; + +// Constant for making pixels unsigned and adding .5 for rounding. +static const uvec16 kFadd40 = {0x4040, 0x4040, 0x4040, 0x4040, + 0x4040, 0x4040, 0x4040, 0x4040}; + +// Bilinear column filtering. SSSE3 version. +void ScaleFilterCols_SSSE3(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx) { + intptr_t x0, x1, temp_pixel; + asm volatile( + "movd %6,%%xmm2 \n" + "movd %7,%%xmm3 \n" + "movl $0x04040000,%k2 \n" + "movd %k2,%%xmm5 \n" + "pcmpeqb %%xmm6,%%xmm6 \n" + "psrlw $0x9,%%xmm6 \n" // 0x007f007f + "pcmpeqb %%xmm7,%%xmm7 \n" + "psrlw $15,%%xmm7 \n" // 0x00010001 + + "pextrw $0x1,%%xmm2,%k3 \n" + "subl $0x2,%5 \n" + "jl 29f \n" + "movdqa %%xmm2,%%xmm0 \n" + "paddd %%xmm3,%%xmm0 \n" + "punpckldq %%xmm0,%%xmm2 \n" + "punpckldq %%xmm3,%%xmm3 \n" + "paddd %%xmm3,%%xmm3 \n" + "pextrw $0x3,%%xmm2,%k4 \n" + + LABELALIGN + "2: \n" + "movdqa %%xmm2,%%xmm1 \n" + "paddd %%xmm3,%%xmm2 \n" + "movzwl 0x00(%1,%3,1),%k2 \n" + "movd %k2,%%xmm0 \n" + "psrlw $0x9,%%xmm1 \n" + "movzwl 0x00(%1,%4,1),%k2 \n" + "movd %k2,%%xmm4 \n" + "pshufb %%xmm5,%%xmm1 \n" + "punpcklwd %%xmm4,%%xmm0 \n" + "psubb %8,%%xmm0 \n" // make pixels signed. + "pxor %%xmm6,%%xmm1 \n" // 128 - f = (f ^ 127 ) + + // 1 + "paddusb %%xmm7,%%xmm1 \n" + "pmaddubsw %%xmm0,%%xmm1 \n" + "pextrw $0x1,%%xmm2,%k3 \n" + "pextrw $0x3,%%xmm2,%k4 \n" + "paddw %9,%%xmm1 \n" // make pixels unsigned. + "psrlw $0x7,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movd %%xmm1,%k2 \n" + "mov %w2,(%0) \n" + "lea 0x2(%0),%0 \n" + "subl $0x2,%5 \n" + "jge 2b \n" + + LABELALIGN + "29: \n" + "addl $0x1,%5 \n" + "jl 99f \n" + "movzwl 0x00(%1,%3,1),%k2 \n" + "movd %k2,%%xmm0 \n" + "psrlw $0x9,%%xmm2 \n" + "pshufb %%xmm5,%%xmm2 \n" + "psubb %8,%%xmm0 \n" // make pixels signed. + "pxor %%xmm6,%%xmm2 \n" + "paddusb %%xmm7,%%xmm2 \n" + "pmaddubsw %%xmm0,%%xmm2 \n" + "paddw %9,%%xmm2 \n" // make pixels unsigned. 
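+ // Final odd pixel: scale back to 8 bits and store a single byte.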
+ "psrlw $0x7,%%xmm2 \n" + "packuswb %%xmm2,%%xmm2 \n" + "movd %%xmm2,%k2 \n" + "mov %b2,(%0) \n" + "99: \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "=&a"(temp_pixel), // %2 + "=&r"(x0), // %3 + "=&r"(x1), // %4 +#if defined(__x86_64__) + "+rm"(dst_width) // %5 +#else + "+m"(dst_width) // %5 +#endif + : "rm"(x), // %6 + "rm"(dx), // %7 +#if defined(__x86_64__) + "x"(kFsub80), // %8 + "x"(kFadd40) // %9 +#else + "m"(kFsub80), // %8 + "m"(kFadd40) // %9 +#endif + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} + +// Reads 4 pixels, duplicates them and writes 8 pixels. +// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. +void ScaleColsUp2_SSE2(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx) { + (void)x; + (void)dx; + asm volatile( + + LABELALIGN + "1: \n" + "movdqu (%1),%%xmm0 \n" + "lea 0x10(%1),%1 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm1 \n" + "movdqu %%xmm0,(%0) \n" + "movdqu %%xmm1,0x10(%0) \n" + "lea 0x20(%0),%0 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1"); +} + +void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width) { + (void)src_stride; + asm volatile( + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "shufps $0xdd,%%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1"); +} + +void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width) { + (void)src_stride; + asm volatile( + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm2 \n" + "pavgb %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1"); +} + +void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width) { + asm volatile( + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x00(%0,%3,1),%%xmm2 \n" + "movdqu 0x10(%0,%3,1),%%xmm3 \n" + "lea 0x20(%0),%0 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm2 \n" + "pavgb %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3"); +} + +// Reads 4 pixels at a time. +// Alignment requirement: dst_argb 16 byte aligned. 
+void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_argb, + int dst_width) { + intptr_t src_stepx_x4 = (intptr_t)(src_stepx); + intptr_t src_stepx_x12; + (void)src_stride; + asm volatile( + "lea 0x00(,%1,4),%1 \n" + "lea 0x00(%1,%1,2),%4 \n" + + LABELALIGN + "1: \n" + "movd (%0),%%xmm0 \n" + "movd 0x00(%0,%1,1),%%xmm1 \n" + "punpckldq %%xmm1,%%xmm0 \n" + "movd 0x00(%0,%1,2),%%xmm2 \n" + "movd 0x00(%0,%4,1),%%xmm3 \n" + "lea 0x00(%0,%1,4),%0 \n" + "punpckldq %%xmm3,%%xmm2 \n" + "punpcklqdq %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x4,%3 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(src_stepx_x4), // %1 + "+r"(dst_argb), // %2 + "+r"(dst_width), // %3 + "=&r"(src_stepx_x12) // %4 + ::"memory", + "cc", "xmm0", "xmm1", "xmm2", "xmm3"); +} + +// Blends four 2x2 to 4x1. +// Alignment requirement: dst_argb 16 byte aligned. +void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_argb, + int dst_width) { + intptr_t src_stepx_x4 = (intptr_t)(src_stepx); + intptr_t src_stepx_x12; + intptr_t row1 = (intptr_t)(src_stride); + asm volatile( + "lea 0x00(,%1,4),%1 \n" + "lea 0x00(%1,%1,2),%4 \n" + "lea 0x00(%0,%5,1),%5 \n" + + LABELALIGN + "1: \n" + "movq (%0),%%xmm0 \n" + "movhps 0x00(%0,%1,1),%%xmm0 \n" + "movq 0x00(%0,%1,2),%%xmm1 \n" + "movhps 0x00(%0,%4,1),%%xmm1 \n" + "lea 0x00(%0,%1,4),%0 \n" + "movq (%5),%%xmm2 \n" + "movhps 0x00(%5,%1,1),%%xmm2 \n" + "movq 0x00(%5,%1,2),%%xmm3 \n" + "movhps 0x00(%5,%4,1),%%xmm3 \n" + "lea 0x00(%5,%1,4),%5 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm2 \n" + "pavgb %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x4,%3 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(src_stepx_x4), // %1 + "+r"(dst_argb), // %2 + "+rm"(dst_width), // %3 + "=&r"(src_stepx_x12), // %4 + "+r"(row1) // %5 + ::"memory", + "cc", "xmm0", "xmm1", "xmm2", "xmm3"); +} + +void ScaleARGBCols_SSE2(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx) { + intptr_t x0, x1; + asm volatile( + "movd %5,%%xmm2 \n" + "movd %6,%%xmm3 \n" + "pshufd $0x0,%%xmm2,%%xmm2 \n" + "pshufd $0x11,%%xmm3,%%xmm0 \n" + "paddd %%xmm0,%%xmm2 \n" + "paddd %%xmm3,%%xmm3 \n" + "pshufd $0x5,%%xmm3,%%xmm0 \n" + "paddd %%xmm0,%%xmm2 \n" + "paddd %%xmm3,%%xmm3 \n" + "pshufd $0x0,%%xmm3,%%xmm3 \n" + "pextrw $0x1,%%xmm2,%k0 \n" + "pextrw $0x3,%%xmm2,%k1 \n" + "cmp $0x0,%4 \n" + "jl 99f \n" + "sub $0x4,%4 \n" + "jl 49f \n" + + LABELALIGN + "40: \n" + "movd 0x00(%3,%0,4),%%xmm0 \n" + "movd 0x00(%3,%1,4),%%xmm1 \n" + "pextrw $0x5,%%xmm2,%k0 \n" + "pextrw $0x7,%%xmm2,%k1 \n" + "paddd %%xmm3,%%xmm2 \n" + "punpckldq %%xmm1,%%xmm0 \n" + "movd 0x00(%3,%0,4),%%xmm1 \n" + "movd 0x00(%3,%1,4),%%xmm4 \n" + "pextrw $0x1,%%xmm2,%k0 \n" + "pextrw $0x3,%%xmm2,%k1 \n" + "punpckldq %%xmm4,%%xmm1 \n" + "punpcklqdq %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x4,%4 \n" + "jge 40b \n" + + "49: \n" + "test $0x2,%4 \n" + "je 29f \n" + "movd 0x00(%3,%0,4),%%xmm0 \n" + "movd 0x00(%3,%1,4),%%xmm1 \n" + "pextrw $0x5,%%xmm2,%k0 \n" + "punpckldq %%xmm1,%%xmm0 \n" + "movq %%xmm0,(%2) \n" + "lea 0x8(%2),%2 \n" + "29: \n" + "test $0x1,%4 \n" + "je 99f \n" + "movd 0x00(%3,%0,4),%%xmm0 \n" + "movd %%xmm0,(%2) \n" + "99: \n" + : "=&a"(x0), // %0 + "=&d"(x1), // %1 + "+r"(dst_argb), // %2 + "+r"(src_argb), // %3 + 
"+r"(dst_width) // %4 + : "rm"(x), // %5 + "rm"(dx) // %6 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); +} + +// Reads 4 pixels, duplicates them and writes 8 pixels. +// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. +void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx) { + (void)x; + (void)dx; + asm volatile( + + LABELALIGN + "1: \n" + "movdqu (%1),%%xmm0 \n" + "lea 0x10(%1),%1 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpckldq %%xmm0,%%xmm0 \n" + "punpckhdq %%xmm1,%%xmm1 \n" + "movdqu %%xmm0,(%0) \n" + "movdqu %%xmm1,0x10(%0) \n" + "lea 0x20(%0),%0 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + + : "+r"(dst_argb), // %0 + "+r"(src_argb), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1"); +} + +// Shuffle table for arranging 2 pixels into pairs for pmaddubsw +static const uvec8 kShuffleColARGB = { + 0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel + 8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel +}; + +// Shuffle table for duplicating 2 fractions into 8 bytes each +static const uvec8 kShuffleFractions = { + 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, +}; + +// Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version +void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx) { + intptr_t x0, x1; + asm volatile( + "movdqa %0,%%xmm4 \n" + "movdqa %1,%%xmm5 \n" + : + : "m"(kShuffleColARGB), // %0 + "m"(kShuffleFractions) // %1 + ); + + asm volatile( + "movd %5,%%xmm2 \n" + "movd %6,%%xmm3 \n" + "pcmpeqb %%xmm6,%%xmm6 \n" + "psrlw $0x9,%%xmm6 \n" + "pextrw $0x1,%%xmm2,%k3 \n" + "sub $0x2,%2 \n" + "jl 29f \n" + "movdqa %%xmm2,%%xmm0 \n" + "paddd %%xmm3,%%xmm0 \n" + "punpckldq %%xmm0,%%xmm2 \n" + "punpckldq %%xmm3,%%xmm3 \n" + "paddd %%xmm3,%%xmm3 \n" + "pextrw $0x3,%%xmm2,%k4 \n" + + LABELALIGN + "2: \n" + "movdqa %%xmm2,%%xmm1 \n" + "paddd %%xmm3,%%xmm2 \n" + "movq 0x00(%1,%3,4),%%xmm0 \n" + "psrlw $0x9,%%xmm1 \n" + "movhps 0x00(%1,%4,4),%%xmm0 \n" + "pshufb %%xmm5,%%xmm1 \n" + "pshufb %%xmm4,%%xmm0 \n" + "pxor %%xmm6,%%xmm1 \n" + "pmaddubsw %%xmm1,%%xmm0 \n" + "psrlw $0x7,%%xmm0 \n" + "pextrw $0x1,%%xmm2,%k3 \n" + "pextrw $0x3,%%xmm2,%k4 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,(%0) \n" + "lea 0x8(%0),%0 \n" + "sub $0x2,%2 \n" + "jge 2b \n" + + LABELALIGN + "29: \n" + "add $0x1,%2 \n" + "jl 99f \n" + "psrlw $0x9,%%xmm2 \n" + "movq 0x00(%1,%3,4),%%xmm0 \n" + "pshufb %%xmm5,%%xmm2 \n" + "pshufb %%xmm4,%%xmm0 \n" + "pxor %%xmm6,%%xmm2 \n" + "pmaddubsw %%xmm2,%%xmm0 \n" + "psrlw $0x7,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movd %%xmm0,(%0) \n" + + LABELALIGN "99: \n" // clang-format error. + + : "+r"(dst_argb), // %0 + "+r"(src_argb), // %1 + "+rm"(dst_width), // %2 + "=&r"(x0), // %3 + "=&r"(x1) // %4 + : "rm"(x), // %5 + "rm"(dx) // %6 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); +} + +// Divide num by div and return as 16.16 fixed point result. +int FixedDiv_X86(int num, int div) { + asm volatile( + "cdq \n" + "shld $0x10,%%eax,%%edx \n" + "shl $0x10,%%eax \n" + "idiv %1 \n" + "mov %0, %%eax \n" + : "+a"(num) // %0 + : "c"(div) // %1 + : "memory", "cc", "edx"); + return num; +} + +// Divide num - 1 by div - 1 and return as 16.16 fixed point result. 
+int FixedDiv1_X86(int num, int div) { + asm volatile( + "cdq \n" + "shld $0x10,%%eax,%%edx \n" + "shl $0x10,%%eax \n" + "sub $0x10001,%%eax \n" + "sbb $0x0,%%edx \n" + "sub $0x1,%1 \n" + "idiv %1 \n" + "mov %0, %%eax \n" + : "+a"(num) // %0 + : "c"(div) // %1 + : "memory", "cc", "edx"); + return num; +} + +#endif // defined(__x86_64__) || defined(__i386__) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/media/libyuv/libyuv/source/scale_msa.cc b/media/libyuv/libyuv/source/scale_msa.cc new file mode 100644 index 0000000000..482a521f0d --- /dev/null +++ b/media/libyuv/libyuv/source/scale_msa.cc @@ -0,0 +1,949 @@ +/* + * Copyright 2016 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <assert.h> + +#include "libyuv/scale_row.h" + +// This module is for GCC MSA +#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) +#include "libyuv/macros_msa.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#define LOAD_INDEXED_DATA(srcp, indx0, out0) \ + { \ + out0[0] = srcp[indx0[0]]; \ + out0[1] = srcp[indx0[1]]; \ + out0[2] = srcp[indx0[2]]; \ + out0[3] = srcp[indx0[3]]; \ + } + +void ScaleARGBRowDown2_MSA(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width) { + int x; + v16u8 src0, src1, dst0; + (void)src_stride; + + for (x = 0; x < dst_width; x += 4) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16); + dst0 = (v16u8)__msa_pckod_w((v4i32)src1, (v4i32)src0); + ST_UB(dst0, dst_argb); + src_argb += 32; + dst_argb += 16; + } +} + +void ScaleARGBRowDown2Linear_MSA(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width) { + int x; + v16u8 src0, src1, vec0, vec1, dst0; + (void)src_stride; + + for (x = 0; x < dst_width; x += 4) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16); + vec0 = (v16u8)__msa_pckev_w((v4i32)src1, (v4i32)src0); + vec1 = (v16u8)__msa_pckod_w((v4i32)src1, (v4i32)src0); + dst0 = (v16u8)__msa_aver_u_b((v16u8)vec0, (v16u8)vec1); + ST_UB(dst0, dst_argb); + src_argb += 32; + dst_argb += 16; + } +} + +void ScaleARGBRowDown2Box_MSA(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width) { + int x; + const uint8_t* s = src_argb; + const uint8_t* t = src_argb + src_stride; + v16u8 src0, src1, src2, src3, vec0, vec1, vec2, vec3, dst0; + v8u16 reg0, reg1, reg2, reg3; + v16i8 shuffler = {0, 4, 1, 5, 2, 6, 3, 7, 8, 12, 9, 13, 10, 14, 11, 15}; + + for (x = 0; x < dst_width; x += 4) { + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)s, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)t, 0); + src3 = (v16u8)__msa_ld_b((v16i8*)t, 16); + vec0 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src0, (v16i8)src0); + vec1 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src1, (v16i8)src1); + vec2 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src2, (v16i8)src2); + vec3 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src3, (v16i8)src3); + reg0 = __msa_hadd_u_h(vec0, vec0); + reg1 = __msa_hadd_u_h(vec1, vec1); + reg2 = __msa_hadd_u_h(vec2, vec2); + reg3 = __msa_hadd_u_h(vec3, vec3); + reg0 += reg2; + reg1 += reg3; + reg0 = 
(v8u16)__msa_srari_h((v8i16)reg0, 2); + reg1 = (v8u16)__msa_srari_h((v8i16)reg1, 2); + dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0); + ST_UB(dst0, dst_argb); + s += 32; + t += 32; + dst_argb += 16; + } +} + +void ScaleARGBRowDownEven_MSA(const uint8_t* src_argb, + ptrdiff_t src_stride, + int32_t src_stepx, + uint8_t* dst_argb, + int dst_width) { + int x; + int32_t stepx = src_stepx * 4; + int32_t data0, data1, data2, data3; + (void)src_stride; + + for (x = 0; x < dst_width; x += 4) { + data0 = LW(src_argb); + data1 = LW(src_argb + stepx); + data2 = LW(src_argb + stepx * 2); + data3 = LW(src_argb + stepx * 3); + SW(data0, dst_argb); + SW(data1, dst_argb + 4); + SW(data2, dst_argb + 8); + SW(data3, dst_argb + 12); + src_argb += stepx * 4; + dst_argb += 16; + } +} + +void ScaleARGBRowDownEvenBox_MSA(const uint8_t* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_argb, + int dst_width) { + int x; + const uint8_t* nxt_argb = src_argb + src_stride; + int32_t stepx = src_stepx * 4; + int64_t data0, data1, data2, data3; + v16u8 src0 = {0}, src1 = {0}, src2 = {0}, src3 = {0}; + v16u8 vec0, vec1, vec2, vec3; + v8u16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; + v16u8 dst0; + + for (x = 0; x < dst_width; x += 4) { + data0 = LD(src_argb); + data1 = LD(src_argb + stepx); + data2 = LD(src_argb + stepx * 2); + data3 = LD(src_argb + stepx * 3); + src0 = (v16u8)__msa_insert_d((v2i64)src0, 0, data0); + src0 = (v16u8)__msa_insert_d((v2i64)src0, 1, data1); + src1 = (v16u8)__msa_insert_d((v2i64)src1, 0, data2); + src1 = (v16u8)__msa_insert_d((v2i64)src1, 1, data3); + data0 = LD(nxt_argb); + data1 = LD(nxt_argb + stepx); + data2 = LD(nxt_argb + stepx * 2); + data3 = LD(nxt_argb + stepx * 3); + src2 = (v16u8)__msa_insert_d((v2i64)src2, 0, data0); + src2 = (v16u8)__msa_insert_d((v2i64)src2, 1, data1); + src3 = (v16u8)__msa_insert_d((v2i64)src3, 0, data2); + src3 = (v16u8)__msa_insert_d((v2i64)src3, 1, data3); + vec0 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src0); + vec1 = (v16u8)__msa_ilvr_b((v16i8)src3, (v16i8)src1); + vec2 = (v16u8)__msa_ilvl_b((v16i8)src2, (v16i8)src0); + vec3 = (v16u8)__msa_ilvl_b((v16i8)src3, (v16i8)src1); + reg0 = __msa_hadd_u_h(vec0, vec0); + reg1 = __msa_hadd_u_h(vec1, vec1); + reg2 = __msa_hadd_u_h(vec2, vec2); + reg3 = __msa_hadd_u_h(vec3, vec3); + reg4 = (v8u16)__msa_pckev_d((v2i64)reg2, (v2i64)reg0); + reg5 = (v8u16)__msa_pckev_d((v2i64)reg3, (v2i64)reg1); + reg6 = (v8u16)__msa_pckod_d((v2i64)reg2, (v2i64)reg0); + reg7 = (v8u16)__msa_pckod_d((v2i64)reg3, (v2i64)reg1); + reg4 += reg6; + reg5 += reg7; + reg4 = (v8u16)__msa_srari_h((v8i16)reg4, 2); + reg5 = (v8u16)__msa_srari_h((v8i16)reg5, 2); + dst0 = (v16u8)__msa_pckev_b((v16i8)reg5, (v16i8)reg4); + ST_UB(dst0, dst_argb); + src_argb += stepx * 4; + nxt_argb += stepx * 4; + dst_argb += 16; + } +} + +void ScaleRowDown2_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + int x; + v16u8 src0, src1, src2, src3, dst0, dst1; + (void)src_stride; + + for (x = 0; x < dst_width; x += 32) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 32); + src3 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 48); + dst0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); + dst1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); + ST_UB2(dst0, dst1, dst, 16); + src_ptr += 64; + dst += 32; + } +} + +void ScaleRowDown2Linear_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + int 
x; + v16u8 src0, src1, src2, src3, vec0, vec1, vec2, vec3, dst0, dst1; + (void)src_stride; + + for (x = 0; x < dst_width; x += 32) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 32); + src3 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 48); + vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); + vec2 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); + vec1 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); + vec3 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); + dst0 = __msa_aver_u_b(vec1, vec0); + dst1 = __msa_aver_u_b(vec3, vec2); + ST_UB2(dst0, dst1, dst, 16); + src_ptr += 64; + dst += 32; + } +} + +void ScaleRowDown2Box_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + int x; + const uint8_t* s = src_ptr; + const uint8_t* t = src_ptr + src_stride; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7, dst0, dst1; + v8u16 vec0, vec1, vec2, vec3; + + for (x = 0; x < dst_width; x += 32) { + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)s, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)s, 32); + src3 = (v16u8)__msa_ld_b((v16i8*)s, 48); + src4 = (v16u8)__msa_ld_b((v16i8*)t, 0); + src5 = (v16u8)__msa_ld_b((v16i8*)t, 16); + src6 = (v16u8)__msa_ld_b((v16i8*)t, 32); + src7 = (v16u8)__msa_ld_b((v16i8*)t, 48); + vec0 = __msa_hadd_u_h(src0, src0); + vec1 = __msa_hadd_u_h(src1, src1); + vec2 = __msa_hadd_u_h(src2, src2); + vec3 = __msa_hadd_u_h(src3, src3); + vec0 += __msa_hadd_u_h(src4, src4); + vec1 += __msa_hadd_u_h(src5, src5); + vec2 += __msa_hadd_u_h(src6, src6); + vec3 += __msa_hadd_u_h(src7, src7); + vec0 = (v8u16)__msa_srari_h((v8i16)vec0, 2); + vec1 = (v8u16)__msa_srari_h((v8i16)vec1, 2); + vec2 = (v8u16)__msa_srari_h((v8i16)vec2, 2); + vec3 = (v8u16)__msa_srari_h((v8i16)vec3, 2); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); + ST_UB2(dst0, dst1, dst, 16); + s += 64; + t += 64; + dst += 32; + } +} + +void ScaleRowDown4_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + int x; + v16u8 src0, src1, src2, src3, vec0, vec1, dst0; + (void)src_stride; + + for (x = 0; x < dst_width; x += 16) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 32); + src3 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 48); + vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); + vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); + dst0 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0); + ST_UB(dst0, dst); + src_ptr += 64; + dst += 16; + } +} + +void ScaleRowDown4Box_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + int x; + const uint8_t* s = src_ptr; + const uint8_t* t0 = s + src_stride; + const uint8_t* t1 = s + src_stride * 2; + const uint8_t* t2 = s + src_stride * 3; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7, dst0; + v8u16 vec0, vec1, vec2, vec3; + v4u32 reg0, reg1, reg2, reg3; + + for (x = 0; x < dst_width; x += 16) { + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)s, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)s, 32); + src3 = (v16u8)__msa_ld_b((v16i8*)s, 48); + src4 = (v16u8)__msa_ld_b((v16i8*)t0, 0); + src5 = (v16u8)__msa_ld_b((v16i8*)t0, 16); + src6 = (v16u8)__msa_ld_b((v16i8*)t0, 32); + src7 = (v16u8)__msa_ld_b((v16i8*)t0, 48); + vec0 = __msa_hadd_u_h(src0, src0); + vec1 = 
__msa_hadd_u_h(src1, src1); + vec2 = __msa_hadd_u_h(src2, src2); + vec3 = __msa_hadd_u_h(src3, src3); + vec0 += __msa_hadd_u_h(src4, src4); + vec1 += __msa_hadd_u_h(src5, src5); + vec2 += __msa_hadd_u_h(src6, src6); + vec3 += __msa_hadd_u_h(src7, src7); + src0 = (v16u8)__msa_ld_b((v16i8*)t1, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)t1, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)t1, 32); + src3 = (v16u8)__msa_ld_b((v16i8*)t1, 48); + src4 = (v16u8)__msa_ld_b((v16i8*)t2, 0); + src5 = (v16u8)__msa_ld_b((v16i8*)t2, 16); + src6 = (v16u8)__msa_ld_b((v16i8*)t2, 32); + src7 = (v16u8)__msa_ld_b((v16i8*)t2, 48); + vec0 += __msa_hadd_u_h(src0, src0); + vec1 += __msa_hadd_u_h(src1, src1); + vec2 += __msa_hadd_u_h(src2, src2); + vec3 += __msa_hadd_u_h(src3, src3); + vec0 += __msa_hadd_u_h(src4, src4); + vec1 += __msa_hadd_u_h(src5, src5); + vec2 += __msa_hadd_u_h(src6, src6); + vec3 += __msa_hadd_u_h(src7, src7); + reg0 = __msa_hadd_u_w(vec0, vec0); + reg1 = __msa_hadd_u_w(vec1, vec1); + reg2 = __msa_hadd_u_w(vec2, vec2); + reg3 = __msa_hadd_u_w(vec3, vec3); + reg0 = (v4u32)__msa_srari_w((v4i32)reg0, 4); + reg1 = (v4u32)__msa_srari_w((v4i32)reg1, 4); + reg2 = (v4u32)__msa_srari_w((v4i32)reg2, 4); + reg3 = (v4u32)__msa_srari_w((v4i32)reg3, 4); + vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0); + vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + ST_UB(dst0, dst); + s += 64; + t0 += 64; + t1 += 64; + t2 += 64; + dst += 16; + } +} + +void ScaleRowDown38_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + int x, width; + uint64_t dst0; + uint32_t dst1; + v16u8 src0, src1, vec0; + v16i8 mask = {0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0}; + (void)src_stride; + + assert(dst_width % 3 == 0); + width = dst_width / 3; + + for (x = 0; x < width; x += 4) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16); + vec0 = (v16u8)__msa_vshf_b(mask, (v16i8)src1, (v16i8)src0); + dst0 = __msa_copy_u_d((v2i64)vec0, 0); + dst1 = __msa_copy_u_w((v4i32)vec0, 2); + SD(dst0, dst); + SW(dst1, dst + 8); + src_ptr += 32; + dst += 12; + } +} + +void ScaleRowDown38_2_Box_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + int x, width; + const uint8_t* s = src_ptr; + const uint8_t* t = src_ptr + src_stride; + uint64_t dst0; + uint32_t dst1; + v16u8 src0, src1, src2, src3, out; + v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v4u32 tmp0, tmp1, tmp2, tmp3, tmp4; + v8i16 zero = {0}; + v8i16 mask = {0, 1, 2, 8, 3, 4, 5, 9}; + v16i8 dst_mask = {0, 2, 16, 4, 6, 18, 8, 10, 20, 12, 14, 22, 0, 0, 0, 0}; + v4u32 const_0x2AAA = (v4u32)__msa_fill_w(0x2AAA); + v4u32 const_0x4000 = (v4u32)__msa_fill_w(0x4000); + + assert((dst_width % 3 == 0) && (dst_width > 0)); + width = dst_width / 3; + + for (x = 0; x < width; x += 4) { + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)s, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)t, 0); + src3 = (v16u8)__msa_ld_b((v16i8*)t, 16); + vec0 = (v8u16)__msa_ilvr_b((v16i8)src2, (v16i8)src0); + vec1 = (v8u16)__msa_ilvl_b((v16i8)src2, (v16i8)src0); + vec2 = (v8u16)__msa_ilvr_b((v16i8)src3, (v16i8)src1); + vec3 = (v8u16)__msa_ilvl_b((v16i8)src3, (v16i8)src1); + vec0 = __msa_hadd_u_h((v16u8)vec0, (v16u8)vec0); + vec1 = __msa_hadd_u_h((v16u8)vec1, (v16u8)vec1); + vec2 = __msa_hadd_u_h((v16u8)vec2, (v16u8)vec2); + vec3 = __msa_hadd_u_h((v16u8)vec3, (v16u8)vec3); + vec4 = (v8u16)__msa_vshf_h(mask, 
zero, (v8i16)vec0); + vec5 = (v8u16)__msa_vshf_h(mask, zero, (v8i16)vec1); + vec6 = (v8u16)__msa_vshf_h(mask, zero, (v8i16)vec2); + vec7 = (v8u16)__msa_vshf_h(mask, zero, (v8i16)vec3); + vec0 = (v8u16)__msa_pckod_w((v4i32)vec1, (v4i32)vec0); + vec1 = (v8u16)__msa_pckod_w((v4i32)vec3, (v4i32)vec2); + vec0 = (v8u16)__msa_pckod_w((v4i32)vec1, (v4i32)vec0); + tmp0 = __msa_hadd_u_w(vec4, vec4); + tmp1 = __msa_hadd_u_w(vec5, vec5); + tmp2 = __msa_hadd_u_w(vec6, vec6); + tmp3 = __msa_hadd_u_w(vec7, vec7); + tmp4 = __msa_hadd_u_w(vec0, vec0); + vec0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0); + vec1 = (v8u16)__msa_pckev_h((v8i16)tmp3, (v8i16)tmp2); + tmp0 = __msa_hadd_u_w(vec0, vec0); + tmp1 = __msa_hadd_u_w(vec1, vec1); + tmp0 *= const_0x2AAA; + tmp1 *= const_0x2AAA; + tmp4 *= const_0x4000; + tmp0 = (v4u32)__msa_srai_w((v4i32)tmp0, 16); + tmp1 = (v4u32)__msa_srai_w((v4i32)tmp1, 16); + tmp4 = (v4u32)__msa_srai_w((v4i32)tmp4, 16); + vec0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0); + vec1 = (v8u16)__msa_pckev_h((v8i16)tmp4, (v8i16)tmp4); + out = (v16u8)__msa_vshf_b(dst_mask, (v16i8)vec1, (v16i8)vec0); + dst0 = __msa_copy_u_d((v2i64)out, 0); + dst1 = __msa_copy_u_w((v4i32)out, 2); + SD(dst0, dst_ptr); + SW(dst1, dst_ptr + 8); + s += 32; + t += 32; + dst_ptr += 12; + } +} + +void ScaleRowDown38_3_Box_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + int x, width; + const uint8_t* s = src_ptr; + const uint8_t* t0 = s + src_stride; + const uint8_t* t1 = s + src_stride * 2; + uint64_t dst0; + uint32_t dst1; + v16u8 src0, src1, src2, src3, src4, src5, out; + v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v4u32 tmp0, tmp1, tmp2, tmp3, tmp4; + v8u16 zero = {0}; + v8i16 mask = {0, 1, 2, 8, 3, 4, 5, 9}; + v16i8 dst_mask = {0, 2, 16, 4, 6, 18, 8, 10, 20, 12, 14, 22, 0, 0, 0, 0}; + v4u32 const_0x1C71 = (v4u32)__msa_fill_w(0x1C71); + v4u32 const_0x2AAA = (v4u32)__msa_fill_w(0x2AAA); + + assert((dst_width % 3 == 0) && (dst_width > 0)); + width = dst_width / 3; + + for (x = 0; x < width; x += 4) { + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)s, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)t0, 0); + src3 = (v16u8)__msa_ld_b((v16i8*)t0, 16); + src4 = (v16u8)__msa_ld_b((v16i8*)t1, 0); + src5 = (v16u8)__msa_ld_b((v16i8*)t1, 16); + vec0 = (v8u16)__msa_ilvr_b((v16i8)src2, (v16i8)src0); + vec1 = (v8u16)__msa_ilvl_b((v16i8)src2, (v16i8)src0); + vec2 = (v8u16)__msa_ilvr_b((v16i8)src3, (v16i8)src1); + vec3 = (v8u16)__msa_ilvl_b((v16i8)src3, (v16i8)src1); + vec4 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src4); + vec5 = (v8u16)__msa_ilvl_b((v16i8)zero, (v16i8)src4); + vec6 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src5); + vec7 = (v8u16)__msa_ilvl_b((v16i8)zero, (v16i8)src5); + vec0 = __msa_hadd_u_h((v16u8)vec0, (v16u8)vec0); + vec1 = __msa_hadd_u_h((v16u8)vec1, (v16u8)vec1); + vec2 = __msa_hadd_u_h((v16u8)vec2, (v16u8)vec2); + vec3 = __msa_hadd_u_h((v16u8)vec3, (v16u8)vec3); + vec0 += __msa_hadd_u_h((v16u8)vec4, (v16u8)vec4); + vec1 += __msa_hadd_u_h((v16u8)vec5, (v16u8)vec5); + vec2 += __msa_hadd_u_h((v16u8)vec6, (v16u8)vec6); + vec3 += __msa_hadd_u_h((v16u8)vec7, (v16u8)vec7); + vec4 = (v8u16)__msa_vshf_h(mask, (v8i16)zero, (v8i16)vec0); + vec5 = (v8u16)__msa_vshf_h(mask, (v8i16)zero, (v8i16)vec1); + vec6 = (v8u16)__msa_vshf_h(mask, (v8i16)zero, (v8i16)vec2); + vec7 = (v8u16)__msa_vshf_h(mask, (v8i16)zero, (v8i16)vec3); + vec0 = (v8u16)__msa_pckod_w((v4i32)vec1, (v4i32)vec0); + vec1 = (v8u16)__msa_pckod_w((v4i32)vec3, (v4i32)vec2); + 
vec0 = (v8u16)__msa_pckod_w((v4i32)vec1, (v4i32)vec0); + tmp0 = __msa_hadd_u_w(vec4, vec4); + tmp1 = __msa_hadd_u_w(vec5, vec5); + tmp2 = __msa_hadd_u_w(vec6, vec6); + tmp3 = __msa_hadd_u_w(vec7, vec7); + tmp4 = __msa_hadd_u_w(vec0, vec0); + vec0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0); + vec1 = (v8u16)__msa_pckev_h((v8i16)tmp3, (v8i16)tmp2); + tmp0 = __msa_hadd_u_w(vec0, vec0); + tmp1 = __msa_hadd_u_w(vec1, vec1); + tmp0 *= const_0x1C71; + tmp1 *= const_0x1C71; + tmp4 *= const_0x2AAA; + tmp0 = (v4u32)__msa_srai_w((v4i32)tmp0, 16); + tmp1 = (v4u32)__msa_srai_w((v4i32)tmp1, 16); + tmp4 = (v4u32)__msa_srai_w((v4i32)tmp4, 16); + vec0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0); + vec1 = (v8u16)__msa_pckev_h((v8i16)tmp4, (v8i16)tmp4); + out = (v16u8)__msa_vshf_b(dst_mask, (v16i8)vec1, (v16i8)vec0); + dst0 = __msa_copy_u_d((v2i64)out, 0); + dst1 = __msa_copy_u_w((v4i32)out, 2); + SD(dst0, dst_ptr); + SW(dst1, dst_ptr + 8); + s += 32; + t0 += 32; + t1 += 32; + dst_ptr += 12; + } +} + +void ScaleAddRow_MSA(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) { + int x; + v16u8 src0; + v8u16 dst0, dst1; + v16i8 zero = {0}; + + assert(src_width > 0); + + for (x = 0; x < src_width; x += 16) { + src0 = LD_UB(src_ptr); + dst0 = (v8u16)__msa_ld_h((v8i16*)dst_ptr, 0); + dst1 = (v8u16)__msa_ld_h((v8i16*)dst_ptr, 16); + dst0 += (v8u16)__msa_ilvr_b(zero, (v16i8)src0); + dst1 += (v8u16)__msa_ilvl_b(zero, (v16i8)src0); + ST_UH2(dst0, dst1, dst_ptr, 8); + src_ptr += 16; + dst_ptr += 16; + } +} + +void ScaleFilterCols_MSA(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx) { + int j; + v4i32 vec_x = __msa_fill_w(x); + v4i32 vec_dx = __msa_fill_w(dx); + v4i32 vec_const = {0, 1, 2, 3}; + v4i32 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9; + v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + v8u16 reg0, reg1; + v16u8 dst0; + v4i32 const_0xFFFF = __msa_fill_w(0xFFFF); + v4i32 const_0x40 = __msa_fill_w(0x40); + + vec0 = vec_dx * vec_const; + vec1 = vec_dx * 4; + vec_x += vec0; + + for (j = 0; j < dst_width - 1; j += 16) { + vec2 = vec_x >> 16; + vec6 = vec_x & const_0xFFFF; + vec_x += vec1; + vec3 = vec_x >> 16; + vec7 = vec_x & const_0xFFFF; + vec_x += vec1; + vec4 = vec_x >> 16; + vec8 = vec_x & const_0xFFFF; + vec_x += vec1; + vec5 = vec_x >> 16; + vec9 = vec_x & const_0xFFFF; + vec_x += vec1; + vec6 >>= 9; + vec7 >>= 9; + vec8 >>= 9; + vec9 >>= 9; + LOAD_INDEXED_DATA(src_ptr, vec2, tmp0); + LOAD_INDEXED_DATA(src_ptr, vec3, tmp1); + LOAD_INDEXED_DATA(src_ptr, vec4, tmp2); + LOAD_INDEXED_DATA(src_ptr, vec5, tmp3); + vec2 += 1; + vec3 += 1; + vec4 += 1; + vec5 += 1; + LOAD_INDEXED_DATA(src_ptr, vec2, tmp4); + LOAD_INDEXED_DATA(src_ptr, vec3, tmp5); + LOAD_INDEXED_DATA(src_ptr, vec4, tmp6); + LOAD_INDEXED_DATA(src_ptr, vec5, tmp7); + tmp4 -= tmp0; + tmp5 -= tmp1; + tmp6 -= tmp2; + tmp7 -= tmp3; + tmp4 *= vec6; + tmp5 *= vec7; + tmp6 *= vec8; + tmp7 *= vec9; + tmp4 += const_0x40; + tmp5 += const_0x40; + tmp6 += const_0x40; + tmp7 += const_0x40; + tmp4 >>= 7; + tmp5 >>= 7; + tmp6 >>= 7; + tmp7 >>= 7; + tmp0 += tmp4; + tmp1 += tmp5; + tmp2 += tmp6; + tmp3 += tmp7; + reg0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0); + reg1 = (v8u16)__msa_pckev_h((v8i16)tmp3, (v8i16)tmp2); + dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0); + __msa_st_b(dst0, dst_ptr, 0); + dst_ptr += 16; + } +} + +void ScaleARGBCols_MSA(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx) { + const uint32_t* src = (const uint32_t*)(src_argb); + 
uint32_t* dst = (uint32_t*)(dst_argb); + int j; + v4i32 x_vec = __msa_fill_w(x); + v4i32 dx_vec = __msa_fill_w(dx); + v4i32 const_vec = {0, 1, 2, 3}; + v4i32 vec0, vec1, vec2; + v4i32 dst0; + + vec0 = dx_vec * const_vec; + vec1 = dx_vec * 4; + x_vec += vec0; + + for (j = 0; j < dst_width; j += 4) { + vec2 = x_vec >> 16; + x_vec += vec1; + LOAD_INDEXED_DATA(src, vec2, dst0); + __msa_st_w(dst0, dst, 0); + dst += 4; + } +} + +void ScaleARGBFilterCols_MSA(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx) { + const uint32_t* src = (const uint32_t*)(src_argb); + int j; + v4u32 src0, src1, src2, src3; + v4u32 vec0, vec1, vec2, vec3; + v16u8 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; + v16u8 mult0, mult1, mult2, mult3; + v8u16 tmp0, tmp1, tmp2, tmp3; + v16u8 dst0, dst1; + v4u32 vec_x = (v4u32)__msa_fill_w(x); + v4u32 vec_dx = (v4u32)__msa_fill_w(dx); + v4u32 vec_const = {0, 1, 2, 3}; + v16u8 const_0x7f = (v16u8)__msa_fill_b(0x7f); + + vec0 = vec_dx * vec_const; + vec1 = vec_dx * 4; + vec_x += vec0; + + for (j = 0; j < dst_width - 1; j += 8) { + vec2 = vec_x >> 16; + reg0 = (v16u8)(vec_x >> 9); + vec_x += vec1; + vec3 = vec_x >> 16; + reg1 = (v16u8)(vec_x >> 9); + vec_x += vec1; + reg0 = reg0 & const_0x7f; + reg1 = reg1 & const_0x7f; + reg0 = (v16u8)__msa_shf_b((v16i8)reg0, 0); + reg1 = (v16u8)__msa_shf_b((v16i8)reg1, 0); + reg2 = reg0 ^ const_0x7f; + reg3 = reg1 ^ const_0x7f; + mult0 = (v16u8)__msa_ilvr_b((v16i8)reg0, (v16i8)reg2); + mult1 = (v16u8)__msa_ilvl_b((v16i8)reg0, (v16i8)reg2); + mult2 = (v16u8)__msa_ilvr_b((v16i8)reg1, (v16i8)reg3); + mult3 = (v16u8)__msa_ilvl_b((v16i8)reg1, (v16i8)reg3); + LOAD_INDEXED_DATA(src, vec2, src0); + LOAD_INDEXED_DATA(src, vec3, src1); + vec2 += 1; + vec3 += 1; + LOAD_INDEXED_DATA(src, vec2, src2); + LOAD_INDEXED_DATA(src, vec3, src3); + reg4 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src0); + reg5 = (v16u8)__msa_ilvl_b((v16i8)src2, (v16i8)src0); + reg6 = (v16u8)__msa_ilvr_b((v16i8)src3, (v16i8)src1); + reg7 = (v16u8)__msa_ilvl_b((v16i8)src3, (v16i8)src1); + tmp0 = __msa_dotp_u_h(reg4, mult0); + tmp1 = __msa_dotp_u_h(reg5, mult1); + tmp2 = __msa_dotp_u_h(reg6, mult2); + tmp3 = __msa_dotp_u_h(reg7, mult3); + tmp0 >>= 7; + tmp1 >>= 7; + tmp2 >>= 7; + tmp3 >>= 7; + dst0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + dst1 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2); + __msa_st_b(dst0, dst_argb, 0); + __msa_st_b(dst1, dst_argb, 16); + dst_argb += 32; + } +} + +void ScaleRowDown34_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + int x; + (void)src_stride; + v16u8 src0, src1, src2, src3; + v16u8 vec0, vec1, vec2; + v16i8 mask0 = {0, 1, 3, 4, 5, 7, 8, 9, 11, 12, 13, 15, 16, 17, 19, 20}; + v16i8 mask1 = {5, 7, 8, 9, 11, 12, 13, 15, 16, 17, 19, 20, 21, 23, 24, 25}; + v16i8 mask2 = {11, 12, 13, 15, 16, 17, 19, 20, + 21, 23, 24, 25, 27, 28, 29, 31}; + + assert((dst_width % 3 == 0) && (dst_width > 0)); + + for (x = 0; x < dst_width; x += 48) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 32); + src3 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 48); + vec0 = (v16u8)__msa_vshf_b(mask0, (v16i8)src1, (v16i8)src0); + vec1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src2, (v16i8)src1); + vec2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src3, (v16i8)src2); + __msa_st_b((v16i8)vec0, dst, 0); + __msa_st_b((v16i8)vec1, dst, 16); + __msa_st_b((v16i8)vec2, dst, 32); + src_ptr += 64; + dst += 48; + } +} + +void 
ScaleRowDown34_0_Box_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* d, + int dst_width) { + const uint8_t* s = src_ptr; + const uint8_t* t = src_ptr + src_stride; + int x; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7, dst0, dst1, dst2; + v16u8 vec0, vec1, vec2, vec3, vec4, vec5; + v16u8 vec6, vec7, vec8, vec9, vec10, vec11; + v8i16 reg0, reg1, reg2, reg3, reg4, reg5; + v8i16 reg6, reg7, reg8, reg9, reg10, reg11; + v16u8 const0 = {3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1}; + v16u8 const1 = {1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1}; + v16u8 const2 = {1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3}; + v16i8 mask0 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10}; + v16i8 mask1 = {10, 11, 12, 13, 13, 14, 14, 15, + 16, 17, 17, 18, 18, 19, 20, 21}; + v16i8 mask2 = {5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15}; + v8i16 shft0 = {2, 1, 2, 2, 1, 2, 2, 1}; + v8i16 shft1 = {2, 2, 1, 2, 2, 1, 2, 2}; + v8i16 shft2 = {1, 2, 2, 1, 2, 2, 1, 2}; + + assert((dst_width % 3 == 0) && (dst_width > 0)); + + for (x = 0; x < dst_width; x += 48) { + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)s, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)s, 32); + src3 = (v16u8)__msa_ld_b((v16i8*)s, 48); + src4 = (v16u8)__msa_ld_b((v16i8*)t, 0); + src5 = (v16u8)__msa_ld_b((v16i8*)t, 16); + src6 = (v16u8)__msa_ld_b((v16i8*)t, 32); + src7 = (v16u8)__msa_ld_b((v16i8*)t, 48); + vec0 = (v16u8)__msa_vshf_b(mask0, (v16i8)src0, (v16i8)src0); + vec1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0); + vec2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src1, (v16i8)src1); + vec3 = (v16u8)__msa_vshf_b(mask0, (v16i8)src2, (v16i8)src2); + vec4 = (v16u8)__msa_vshf_b(mask1, (v16i8)src3, (v16i8)src2); + vec5 = (v16u8)__msa_vshf_b(mask2, (v16i8)src3, (v16i8)src3); + vec6 = (v16u8)__msa_vshf_b(mask0, (v16i8)src4, (v16i8)src4); + vec7 = (v16u8)__msa_vshf_b(mask1, (v16i8)src5, (v16i8)src4); + vec8 = (v16u8)__msa_vshf_b(mask2, (v16i8)src5, (v16i8)src5); + vec9 = (v16u8)__msa_vshf_b(mask0, (v16i8)src6, (v16i8)src6); + vec10 = (v16u8)__msa_vshf_b(mask1, (v16i8)src7, (v16i8)src6); + vec11 = (v16u8)__msa_vshf_b(mask2, (v16i8)src7, (v16i8)src7); + reg0 = (v8i16)__msa_dotp_u_h(vec0, const0); + reg1 = (v8i16)__msa_dotp_u_h(vec1, const1); + reg2 = (v8i16)__msa_dotp_u_h(vec2, const2); + reg3 = (v8i16)__msa_dotp_u_h(vec3, const0); + reg4 = (v8i16)__msa_dotp_u_h(vec4, const1); + reg5 = (v8i16)__msa_dotp_u_h(vec5, const2); + reg6 = (v8i16)__msa_dotp_u_h(vec6, const0); + reg7 = (v8i16)__msa_dotp_u_h(vec7, const1); + reg8 = (v8i16)__msa_dotp_u_h(vec8, const2); + reg9 = (v8i16)__msa_dotp_u_h(vec9, const0); + reg10 = (v8i16)__msa_dotp_u_h(vec10, const1); + reg11 = (v8i16)__msa_dotp_u_h(vec11, const2); + reg0 = __msa_srar_h(reg0, shft0); + reg1 = __msa_srar_h(reg1, shft1); + reg2 = __msa_srar_h(reg2, shft2); + reg3 = __msa_srar_h(reg3, shft0); + reg4 = __msa_srar_h(reg4, shft1); + reg5 = __msa_srar_h(reg5, shft2); + reg6 = __msa_srar_h(reg6, shft0); + reg7 = __msa_srar_h(reg7, shft1); + reg8 = __msa_srar_h(reg8, shft2); + reg9 = __msa_srar_h(reg9, shft0); + reg10 = __msa_srar_h(reg10, shft1); + reg11 = __msa_srar_h(reg11, shft2); + reg0 = reg0 * 3 + reg6; + reg1 = reg1 * 3 + reg7; + reg2 = reg2 * 3 + reg8; + reg3 = reg3 * 3 + reg9; + reg4 = reg4 * 3 + reg10; + reg5 = reg5 * 3 + reg11; + reg0 = __msa_srari_h(reg0, 2); + reg1 = __msa_srari_h(reg1, 2); + reg2 = __msa_srari_h(reg2, 2); + reg3 = __msa_srari_h(reg3, 2); + reg4 = __msa_srari_h(reg4, 2); + reg5 = __msa_srari_h(reg5, 2); + dst0 = 
(v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0); + dst1 = (v16u8)__msa_pckev_b((v16i8)reg3, (v16i8)reg2); + dst2 = (v16u8)__msa_pckev_b((v16i8)reg5, (v16i8)reg4); + __msa_st_b((v16i8)dst0, d, 0); + __msa_st_b((v16i8)dst1, d, 16); + __msa_st_b((v16i8)dst2, d, 32); + s += 64; + t += 64; + d += 48; + } +} + +void ScaleRowDown34_1_Box_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* d, + int dst_width) { + const uint8_t* s = src_ptr; + const uint8_t* t = src_ptr + src_stride; + int x; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7, dst0, dst1, dst2; + v16u8 vec0, vec1, vec2, vec3, vec4, vec5; + v16u8 vec6, vec7, vec8, vec9, vec10, vec11; + v8i16 reg0, reg1, reg2, reg3, reg4, reg5; + v8i16 reg6, reg7, reg8, reg9, reg10, reg11; + v16u8 const0 = {3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1}; + v16u8 const1 = {1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1}; + v16u8 const2 = {1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3}; + v16i8 mask0 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10}; + v16i8 mask1 = {10, 11, 12, 13, 13, 14, 14, 15, + 16, 17, 17, 18, 18, 19, 20, 21}; + v16i8 mask2 = {5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15}; + v8i16 shft0 = {2, 1, 2, 2, 1, 2, 2, 1}; + v8i16 shft1 = {2, 2, 1, 2, 2, 1, 2, 2}; + v8i16 shft2 = {1, 2, 2, 1, 2, 2, 1, 2}; + + assert((dst_width % 3 == 0) && (dst_width > 0)); + + for (x = 0; x < dst_width; x += 48) { + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)s, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)s, 32); + src3 = (v16u8)__msa_ld_b((v16i8*)s, 48); + src4 = (v16u8)__msa_ld_b((v16i8*)t, 0); + src5 = (v16u8)__msa_ld_b((v16i8*)t, 16); + src6 = (v16u8)__msa_ld_b((v16i8*)t, 32); + src7 = (v16u8)__msa_ld_b((v16i8*)t, 48); + vec0 = (v16u8)__msa_vshf_b(mask0, (v16i8)src0, (v16i8)src0); + vec1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0); + vec2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src1, (v16i8)src1); + vec3 = (v16u8)__msa_vshf_b(mask0, (v16i8)src2, (v16i8)src2); + vec4 = (v16u8)__msa_vshf_b(mask1, (v16i8)src3, (v16i8)src2); + vec5 = (v16u8)__msa_vshf_b(mask2, (v16i8)src3, (v16i8)src3); + vec6 = (v16u8)__msa_vshf_b(mask0, (v16i8)src4, (v16i8)src4); + vec7 = (v16u8)__msa_vshf_b(mask1, (v16i8)src5, (v16i8)src4); + vec8 = (v16u8)__msa_vshf_b(mask2, (v16i8)src5, (v16i8)src5); + vec9 = (v16u8)__msa_vshf_b(mask0, (v16i8)src6, (v16i8)src6); + vec10 = (v16u8)__msa_vshf_b(mask1, (v16i8)src7, (v16i8)src6); + vec11 = (v16u8)__msa_vshf_b(mask2, (v16i8)src7, (v16i8)src7); + reg0 = (v8i16)__msa_dotp_u_h(vec0, const0); + reg1 = (v8i16)__msa_dotp_u_h(vec1, const1); + reg2 = (v8i16)__msa_dotp_u_h(vec2, const2); + reg3 = (v8i16)__msa_dotp_u_h(vec3, const0); + reg4 = (v8i16)__msa_dotp_u_h(vec4, const1); + reg5 = (v8i16)__msa_dotp_u_h(vec5, const2); + reg6 = (v8i16)__msa_dotp_u_h(vec6, const0); + reg7 = (v8i16)__msa_dotp_u_h(vec7, const1); + reg8 = (v8i16)__msa_dotp_u_h(vec8, const2); + reg9 = (v8i16)__msa_dotp_u_h(vec9, const0); + reg10 = (v8i16)__msa_dotp_u_h(vec10, const1); + reg11 = (v8i16)__msa_dotp_u_h(vec11, const2); + reg0 = __msa_srar_h(reg0, shft0); + reg1 = __msa_srar_h(reg1, shft1); + reg2 = __msa_srar_h(reg2, shft2); + reg3 = __msa_srar_h(reg3, shft0); + reg4 = __msa_srar_h(reg4, shft1); + reg5 = __msa_srar_h(reg5, shft2); + reg6 = __msa_srar_h(reg6, shft0); + reg7 = __msa_srar_h(reg7, shft1); + reg8 = __msa_srar_h(reg8, shft2); + reg9 = __msa_srar_h(reg9, shft0); + reg10 = __msa_srar_h(reg10, shft1); + reg11 = __msa_srar_h(reg11, shft2); + reg0 += reg6; + reg1 += reg7; + reg2 += reg8; + reg3 += 
reg9; + reg4 += reg10; + reg5 += reg11; + reg0 = __msa_srari_h(reg0, 1); + reg1 = __msa_srari_h(reg1, 1); + reg2 = __msa_srari_h(reg2, 1); + reg3 = __msa_srari_h(reg3, 1); + reg4 = __msa_srari_h(reg4, 1); + reg5 = __msa_srari_h(reg5, 1); + dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0); + dst1 = (v16u8)__msa_pckev_b((v16i8)reg3, (v16i8)reg2); + dst2 = (v16u8)__msa_pckev_b((v16i8)reg5, (v16i8)reg4); + __msa_st_b((v16i8)dst0, d, 0); + __msa_st_b((v16i8)dst1, d, 16); + __msa_st_b((v16i8)dst2, d, 32); + s += 64; + t += 64; + d += 48; + } +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) diff --git a/media/libyuv/libyuv/source/scale_neon.cc b/media/libyuv/libyuv/source/scale_neon.cc new file mode 100644 index 0000000000..459a2995df --- /dev/null +++ b/media/libyuv/libyuv/source/scale_neon.cc @@ -0,0 +1,970 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// This module is for GCC Neon. +#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \ + !defined(__aarch64__) + +// NEON downscalers with interpolation. +// Provided by Fritz Koenig + +// Read 32x1 throw away even pixels, and write 16x1. +void ScaleRowDown2_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + (void)src_stride; + asm volatile( + "1: \n" + // load even pixels into q0, odd into q1 + "vld2.8 {q0, q1}, [%0]! \n" + "subs %2, %2, #16 \n" // 16 processed per loop + "vst1.8 {q1}, [%1]! \n" // store odd pixels + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst), // %1 + "+r"(dst_width) // %2 + : + : "q0", "q1" // Clobber List + ); +} + +// Read 32x1 average down and write 16x1. +void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + (void)src_stride; + asm volatile( + "1: \n" + "vld2.8 {q0, q1}, [%0]! \n" // load 32 pixels + "subs %2, %2, #16 \n" // 16 processed per loop + "vrhadd.u8 q0, q0, q1 \n" // rounding half add + "vst1.8 {q0}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst), // %1 + "+r"(dst_width) // %2 + : + : "q0", "q1" // Clobber List + ); +} + +// Read 32x2 average down and write 16x1. +void ScaleRowDown2Box_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + asm volatile( + // change the stride to row 2 pointer + "add %1, %0 \n" + "1: \n" + "vld1.8 {q0, q1}, [%0]! \n" // load row 1 and post inc + "vld1.8 {q2, q3}, [%1]! \n" // load row 2 and post inc + "subs %3, %3, #16 \n" // 16 processed per loop + "vpaddl.u8 q0, q0 \n" // row 1 add adjacent + "vpaddl.u8 q1, q1 \n" + "vpadal.u8 q0, q2 \n" // row 2 add adjacent + + // row1 + "vpadal.u8 q1, q3 \n" + "vrshrn.u16 d0, q0, #2 \n" // downshift, round and + // pack + "vrshrn.u16 d1, q1, #2 \n" + "vst1.8 {q0}, [%2]! 
\n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_stride), // %1 + "+r"(dst), // %2 + "+r"(dst_width) // %3 + : + : "q0", "q1", "q2", "q3" // Clobber List + ); +} + +void ScaleRowDown4_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + (void)src_stride; + asm volatile( + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 + "subs %2, %2, #8 \n" // 8 processed per loop + "vst1.8 {d2}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "q0", "q1", "memory", "cc"); +} + +void ScaleRowDown4Box_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + const uint8_t* src_ptr1 = src_ptr + src_stride; + const uint8_t* src_ptr2 = src_ptr + src_stride * 2; + const uint8_t* src_ptr3 = src_ptr + src_stride * 3; + asm volatile( + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load up 16x4 + "vld1.8 {q1}, [%3]! \n" + "vld1.8 {q2}, [%4]! \n" + "vld1.8 {q3}, [%5]! \n" + "subs %2, %2, #4 \n" + "vpaddl.u8 q0, q0 \n" + "vpadal.u8 q0, q1 \n" + "vpadal.u8 q0, q2 \n" + "vpadal.u8 q0, q3 \n" + "vpaddl.u16 q0, q0 \n" + "vrshrn.u32 d0, q0, #4 \n" // divide by 16 w/rounding + "vmovn.u16 d0, q0 \n" + "vst1.32 {d0[0]}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_ptr1), // %3 + "+r"(src_ptr2), // %4 + "+r"(src_ptr3) // %5 + : + : "q0", "q1", "q2", "q3", "memory", "cc"); +} + +// Down scale from 4 to 3 pixels. Use the neon multilane read/write +// to load up the every 4th pixel into a 4 different registers. +// Point samples 32 pixels to 24 pixels. +void ScaleRowDown34_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + (void)src_stride; + asm volatile( + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 + "subs %2, %2, #24 \n" + "vmov d2, d3 \n" // order d0, d1, d2 + "vst3.8 {d0, d1, d2}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "d0", "d1", "d2", "d3", "memory", "cc"); +} + +void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + asm volatile( + "vmov.u8 d24, #3 \n" + "add %3, %0 \n" + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 + "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1 + "subs %2, %2, #24 \n" + + // filter src line 0 with src line 1 + // expand chars to shorts to allow for room + // when adding lines together + "vmovl.u8 q8, d4 \n" + "vmovl.u8 q9, d5 \n" + "vmovl.u8 q10, d6 \n" + "vmovl.u8 q11, d7 \n" + + // 3 * line_0 + line_1 + "vmlal.u8 q8, d0, d24 \n" + "vmlal.u8 q9, d1, d24 \n" + "vmlal.u8 q10, d2, d24 \n" + "vmlal.u8 q11, d3, d24 \n" + + // (3 * line_0 + line_1) >> 2 + "vqrshrn.u16 d0, q8, #2 \n" + "vqrshrn.u16 d1, q9, #2 \n" + "vqrshrn.u16 d2, q10, #2 \n" + "vqrshrn.u16 d3, q11, #2 \n" + + // a0 = (src[0] * 3 + s[1] * 1) >> 2 + "vmovl.u8 q8, d1 \n" + "vmlal.u8 q8, d0, d24 \n" + "vqrshrn.u16 d0, q8, #2 \n" + + // a1 = (src[1] * 1 + s[2] * 1) >> 1 + "vrhadd.u8 d1, d1, d2 \n" + + // a2 = (src[2] * 1 + s[3] * 3) >> 2 + "vmovl.u8 q8, d2 \n" + "vmlal.u8 q8, d3, d24 \n" + "vqrshrn.u16 d2, q8, #2 \n" + + "vst3.8 {d0, d1, d2}, [%1]! 
\n" + + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_stride) // %3 + : + : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory", + "cc"); +} + +void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + asm volatile( + "vmov.u8 d24, #3 \n" + "add %3, %0 \n" + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 + "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1 + "subs %2, %2, #24 \n" + // average src line 0 with src line 1 + "vrhadd.u8 q0, q0, q2 \n" + "vrhadd.u8 q1, q1, q3 \n" + + // a0 = (src[0] * 3 + s[1] * 1) >> 2 + "vmovl.u8 q3, d1 \n" + "vmlal.u8 q3, d0, d24 \n" + "vqrshrn.u16 d0, q3, #2 \n" + + // a1 = (src[1] * 1 + s[2] * 1) >> 1 + "vrhadd.u8 d1, d1, d2 \n" + + // a2 = (src[2] * 1 + s[3] * 3) >> 2 + "vmovl.u8 q3, d2 \n" + "vmlal.u8 q3, d3, d24 \n" + "vqrshrn.u16 d2, q3, #2 \n" + + "vst3.8 {d0, d1, d2}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_stride) // %3 + : + : "r4", "q0", "q1", "q2", "q3", "d24", "memory", "cc"); +} + +#define HAS_SCALEROWDOWN38_NEON +static const uvec8 kShuf38 = {0, 3, 6, 8, 11, 14, 16, 19, + 22, 24, 27, 30, 0, 0, 0, 0}; +static const uvec8 kShuf38_2 = {0, 8, 16, 2, 10, 17, 4, 12, + 18, 6, 14, 19, 0, 0, 0, 0}; +static const vec16 kMult38_Div6 = {65536 / 12, 65536 / 12, 65536 / 12, + 65536 / 12, 65536 / 12, 65536 / 12, + 65536 / 12, 65536 / 12}; +static const vec16 kMult38_Div9 = {65536 / 18, 65536 / 18, 65536 / 18, + 65536 / 18, 65536 / 18, 65536 / 18, + 65536 / 18, 65536 / 18}; + +// 32 -> 12 +void ScaleRowDown38_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + (void)src_stride; + asm volatile( + "vld1.8 {q3}, [%3] \n" + "1: \n" + "vld1.8 {d0, d1, d2, d3}, [%0]! \n" + "subs %2, %2, #12 \n" + "vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n" + "vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n" + "vst1.8 {d4}, [%1]! \n" + "vst1.32 {d5[0]}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"(&kShuf38) // %3 + : "d0", "d1", "d2", "d3", "d4", "d5", "memory", "cc"); +} + +// 32x3 -> 12x1 +void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + const uint8_t* src_ptr1 = src_ptr + src_stride * 2; + + asm volatile( + "vld1.16 {q13}, [%5] \n" + "vld1.8 {q14}, [%6] \n" + "vld1.8 {q15}, [%7] \n" + "add %3, %0 \n" + "1: \n" + + // d0 = 00 40 01 41 02 42 03 43 + // d1 = 10 50 11 51 12 52 13 53 + // d2 = 20 60 21 61 22 62 23 63 + // d3 = 30 70 31 71 32 72 33 73 + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" + "vld4.8 {d4, d5, d6, d7}, [%3]! \n" + "vld4.8 {d16, d17, d18, d19}, [%4]! \n" + "subs %2, %2, #12 \n" + + // Shuffle the input data around to get align the data + // so adjacent data can be added. 
0,1 - 2,3 - 4,5 - 6,7 + // d0 = 00 10 01 11 02 12 03 13 + // d1 = 40 50 41 51 42 52 43 53 + "vtrn.u8 d0, d1 \n" + "vtrn.u8 d4, d5 \n" + "vtrn.u8 d16, d17 \n" + + // d2 = 20 30 21 31 22 32 23 33 + // d3 = 60 70 61 71 62 72 63 73 + "vtrn.u8 d2, d3 \n" + "vtrn.u8 d6, d7 \n" + "vtrn.u8 d18, d19 \n" + + // d0 = 00+10 01+11 02+12 03+13 + // d2 = 40+50 41+51 42+52 43+53 + "vpaddl.u8 q0, q0 \n" + "vpaddl.u8 q2, q2 \n" + "vpaddl.u8 q8, q8 \n" + + // d3 = 60+70 61+71 62+72 63+73 + "vpaddl.u8 d3, d3 \n" + "vpaddl.u8 d7, d7 \n" + "vpaddl.u8 d19, d19 \n" + + // combine source lines + "vadd.u16 q0, q2 \n" + "vadd.u16 q0, q8 \n" + "vadd.u16 d4, d3, d7 \n" + "vadd.u16 d4, d19 \n" + + // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0] + // + s[6 + st * 1] + s[7 + st * 1] + // + s[6 + st * 2] + s[7 + st * 2]) / 6 + "vqrdmulh.s16 q2, q2, q13 \n" + "vmovn.u16 d4, q2 \n" + + // Shuffle 2,3 reg around so that 2 can be added to the + // 0,1 reg and 3 can be added to the 4,5 reg. This + // requires expanding from u8 to u16 as the 0,1 and 4,5 + // registers are already expanded. Then do transposes + // to get aligned. + // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 + "vmovl.u8 q1, d2 \n" + "vmovl.u8 q3, d6 \n" + "vmovl.u8 q9, d18 \n" + + // combine source lines + "vadd.u16 q1, q3 \n" + "vadd.u16 q1, q9 \n" + + // d4 = xx 20 xx 30 xx 22 xx 32 + // d5 = xx 21 xx 31 xx 23 xx 33 + "vtrn.u32 d2, d3 \n" + + // d4 = xx 20 xx 21 xx 22 xx 23 + // d5 = xx 30 xx 31 xx 32 xx 33 + "vtrn.u16 d2, d3 \n" + + // 0+1+2, 3+4+5 + "vadd.u16 q0, q1 \n" + + // Need to divide, but can't downshift as the the value + // isn't a power of 2. So multiply by 65536 / n + // and take the upper 16 bits. + "vqrdmulh.s16 q0, q0, q15 \n" + + // Align for table lookup, vtbl requires registers to + // be adjacent + "vmov.u8 d2, d4 \n" + + "vtbl.u8 d3, {d0, d1, d2}, d28 \n" + "vtbl.u8 d4, {d0, d1, d2}, d29 \n" + + "vst1.8 {d3}, [%1]! \n" + "vst1.32 {d4[0]}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_stride), // %3 + "+r"(src_ptr1) // %4 + : "r"(&kMult38_Div6), // %5 + "r"(&kShuf38_2), // %6 + "r"(&kMult38_Div9) // %7 + : "q0", "q1", "q2", "q3", "q8", "q9", "q13", "q14", "q15", "memory", + "cc"); +} + +// 32x2 -> 12x1 +void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + asm volatile( + "vld1.16 {q13}, [%4] \n" + "vld1.8 {q14}, [%5] \n" + "add %3, %0 \n" + "1: \n" + + // d0 = 00 40 01 41 02 42 03 43 + // d1 = 10 50 11 51 12 52 13 53 + // d2 = 20 60 21 61 22 62 23 63 + // d3 = 30 70 31 71 32 72 33 73 + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" + "vld4.8 {d4, d5, d6, d7}, [%3]! \n" + "subs %2, %2, #12 \n" + + // Shuffle the input data around to get align the data + // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 + // d0 = 00 10 01 11 02 12 03 13 + // d1 = 40 50 41 51 42 52 43 53 + "vtrn.u8 d0, d1 \n" + "vtrn.u8 d4, d5 \n" + + // d2 = 20 30 21 31 22 32 23 33 + // d3 = 60 70 61 71 62 72 63 73 + "vtrn.u8 d2, d3 \n" + "vtrn.u8 d6, d7 \n" + + // d0 = 00+10 01+11 02+12 03+13 + // d2 = 40+50 41+51 42+52 43+53 + "vpaddl.u8 q0, q0 \n" + "vpaddl.u8 q2, q2 \n" + + // d3 = 60+70 61+71 62+72 63+73 + "vpaddl.u8 d3, d3 \n" + "vpaddl.u8 d7, d7 \n" + + // combine source lines + "vadd.u16 q0, q2 \n" + "vadd.u16 d4, d3, d7 \n" + + // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4 + "vqrshrn.u16 d4, q2, #2 \n" + + // Shuffle 2,3 reg around so that 2 can be added to the + // 0,1 reg and 3 can be added to the 4,5 reg. 
This + // requires expanding from u8 to u16 as the 0,1 and 4,5 + // registers are already expanded. Then do transposes + // to get aligned. + // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 + "vmovl.u8 q1, d2 \n" + "vmovl.u8 q3, d6 \n" + + // combine source lines + "vadd.u16 q1, q3 \n" + + // d4 = xx 20 xx 30 xx 22 xx 32 + // d5 = xx 21 xx 31 xx 23 xx 33 + "vtrn.u32 d2, d3 \n" + + // d4 = xx 20 xx 21 xx 22 xx 23 + // d5 = xx 30 xx 31 xx 32 xx 33 + "vtrn.u16 d2, d3 \n" + + // 0+1+2, 3+4+5 + "vadd.u16 q0, q1 \n" + + // Need to divide, but can't downshift as the the value + // isn't a power of 2. So multiply by 65536 / n + // and take the upper 16 bits. + "vqrdmulh.s16 q0, q0, q13 \n" + + // Align for table lookup, vtbl requires registers to + // be adjacent + "vmov.u8 d2, d4 \n" + + "vtbl.u8 d3, {d0, d1, d2}, d28 \n" + "vtbl.u8 d4, {d0, d1, d2}, d29 \n" + + "vst1.8 {d3}, [%1]! \n" + "vst1.32 {d4[0]}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_stride) // %3 + : "r"(&kMult38_Div6), // %4 + "r"(&kShuf38_2) // %5 + : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc"); +} + +void ScaleAddRows_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst_ptr, + int src_width, + int src_height) { + const uint8_t* src_tmp; + asm volatile( + "1: \n" + "mov %0, %1 \n" + "mov r12, %5 \n" + "veor q2, q2, q2 \n" + "veor q3, q3, q3 \n" + "2: \n" + // load 16 pixels into q0 + "vld1.8 {q0}, [%0], %3 \n" + "vaddw.u8 q3, q3, d1 \n" + "vaddw.u8 q2, q2, d0 \n" + "subs r12, r12, #1 \n" + "bgt 2b \n" + "vst1.16 {q2, q3}, [%2]! \n" // store pixels + "add %1, %1, #16 \n" + "subs %4, %4, #16 \n" // 16 processed per loop + "bgt 1b \n" + : "=&r"(src_tmp), // %0 + "+r"(src_ptr), // %1 + "+r"(dst_ptr), // %2 + "+r"(src_stride), // %3 + "+r"(src_width), // %4 + "+r"(src_height) // %5 + : + : "memory", "cc", "r12", "q0", "q1", "q2", "q3" // Clobber List + ); +} + +// TODO(Yang Zhang): Investigate less load instructions for +// the x/dx stepping +#define LOAD2_DATA8_LANE(n) \ + "lsr %5, %3, #16 \n" \ + "add %6, %1, %5 \n" \ + "add %3, %3, %4 \n" \ + "vld2.8 {d6[" #n "], d7[" #n "]}, [%6] \n" + +// The NEON version mimics this formula (from row_common.cc): +// #define BLENDER(a, b, f) (uint8_t)((int)(a) + +// ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16)) + +void ScaleFilterCols_NEON(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx) { + int dx_offset[4] = {0, 1, 2, 3}; + int* tmp = dx_offset; + const uint8_t* src_tmp = src_ptr; + asm volatile ( + "vdup.32 q0, %3 \n" // x + "vdup.32 q1, %4 \n" // dx + "vld1.32 {q2}, [%5] \n" // 0 1 2 3 + "vshl.i32 q3, q1, #2 \n" // 4 * dx + "vmul.s32 q1, q1, q2 \n" + // x , x + 1 * dx, x + 2 * dx, x + 3 * dx + "vadd.s32 q1, q1, q0 \n" + // x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx + "vadd.s32 q2, q1, q3 \n" + "vshl.i32 q0, q3, #1 \n" // 8 * dx + "1: \n" + LOAD2_DATA8_LANE(0) + LOAD2_DATA8_LANE(1) + LOAD2_DATA8_LANE(2) + LOAD2_DATA8_LANE(3) + LOAD2_DATA8_LANE(4) + LOAD2_DATA8_LANE(5) + LOAD2_DATA8_LANE(6) + LOAD2_DATA8_LANE(7) + "vmov q10, q1 \n" + "vmov q11, q2 \n" + "vuzp.16 q10, q11 \n" + "vmovl.u8 q8, d6 \n" + "vmovl.u8 q9, d7 \n" + "vsubl.s16 q11, d18, d16 \n" + "vsubl.s16 q12, d19, d17 \n" + "vmovl.u16 q13, d20 \n" + "vmovl.u16 q10, d21 \n" + "vmul.s32 q11, q11, q13 \n" + "vmul.s32 q12, q12, q10 \n" + "vrshrn.s32 d18, q11, #16 \n" + "vrshrn.s32 d19, q12, #16 \n" + "vadd.s16 q8, q8, q9 \n" + "vmovn.s16 d6, q8 \n" + + "vst1.8 {d6}, [%0]! 
\n" // store pixels + "vadd.s32 q1, q1, q0 \n" + "vadd.s32 q2, q2, q0 \n" + "subs %2, %2, #8 \n" // 8 processed per loop + "bgt 1b \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(x), // %3 + "+r"(dx), // %4 + "+r"(tmp), // %5 + "+r"(src_tmp) // %6 + : + : "memory", "cc", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13" + ); +} + +#undef LOAD2_DATA8_LANE + +// 16x2 -> 16x1 +void ScaleFilterRows_NEON(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride, + int dst_width, + int source_y_fraction) { + asm volatile( + "cmp %4, #0 \n" + "beq 100f \n" + "add %2, %1 \n" + "cmp %4, #64 \n" + "beq 75f \n" + "cmp %4, #128 \n" + "beq 50f \n" + "cmp %4, #192 \n" + "beq 25f \n" + + "vdup.8 d5, %4 \n" + "rsb %4, #256 \n" + "vdup.8 d4, %4 \n" + // General purpose row blend. + "1: \n" + "vld1.8 {q0}, [%1]! \n" + "vld1.8 {q1}, [%2]! \n" + "subs %3, %3, #16 \n" + "vmull.u8 q13, d0, d4 \n" + "vmull.u8 q14, d1, d4 \n" + "vmlal.u8 q13, d2, d5 \n" + "vmlal.u8 q14, d3, d5 \n" + "vrshrn.u16 d0, q13, #8 \n" + "vrshrn.u16 d1, q14, #8 \n" + "vst1.8 {q0}, [%0]! \n" + "bgt 1b \n" + "b 99f \n" + + // Blend 25 / 75. + "25: \n" + "vld1.8 {q0}, [%1]! \n" + "vld1.8 {q1}, [%2]! \n" + "subs %3, %3, #16 \n" + "vrhadd.u8 q0, q1 \n" + "vrhadd.u8 q0, q1 \n" + "vst1.8 {q0}, [%0]! \n" + "bgt 25b \n" + "b 99f \n" + + // Blend 50 / 50. + "50: \n" + "vld1.8 {q0}, [%1]! \n" + "vld1.8 {q1}, [%2]! \n" + "subs %3, %3, #16 \n" + "vrhadd.u8 q0, q1 \n" + "vst1.8 {q0}, [%0]! \n" + "bgt 50b \n" + "b 99f \n" + + // Blend 75 / 25. + "75: \n" + "vld1.8 {q1}, [%1]! \n" + "vld1.8 {q0}, [%2]! \n" + "subs %3, %3, #16 \n" + "vrhadd.u8 q0, q1 \n" + "vrhadd.u8 q0, q1 \n" + "vst1.8 {q0}, [%0]! \n" + "bgt 75b \n" + "b 99f \n" + + // Blend 100 / 0 - Copy row unchanged. + "100: \n" + "vld1.8 {q0}, [%1]! \n" + "subs %3, %3, #16 \n" + "vst1.8 {q0}, [%0]! \n" + "bgt 100b \n" + + "99: \n" + "vst1.8 {d1[7]}, [%0] \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(src_stride), // %2 + "+r"(dst_width), // %3 + "+r"(source_y_fraction) // %4 + : + : "q0", "q1", "d4", "d5", "q13", "q14", "memory", "cc"); +} + +void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + (void)src_stride; + asm volatile( + "1: \n" + "vld4.32 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + "vld4.32 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB + "subs %2, %2, #8 \n" // 8 processed per loop + "vmov q2, q1 \n" // load next 8 ARGB + "vst2.32 {q2, q3}, [%1]! \n" // store odd pixels + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List + ); +} + +// 46: f964 018d vld4.32 {d16,d18,d20,d22}, [r4]! +// 4a: 3e04 subs r6, #4 +// 4c: f964 118d vld4.32 {d17,d19,d21,d23}, [r4]! +// 50: ef64 21f4 vorr q9, q10, q10 +// 54: f942 038d vst2.32 {d16-d19}, [r2]! +// 58: d1f5 bne.n 46 <ScaleARGBRowDown2_C+0x46> + +void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width) { + (void)src_stride; + asm volatile( + "1: \n" + "vld4.32 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + "vld4.32 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB + "subs %2, %2, #8 \n" // 8 processed per loop + "vrhadd.u8 q0, q0, q1 \n" // rounding half add + "vrhadd.u8 q1, q2, q3 \n" // rounding half add + "vst2.32 {q0, q1}, [%1]! 
\n" + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List + ); +} + +void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + asm volatile( + // change the stride to row 2 pointer + "add %1, %1, %0 \n" + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB + "subs %3, %3, #8 \n" // 8 processed per loop. + "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. + "vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts. + "vld4.8 {d16, d18, d20, d22}, [%1]! \n" // load 8 more ARGB + "vld4.8 {d17, d19, d21, d23}, [%1]! \n" // load last 8 ARGB + "vpadal.u8 q0, q8 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q9 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q2, q10 \n" // R 16 bytes -> 8 shorts. + "vpadal.u8 q3, q11 \n" // A 16 bytes -> 8 shorts. + "vrshrn.u16 d0, q0, #2 \n" // round and pack to bytes + "vrshrn.u16 d1, q1, #2 \n" + "vrshrn.u16 d2, q2, #2 \n" + "vrshrn.u16 d3, q3, #2 \n" + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_stride), // %1 + "+r"(dst), // %2 + "+r"(dst_width) // %3 + : + : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"); +} + +// Reads 4 pixels at a time. +// Alignment requirement: src_argb 4 byte aligned. +void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_argb, + int dst_width) { + (void)src_stride; + asm volatile( + "mov r12, %3, lsl #2 \n" + "1: \n" + "vld1.32 {d0[0]}, [%0], r12 \n" + "vld1.32 {d0[1]}, [%0], r12 \n" + "vld1.32 {d1[0]}, [%0], r12 \n" + "vld1.32 {d1[1]}, [%0], r12 \n" + "subs %2, %2, #4 \n" // 4 pixels per loop. + "vst1.8 {q0}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + : "r"(src_stepx) // %3 + : "memory", "cc", "r12", "q0"); +} + +// Reads 4 pixels at a time. +// Alignment requirement: src_argb 4 byte aligned. +void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_argb, + int dst_width) { + asm volatile( + "mov r12, %4, lsl #2 \n" + "add %1, %1, %0 \n" + "1: \n" + "vld1.8 {d0}, [%0], r12 \n" // 4 2x2 blocks -> 2x1 + "vld1.8 {d1}, [%1], r12 \n" + "vld1.8 {d2}, [%0], r12 \n" + "vld1.8 {d3}, [%1], r12 \n" + "vld1.8 {d4}, [%0], r12 \n" + "vld1.8 {d5}, [%1], r12 \n" + "vld1.8 {d6}, [%0], r12 \n" + "vld1.8 {d7}, [%1], r12 \n" + "vaddl.u8 q0, d0, d1 \n" + "vaddl.u8 q1, d2, d3 \n" + "vaddl.u8 q2, d4, d5 \n" + "vaddl.u8 q3, d6, d7 \n" + "vswp.8 d1, d2 \n" // ab_cd -> ac_bd + "vswp.8 d5, d6 \n" // ef_gh -> eg_fh + "vadd.u16 q0, q0, q1 \n" // (a+b)_(c+d) + "vadd.u16 q2, q2, q3 \n" // (e+f)_(g+h) + "vrshrn.u16 d0, q0, #2 \n" // first 2 pixels. + "vrshrn.u16 d1, q2, #2 \n" // next 2 pixels. + "subs %3, %3, #4 \n" // 4 pixels per loop. + "vst1.8 {q0}, [%2]! 
\n" + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(src_stride), // %1 + "+r"(dst_argb), // %2 + "+r"(dst_width) // %3 + : "r"(src_stepx) // %4 + : "memory", "cc", "r12", "q0", "q1", "q2", "q3"); +} + +// TODO(Yang Zhang): Investigate less load instructions for +// the x/dx stepping +#define LOAD1_DATA32_LANE(dn, n) \ + "lsr %5, %3, #16 \n" \ + "add %6, %1, %5, lsl #2 \n" \ + "add %3, %3, %4 \n" \ + "vld1.32 {" #dn "[" #n "]}, [%6] \n" + +void ScaleARGBCols_NEON(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx) { + int tmp; + const uint8_t* src_tmp = src_argb; + asm volatile( + "1: \n" + // clang-format off + LOAD1_DATA32_LANE(d0, 0) + LOAD1_DATA32_LANE(d0, 1) + LOAD1_DATA32_LANE(d1, 0) + LOAD1_DATA32_LANE(d1, 1) + LOAD1_DATA32_LANE(d2, 0) + LOAD1_DATA32_LANE(d2, 1) + LOAD1_DATA32_LANE(d3, 0) + LOAD1_DATA32_LANE(d3, 1) + // clang-format on + "vst1.32 {q0, q1}, [%0]! \n" // store pixels + "subs %2, %2, #8 \n" // 8 processed per loop + "bgt 1b \n" + : "+r"(dst_argb), // %0 + "+r"(src_argb), // %1 + "+r"(dst_width), // %2 + "+r"(x), // %3 + "+r"(dx), // %4 + "=&r"(tmp), // %5 + "+r"(src_tmp) // %6 + : + : "memory", "cc", "q0", "q1"); +} + +#undef LOAD1_DATA32_LANE + +// TODO(Yang Zhang): Investigate less load instructions for +// the x/dx stepping +#define LOAD2_DATA32_LANE(dn1, dn2, n) \ + "lsr %5, %3, #16 \n" \ + "add %6, %1, %5, lsl #2 \n" \ + "add %3, %3, %4 \n" \ + "vld2.32 {" #dn1 "[" #n "], " #dn2 "[" #n "]}, [%6] \n" + +void ScaleARGBFilterCols_NEON(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx) { + int dx_offset[4] = {0, 1, 2, 3}; + int* tmp = dx_offset; + const uint8_t* src_tmp = src_argb; + asm volatile ( + "vdup.32 q0, %3 \n" // x + "vdup.32 q1, %4 \n" // dx + "vld1.32 {q2}, [%5] \n" // 0 1 2 3 + "vshl.i32 q9, q1, #2 \n" // 4 * dx + "vmul.s32 q1, q1, q2 \n" + "vmov.i8 q3, #0x7f \n" // 0x7F + "vmov.i16 q15, #0x7f \n" // 0x7F + // x , x + 1 * dx, x + 2 * dx, x + 3 * dx + "vadd.s32 q8, q1, q0 \n" + "1: \n" + // d0, d1: a + // d2, d3: b + LOAD2_DATA32_LANE(d0, d2, 0) + LOAD2_DATA32_LANE(d0, d2, 1) + LOAD2_DATA32_LANE(d1, d3, 0) + LOAD2_DATA32_LANE(d1, d3, 1) + "vshrn.i32 d22, q8, #9 \n" + "vand.16 d22, d22, d30 \n" + "vdup.8 d24, d22[0] \n" + "vdup.8 d25, d22[2] \n" + "vdup.8 d26, d22[4] \n" + "vdup.8 d27, d22[6] \n" + "vext.8 d4, d24, d25, #4 \n" + "vext.8 d5, d26, d27, #4 \n" // f + "veor.8 q10, q2, q3 \n" // 0x7f ^ f + "vmull.u8 q11, d0, d20 \n" + "vmull.u8 q12, d1, d21 \n" + "vmull.u8 q13, d2, d4 \n" + "vmull.u8 q14, d3, d5 \n" + "vadd.i16 q11, q11, q13 \n" + "vadd.i16 q12, q12, q14 \n" + "vshrn.i16 d0, q11, #7 \n" + "vshrn.i16 d1, q12, #7 \n" + + "vst1.32 {d0, d1}, [%0]! \n" // store pixels + "vadd.s32 q8, q8, q9 \n" + "subs %2, %2, #4 \n" // 4 processed per loop + "bgt 1b \n" + : "+r"(dst_argb), // %0 + "+r"(src_argb), // %1 + "+r"(dst_width), // %2 + "+r"(x), // %3 + "+r"(dx), // %4 + "+r"(tmp), // %5 + "+r"(src_tmp) // %6 + : + : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", + "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +#undef LOAD2_DATA32_LANE + +#endif // defined(__ARM_NEON__) && !defined(__aarch64__) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/media/libyuv/libyuv/source/scale_neon64.cc b/media/libyuv/libyuv/source/scale_neon64.cc new file mode 100644 index 0000000000..494a9cfbfb --- /dev/null +++ b/media/libyuv/libyuv/source/scale_neon64.cc @@ -0,0 +1,1064 @@ +/* + * Copyright 2014 The LibYuv Project Authors. All rights reserved. 
+ * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/row.h" +#include "libyuv/scale.h" +#include "libyuv/scale_row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// This module is for GCC Neon armv8 64 bit. +#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) + +// Read 32x1 throw away even pixels, and write 16x1. +void ScaleRowDown2_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + (void)src_stride; + asm volatile( + "1: \n" + // load even pixels into v0, odd into v1 + "ld2 {v0.16b,v1.16b}, [%0], #32 \n" + "subs %w2, %w2, #16 \n" // 16 processed per loop + "st1 {v1.16b}, [%1], #16 \n" // store odd pixels + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst), // %1 + "+r"(dst_width) // %2 + : + : "v0", "v1" // Clobber List + ); +} + +// Read 32x1 average down and write 16x1. +void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + (void)src_stride; + asm volatile( + "1: \n" + // load even pixels into v0, odd into v1 + "ld2 {v0.16b,v1.16b}, [%0], #32 \n" + "subs %w2, %w2, #16 \n" // 16 processed per loop + "urhadd v0.16b, v0.16b, v1.16b \n" // rounding half add + "st1 {v0.16b}, [%1], #16 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst), // %1 + "+r"(dst_width) // %2 + : + : "v0", "v1" // Clobber List + ); +} + +// Read 32x2 average down and write 16x1. +void ScaleRowDown2Box_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + asm volatile( + // change the stride to row 2 pointer + "add %1, %1, %0 \n" + "1: \n" + "ld1 {v0.16b, v1.16b}, [%0], #32 \n" // load row 1 and post inc + "ld1 {v2.16b, v3.16b}, [%1], #32 \n" // load row 2 and post inc + "subs %w3, %w3, #16 \n" // 16 processed per loop + "uaddlp v0.8h, v0.16b \n" // row 1 add adjacent + "uaddlp v1.8h, v1.16b \n" + "uadalp v0.8h, v2.16b \n" // += row 2 add adjacent + "uadalp v1.8h, v3.16b \n" + "rshrn v0.8b, v0.8h, #2 \n" // round and pack + "rshrn2 v0.16b, v1.8h, #2 \n" + "st1 {v0.16b}, [%2], #16 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_stride), // %1 + "+r"(dst), // %2 + "+r"(dst_width) // %3 + : + : "v0", "v1", "v2", "v3" // Clobber List + ); +} + +void ScaleRowDown4_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + (void)src_stride; + asm volatile( + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 + "subs %w2, %w2, #8 \n" // 8 processed per loop + "st1 {v2.8b}, [%1], #8 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "v0", "v1", "v2", "v3", "memory", "cc"); +} + +void ScaleRowDown4Box_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + const uint8_t* src_ptr1 = src_ptr + src_stride; + const uint8_t* src_ptr2 = src_ptr + src_stride * 2; + const uint8_t* src_ptr3 = src_ptr + src_stride * 3; + asm volatile( + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" // load up 16x4 + "ld1 {v1.16b}, [%2], #16 \n" + "ld1 {v2.16b}, [%3], #16 \n" + "ld1 {v3.16b}, [%4], #16 \n" + "subs %w5, %w5, #4 \n" + "uaddlp v0.8h, v0.16b \n" + "uadalp v0.8h, v1.16b \n" + "uadalp v0.8h, v2.16b \n" + "uadalp v0.8h, v3.16b \n" + "addp 
v0.8h, v0.8h, v0.8h \n" + "rshrn v0.8b, v0.8h, #4 \n" // divide by 16 w/rounding + "st1 {v0.s}[0], [%1], #4 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(src_ptr1), // %2 + "+r"(src_ptr2), // %3 + "+r"(src_ptr3), // %4 + "+r"(dst_width) // %5 + : + : "v0", "v1", "v2", "v3", "memory", "cc"); +} + +// Down scale from 4 to 3 pixels. Use the neon multilane read/write +// to load up the every 4th pixel into a 4 different registers. +// Point samples 32 pixels to 24 pixels. +void ScaleRowDown34_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + (void)src_stride; + asm volatile( + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 + "subs %w2, %w2, #24 \n" + "orr v2.16b, v3.16b, v3.16b \n" // order v0,v1,v2 + "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "v0", "v1", "v2", "v3", "memory", "cc"); +} + +void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + asm volatile( + "movi v20.8b, #3 \n" + "add %3, %3, %0 \n" + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1 + "subs %w2, %w2, #24 \n" + + // filter src line 0 with src line 1 + // expand chars to shorts to allow for room + // when adding lines together + "ushll v16.8h, v4.8b, #0 \n" + "ushll v17.8h, v5.8b, #0 \n" + "ushll v18.8h, v6.8b, #0 \n" + "ushll v19.8h, v7.8b, #0 \n" + + // 3 * line_0 + line_1 + "umlal v16.8h, v0.8b, v20.8b \n" + "umlal v17.8h, v1.8b, v20.8b \n" + "umlal v18.8h, v2.8b, v20.8b \n" + "umlal v19.8h, v3.8b, v20.8b \n" + + // (3 * line_0 + line_1) >> 2 + "uqrshrn v0.8b, v16.8h, #2 \n" + "uqrshrn v1.8b, v17.8h, #2 \n" + "uqrshrn v2.8b, v18.8h, #2 \n" + "uqrshrn v3.8b, v19.8h, #2 \n" + + // a0 = (src[0] * 3 + s[1] * 1) >> 2 + "ushll v16.8h, v1.8b, #0 \n" + "umlal v16.8h, v0.8b, v20.8b \n" + "uqrshrn v0.8b, v16.8h, #2 \n" + + // a1 = (src[1] * 1 + s[2] * 1) >> 1 + "urhadd v1.8b, v1.8b, v2.8b \n" + + // a2 = (src[2] * 1 + s[3] * 3) >> 2 + "ushll v16.8h, v2.8b, #0 \n" + "umlal v16.8h, v3.8b, v20.8b \n" + "uqrshrn v2.8b, v16.8h, #2 \n" + + "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" + + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_stride) // %3 + : + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", + "v19", "v20", "memory", "cc"); +} + +void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + asm volatile( + "movi v20.8b, #3 \n" + "add %3, %3, %0 \n" + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1 + "subs %w2, %w2, #24 \n" + // average src line 0 with src line 1 + "urhadd v0.8b, v0.8b, v4.8b \n" + "urhadd v1.8b, v1.8b, v5.8b \n" + "urhadd v2.8b, v2.8b, v6.8b \n" + "urhadd v3.8b, v3.8b, v7.8b \n" + + // a0 = (src[0] * 3 + s[1] * 1) >> 2 + "ushll v4.8h, v1.8b, #0 \n" + "umlal v4.8h, v0.8b, v20.8b \n" + "uqrshrn v0.8b, v4.8h, #2 \n" + + // a1 = (src[1] * 1 + s[2] * 1) >> 1 + "urhadd v1.8b, v1.8b, v2.8b \n" + + // a2 = (src[2] * 1 + s[3] * 3) >> 2 + "ushll v4.8h, v2.8b, #0 \n" + "umlal v4.8h, v3.8b, v20.8b \n" + "uqrshrn v2.8b, v4.8h, #2 \n" + + "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_stride) // %3 + : 
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "memory", "cc"); +} + +static const uvec8 kShuf38 = {0, 3, 6, 8, 11, 14, 16, 19, + 22, 24, 27, 30, 0, 0, 0, 0}; +static const uvec8 kShuf38_2 = {0, 16, 32, 2, 18, 33, 4, 20, + 34, 6, 22, 35, 0, 0, 0, 0}; +static const vec16 kMult38_Div6 = {65536 / 12, 65536 / 12, 65536 / 12, + 65536 / 12, 65536 / 12, 65536 / 12, + 65536 / 12, 65536 / 12}; +static const vec16 kMult38_Div9 = {65536 / 18, 65536 / 18, 65536 / 18, + 65536 / 18, 65536 / 18, 65536 / 18, + 65536 / 18, 65536 / 18}; + +// 32 -> 12 +void ScaleRowDown38_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + (void)src_stride; + asm volatile( + "ld1 {v3.16b}, [%3] \n" + "1: \n" + "ld1 {v0.16b,v1.16b}, [%0], #32 \n" + "subs %w2, %w2, #12 \n" + "tbl v2.16b, {v0.16b,v1.16b}, v3.16b \n" + "st1 {v2.8b}, [%1], #8 \n" + "st1 {v2.s}[2], [%1], #4 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"(&kShuf38) // %3 + : "v0", "v1", "v2", "v3", "memory", "cc"); +} + +// 32x3 -> 12x1 +void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + const uint8_t* src_ptr1 = src_ptr + src_stride * 2; + ptrdiff_t tmp_src_stride = src_stride; + + asm volatile( + "ld1 {v29.8h}, [%5] \n" + "ld1 {v30.16b}, [%6] \n" + "ld1 {v31.8h}, [%7] \n" + "add %2, %2, %0 \n" + "1: \n" + + // 00 40 01 41 02 42 03 43 + // 10 50 11 51 12 52 13 53 + // 20 60 21 61 22 62 23 63 + // 30 70 31 71 32 72 33 73 + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n" + "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%3], #32 \n" + "subs %w4, %w4, #12 \n" + + // Shuffle the input data around to get align the data + // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 + // 00 10 01 11 02 12 03 13 + // 40 50 41 51 42 52 43 53 + "trn1 v20.8b, v0.8b, v1.8b \n" + "trn2 v21.8b, v0.8b, v1.8b \n" + "trn1 v22.8b, v4.8b, v5.8b \n" + "trn2 v23.8b, v4.8b, v5.8b \n" + "trn1 v24.8b, v16.8b, v17.8b \n" + "trn2 v25.8b, v16.8b, v17.8b \n" + + // 20 30 21 31 22 32 23 33 + // 60 70 61 71 62 72 63 73 + "trn1 v0.8b, v2.8b, v3.8b \n" + "trn2 v1.8b, v2.8b, v3.8b \n" + "trn1 v4.8b, v6.8b, v7.8b \n" + "trn2 v5.8b, v6.8b, v7.8b \n" + "trn1 v16.8b, v18.8b, v19.8b \n" + "trn2 v17.8b, v18.8b, v19.8b \n" + + // 00+10 01+11 02+12 03+13 + // 40+50 41+51 42+52 43+53 + "uaddlp v20.4h, v20.8b \n" + "uaddlp v21.4h, v21.8b \n" + "uaddlp v22.4h, v22.8b \n" + "uaddlp v23.4h, v23.8b \n" + "uaddlp v24.4h, v24.8b \n" + "uaddlp v25.4h, v25.8b \n" + + // 60+70 61+71 62+72 63+73 + "uaddlp v1.4h, v1.8b \n" + "uaddlp v5.4h, v5.8b \n" + "uaddlp v17.4h, v17.8b \n" + + // combine source lines + "add v20.4h, v20.4h, v22.4h \n" + "add v21.4h, v21.4h, v23.4h \n" + "add v20.4h, v20.4h, v24.4h \n" + "add v21.4h, v21.4h, v25.4h \n" + "add v2.4h, v1.4h, v5.4h \n" + "add v2.4h, v2.4h, v17.4h \n" + + // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0] + // + s[6 + st * 1] + s[7 + st * 1] + // + s[6 + st * 2] + s[7 + st * 2]) / 6 + "sqrdmulh v2.8h, v2.8h, v29.8h \n" + "xtn v2.8b, v2.8h \n" + + // Shuffle 2,3 reg around so that 2 can be added to the + // 0,1 reg and 3 can be added to the 4,5 reg. This + // requires expanding from u8 to u16 as the 0,1 and 4,5 + // registers are already expanded. Then do transposes + // to get aligned. 
+ // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 + "ushll v16.8h, v16.8b, #0 \n" + "uaddl v0.8h, v0.8b, v4.8b \n" + + // combine source lines + "add v0.8h, v0.8h, v16.8h \n" + + // xx 20 xx 21 xx 22 xx 23 + // xx 30 xx 31 xx 32 xx 33 + "trn1 v1.8h, v0.8h, v0.8h \n" + "trn2 v4.8h, v0.8h, v0.8h \n" + "xtn v0.4h, v1.4s \n" + "xtn v4.4h, v4.4s \n" + + // 0+1+2, 3+4+5 + "add v20.8h, v20.8h, v0.8h \n" + "add v21.8h, v21.8h, v4.8h \n" + + // Need to divide, but can't downshift as the the value + // isn't a power of 2. So multiply by 65536 / n + // and take the upper 16 bits. + "sqrdmulh v0.8h, v20.8h, v31.8h \n" + "sqrdmulh v1.8h, v21.8h, v31.8h \n" + + // Align for table lookup, vtbl requires registers to be adjacent + "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v30.16b \n" + + "st1 {v3.8b}, [%1], #8 \n" + "st1 {v3.s}[2], [%1], #4 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(tmp_src_stride), // %2 + "+r"(src_ptr1), // %3 + "+r"(dst_width) // %4 + : "r"(&kMult38_Div6), // %5 + "r"(&kShuf38_2), // %6 + "r"(&kMult38_Div9) // %7 + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", + "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v29", "v30", "v31", + "memory", "cc"); +} + +// 32x2 -> 12x1 +void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + // TODO(fbarchard): use src_stride directly for clang 3.5+. + ptrdiff_t tmp_src_stride = src_stride; + asm volatile( + "ld1 {v30.8h}, [%4] \n" + "ld1 {v31.16b}, [%5] \n" + "add %2, %2, %0 \n" + "1: \n" + + // 00 40 01 41 02 42 03 43 + // 10 50 11 51 12 52 13 53 + // 20 60 21 61 22 62 23 63 + // 30 70 31 71 32 72 33 73 + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n" + "subs %w3, %w3, #12 \n" + + // Shuffle the input data around to get align the data + // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 + // 00 10 01 11 02 12 03 13 + // 40 50 41 51 42 52 43 53 + "trn1 v16.8b, v0.8b, v1.8b \n" + "trn2 v17.8b, v0.8b, v1.8b \n" + "trn1 v18.8b, v4.8b, v5.8b \n" + "trn2 v19.8b, v4.8b, v5.8b \n" + + // 20 30 21 31 22 32 23 33 + // 60 70 61 71 62 72 63 73 + "trn1 v0.8b, v2.8b, v3.8b \n" + "trn2 v1.8b, v2.8b, v3.8b \n" + "trn1 v4.8b, v6.8b, v7.8b \n" + "trn2 v5.8b, v6.8b, v7.8b \n" + + // 00+10 01+11 02+12 03+13 + // 40+50 41+51 42+52 43+53 + "uaddlp v16.4h, v16.8b \n" + "uaddlp v17.4h, v17.8b \n" + "uaddlp v18.4h, v18.8b \n" + "uaddlp v19.4h, v19.8b \n" + + // 60+70 61+71 62+72 63+73 + "uaddlp v1.4h, v1.8b \n" + "uaddlp v5.4h, v5.8b \n" + + // combine source lines + "add v16.4h, v16.4h, v18.4h \n" + "add v17.4h, v17.4h, v19.4h \n" + "add v2.4h, v1.4h, v5.4h \n" + + // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4 + "uqrshrn v2.8b, v2.8h, #2 \n" + + // Shuffle 2,3 reg around so that 2 can be added to the + // 0,1 reg and 3 can be added to the 4,5 reg. This + // requires expanding from u8 to u16 as the 0,1 and 4,5 + // registers are already expanded. Then do transposes + // to get aligned. + // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 + + // combine source lines + "uaddl v0.8h, v0.8b, v4.8b \n" + + // xx 20 xx 21 xx 22 xx 23 + // xx 30 xx 31 xx 32 xx 33 + "trn1 v1.8h, v0.8h, v0.8h \n" + "trn2 v4.8h, v0.8h, v0.8h \n" + "xtn v0.4h, v1.4s \n" + "xtn v4.4h, v4.4s \n" + + // 0+1+2, 3+4+5 + "add v16.8h, v16.8h, v0.8h \n" + "add v17.8h, v17.8h, v4.8h \n" + + // Need to divide, but can't downshift as the the value + // isn't a power of 2. So multiply by 65536 / n + // and take the upper 16 bits. 
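+ // sqrdmulh computes (2 * a * b + 0x8000) >> 16, so multiplying each lane
+ // (a sum of 6 source pixels) by 65536 / 12 gives approximately sum / 6.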
+ "sqrdmulh v0.8h, v16.8h, v30.8h \n" + "sqrdmulh v1.8h, v17.8h, v30.8h \n" + + // Align for table lookup, vtbl requires registers to + // be adjacent + + "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v31.16b \n" + + "st1 {v3.8b}, [%1], #8 \n" + "st1 {v3.s}[2], [%1], #4 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(tmp_src_stride), // %2 + "+r"(dst_width) // %3 + : "r"(&kMult38_Div6), // %4 + "r"(&kShuf38_2) // %5 + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", + "v19", "v30", "v31", "memory", "cc"); +} + +void ScaleAddRows_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst_ptr, + int src_width, + int src_height) { + const uint8_t* src_tmp; + asm volatile( + "1: \n" + "mov %0, %1 \n" + "mov w12, %w5 \n" + "eor v2.16b, v2.16b, v2.16b \n" + "eor v3.16b, v3.16b, v3.16b \n" + "2: \n" + // load 16 pixels into q0 + "ld1 {v0.16b}, [%0], %3 \n" + "uaddw2 v3.8h, v3.8h, v0.16b \n" + "uaddw v2.8h, v2.8h, v0.8b \n" + "subs w12, w12, #1 \n" + "b.gt 2b \n" + "st1 {v2.8h, v3.8h}, [%2], #32 \n" // store pixels + "add %1, %1, #16 \n" + "subs %w4, %w4, #16 \n" // 16 processed per loop + "b.gt 1b \n" + : "=&r"(src_tmp), // %0 + "+r"(src_ptr), // %1 + "+r"(dst_ptr), // %2 + "+r"(src_stride), // %3 + "+r"(src_width), // %4 + "+r"(src_height) // %5 + : + : "memory", "cc", "w12", "v0", "v1", "v2", "v3" // Clobber List + ); +} + +// TODO(Yang Zhang): Investigate less load instructions for +// the x/dx stepping +#define LOAD2_DATA8_LANE(n) \ + "lsr %5, %3, #16 \n" \ + "add %6, %1, %5 \n" \ + "add %3, %3, %4 \n" \ + "ld2 {v4.b, v5.b}[" #n "], [%6] \n" + +// The NEON version mimics this formula (from row_common.cc): +// #define BLENDER(a, b, f) (uint8_t)((int)(a) + +// ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16)) + +void ScaleFilterCols_NEON(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx) { + int dx_offset[4] = {0, 1, 2, 3}; + int* tmp = dx_offset; + const uint8_t* src_tmp = src_ptr; + int64_t x64 = (int64_t)x; // NOLINT + int64_t dx64 = (int64_t)dx; // NOLINT + asm volatile ( + "dup v0.4s, %w3 \n" // x + "dup v1.4s, %w4 \n" // dx + "ld1 {v2.4s}, [%5] \n" // 0 1 2 3 + "shl v3.4s, v1.4s, #2 \n" // 4 * dx + "mul v1.4s, v1.4s, v2.4s \n" + // x , x + 1 * dx, x + 2 * dx, x + 3 * dx + "add v1.4s, v1.4s, v0.4s \n" + // x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx + "add v2.4s, v1.4s, v3.4s \n" + "shl v0.4s, v3.4s, #1 \n" // 8 * dx + "1: \n" + LOAD2_DATA8_LANE(0) + LOAD2_DATA8_LANE(1) + LOAD2_DATA8_LANE(2) + LOAD2_DATA8_LANE(3) + LOAD2_DATA8_LANE(4) + LOAD2_DATA8_LANE(5) + LOAD2_DATA8_LANE(6) + LOAD2_DATA8_LANE(7) + "mov v6.16b, v1.16b \n" + "mov v7.16b, v2.16b \n" + "uzp1 v6.8h, v6.8h, v7.8h \n" + "ushll v4.8h, v4.8b, #0 \n" + "ushll v5.8h, v5.8b, #0 \n" + "ssubl v16.4s, v5.4h, v4.4h \n" + "ssubl2 v17.4s, v5.8h, v4.8h \n" + "ushll v7.4s, v6.4h, #0 \n" + "ushll2 v6.4s, v6.8h, #0 \n" + "mul v16.4s, v16.4s, v7.4s \n" + "mul v17.4s, v17.4s, v6.4s \n" + "rshrn v6.4h, v16.4s, #16 \n" + "rshrn2 v6.8h, v17.4s, #16 \n" + "add v4.8h, v4.8h, v6.8h \n" + "xtn v4.8b, v4.8h \n" + + "st1 {v4.8b}, [%0], #8 \n" // store pixels + "add v1.4s, v1.4s, v0.4s \n" + "add v2.4s, v2.4s, v0.4s \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop + "b.gt 1b \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(x64), // %3 + "+r"(dx64), // %4 + "+r"(tmp), // %5 + "+r"(src_tmp) // %6 + : + : "memory", "cc", "v0", "v1", "v2", "v3", + "v4", "v5", "v6", "v7", "v16", "v17" + ); +} + +#undef LOAD2_DATA8_LANE + +// 16x2 
-> 16x1 +void ScaleFilterRows_NEON(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride, + int dst_width, + int source_y_fraction) { + int y_fraction = 256 - source_y_fraction; + asm volatile( + "cmp %w4, #0 \n" + "b.eq 100f \n" + "add %2, %2, %1 \n" + "cmp %w4, #64 \n" + "b.eq 75f \n" + "cmp %w4, #128 \n" + "b.eq 50f \n" + "cmp %w4, #192 \n" + "b.eq 25f \n" + + "dup v5.8b, %w4 \n" + "dup v4.8b, %w5 \n" + // General purpose row blend. + "1: \n" + "ld1 {v0.16b}, [%1], #16 \n" + "ld1 {v1.16b}, [%2], #16 \n" + "subs %w3, %w3, #16 \n" + "umull v6.8h, v0.8b, v4.8b \n" + "umull2 v7.8h, v0.16b, v4.16b \n" + "umlal v6.8h, v1.8b, v5.8b \n" + "umlal2 v7.8h, v1.16b, v5.16b \n" + "rshrn v0.8b, v6.8h, #8 \n" + "rshrn2 v0.16b, v7.8h, #8 \n" + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 1b \n" + "b 99f \n" + + // Blend 25 / 75. + "25: \n" + "ld1 {v0.16b}, [%1], #16 \n" + "ld1 {v1.16b}, [%2], #16 \n" + "subs %w3, %w3, #16 \n" + "urhadd v0.16b, v0.16b, v1.16b \n" + "urhadd v0.16b, v0.16b, v1.16b \n" + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 25b \n" + "b 99f \n" + + // Blend 50 / 50. + "50: \n" + "ld1 {v0.16b}, [%1], #16 \n" + "ld1 {v1.16b}, [%2], #16 \n" + "subs %w3, %w3, #16 \n" + "urhadd v0.16b, v0.16b, v1.16b \n" + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 50b \n" + "b 99f \n" + + // Blend 75 / 25. + "75: \n" + "ld1 {v1.16b}, [%1], #16 \n" + "ld1 {v0.16b}, [%2], #16 \n" + "subs %w3, %w3, #16 \n" + "urhadd v0.16b, v0.16b, v1.16b \n" + "urhadd v0.16b, v0.16b, v1.16b \n" + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 75b \n" + "b 99f \n" + + // Blend 100 / 0 - Copy row unchanged. + "100: \n" + "ld1 {v0.16b}, [%1], #16 \n" + "subs %w3, %w3, #16 \n" + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 100b \n" + + "99: \n" + "st1 {v0.b}[15], [%0] \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(src_stride), // %2 + "+r"(dst_width), // %3 + "+r"(source_y_fraction), // %4 + "+r"(y_fraction) // %5 + : + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "memory", "cc"); +} + +void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + (void)src_stride; + asm volatile( + "1: \n" + // load 16 ARGB pixels with even pixels into q0/q2, odd into q1/q3 + "ld4 {v0.4s,v1.4s,v2.4s,v3.4s}, [%0], #64 \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop + "mov v2.16b, v3.16b \n" + "st2 {v1.4s,v2.4s}, [%1], #32 \n" // store 8 odd pixels + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "v0", "v1", "v2", "v3" // Clobber List + ); +} + +void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width) { + (void)src_stride; + asm volatile( + "1: \n" + // load 16 ARGB pixels with even pixels into q0/q2, odd into q1/q3 + "ld4 {v0.4s,v1.4s,v2.4s,v3.4s}, [%0], #64 \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop + + "urhadd v0.16b, v0.16b, v1.16b \n" // rounding half add + "urhadd v1.16b, v2.16b, v3.16b \n" + "st2 {v0.4s,v1.4s}, [%1], #32 \n" // store 8 pixels + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "v0", "v1", "v2", "v3" // Clobber List + ); +} + +void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + asm volatile( + // change the stride to row 2 pointer + "add %1, %1, %0 \n" + "1: \n" + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 8 ARGB + "subs %w3, %w3, #8 \n" // 8 processed per loop. 
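+ // Each output channel is the rounded average of a 2x2 block:
+ // dst = (a + b + c + d + 2) >> 2.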
+ "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. + "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. + "uaddlp v3.8h, v3.16b \n" // A 16 bytes -> 8 shorts. + "ld4 {v16.16b,v17.16b,v18.16b,v19.16b}, [%1], #64 \n" // load 8 + "uadalp v0.8h, v16.16b \n" // B 16 bytes -> 8 shorts. + "uadalp v1.8h, v17.16b \n" // G 16 bytes -> 8 shorts. + "uadalp v2.8h, v18.16b \n" // R 16 bytes -> 8 shorts. + "uadalp v3.8h, v19.16b \n" // A 16 bytes -> 8 shorts. + "rshrn v0.8b, v0.8h, #2 \n" // round and pack + "rshrn v1.8b, v1.8h, #2 \n" + "rshrn v2.8b, v2.8h, #2 \n" + "rshrn v3.8b, v3.8h, #2 \n" + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_stride), // %1 + "+r"(dst), // %2 + "+r"(dst_width) // %3 + : + : "memory", "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19"); +} + +// Reads 4 pixels at a time. +// Alignment requirement: src_argb 4 byte aligned. +void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_argb, + int dst_width) { + (void)src_stride; + asm volatile( + "1: \n" + "ld1 {v0.s}[0], [%0], %3 \n" + "ld1 {v0.s}[1], [%0], %3 \n" + "ld1 {v0.s}[2], [%0], %3 \n" + "ld1 {v0.s}[3], [%0], %3 \n" + "subs %w2, %w2, #4 \n" // 4 pixels per loop. + "st1 {v0.16b}, [%1], #16 \n" + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + : "r"((int64_t)(src_stepx * 4)) // %3 + : "memory", "cc", "v0"); +} + +// Reads 4 pixels at a time. +// Alignment requirement: src_argb 4 byte aligned. +// TODO(Yang Zhang): Might be worth another optimization pass in future. +// It could be upgraded to 8 pixels at a time to start with. +void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_argb, + int dst_width) { + asm volatile( + "add %1, %1, %0 \n" + "1: \n" + "ld1 {v0.8b}, [%0], %4 \n" // Read 4 2x2 -> 2x1 + "ld1 {v1.8b}, [%1], %4 \n" + "ld1 {v2.8b}, [%0], %4 \n" + "ld1 {v3.8b}, [%1], %4 \n" + "ld1 {v4.8b}, [%0], %4 \n" + "ld1 {v5.8b}, [%1], %4 \n" + "ld1 {v6.8b}, [%0], %4 \n" + "ld1 {v7.8b}, [%1], %4 \n" + "uaddl v0.8h, v0.8b, v1.8b \n" + "uaddl v2.8h, v2.8b, v3.8b \n" + "uaddl v4.8h, v4.8b, v5.8b \n" + "uaddl v6.8h, v6.8b, v7.8b \n" + "mov v16.d[1], v0.d[1] \n" // ab_cd -> ac_bd + "mov v0.d[1], v2.d[0] \n" + "mov v2.d[0], v16.d[1] \n" + "mov v16.d[1], v4.d[1] \n" // ef_gh -> eg_fh + "mov v4.d[1], v6.d[0] \n" + "mov v6.d[0], v16.d[1] \n" + "add v0.8h, v0.8h, v2.8h \n" // (a+b)_(c+d) + "add v4.8h, v4.8h, v6.8h \n" // (e+f)_(g+h) + "rshrn v0.8b, v0.8h, #2 \n" // first 2 pixels. + "rshrn2 v0.16b, v4.8h, #2 \n" // next 2 pixels. + "subs %w3, %w3, #4 \n" // 4 pixels per loop. 
+ "st1 {v0.16b}, [%2], #16 \n" + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(src_stride), // %1 + "+r"(dst_argb), // %2 + "+r"(dst_width) // %3 + : "r"((int64_t)(src_stepx * 4)) // %4 + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"); +} + +// TODO(Yang Zhang): Investigate less load instructions for +// the x/dx stepping +#define LOAD1_DATA32_LANE(vn, n) \ + "lsr %5, %3, #16 \n" \ + "add %6, %1, %5, lsl #2 \n" \ + "add %3, %3, %4 \n" \ + "ld1 {" #vn ".s}[" #n "], [%6] \n" + +void ScaleARGBCols_NEON(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx) { + const uint8_t* src_tmp = src_argb; + int64_t x64 = (int64_t)x; // NOLINT + int64_t dx64 = (int64_t)dx; // NOLINT + int64_t tmp64; + asm volatile( + "1: \n" + // clang-format off + LOAD1_DATA32_LANE(v0, 0) + LOAD1_DATA32_LANE(v0, 1) + LOAD1_DATA32_LANE(v0, 2) + LOAD1_DATA32_LANE(v0, 3) + LOAD1_DATA32_LANE(v1, 0) + LOAD1_DATA32_LANE(v1, 1) + LOAD1_DATA32_LANE(v1, 2) + LOAD1_DATA32_LANE(v1, 3) + // clang-format on + "st1 {v0.4s, v1.4s}, [%0], #32 \n" // store pixels + "subs %w2, %w2, #8 \n" // 8 processed per loop + "b.gt 1b \n" + : "+r"(dst_argb), // %0 + "+r"(src_argb), // %1 + "+r"(dst_width), // %2 + "+r"(x64), // %3 + "+r"(dx64), // %4 + "=&r"(tmp64), // %5 + "+r"(src_tmp) // %6 + : + : "memory", "cc", "v0", "v1"); +} + +#undef LOAD1_DATA32_LANE + +// TODO(Yang Zhang): Investigate less load instructions for +// the x/dx stepping +#define LOAD2_DATA32_LANE(vn1, vn2, n) \ + "lsr %5, %3, #16 \n" \ + "add %6, %1, %5, lsl #2 \n" \ + "add %3, %3, %4 \n" \ + "ld2 {" #vn1 ".s, " #vn2 ".s}[" #n "], [%6] \n" + +void ScaleARGBFilterCols_NEON(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx) { + int dx_offset[4] = {0, 1, 2, 3}; + int* tmp = dx_offset; + const uint8_t* src_tmp = src_argb; + int64_t x64 = (int64_t)x; // NOLINT + int64_t dx64 = (int64_t)dx; // NOLINT + asm volatile ( + "dup v0.4s, %w3 \n" // x + "dup v1.4s, %w4 \n" // dx + "ld1 {v2.4s}, [%5] \n" // 0 1 2 3 + "shl v6.4s, v1.4s, #2 \n" // 4 * dx + "mul v1.4s, v1.4s, v2.4s \n" + "movi v3.16b, #0x7f \n" // 0x7F + "movi v4.8h, #0x7f \n" // 0x7F + // x , x + 1 * dx, x + 2 * dx, x + 3 * dx + "add v5.4s, v1.4s, v0.4s \n" + "1: \n" + // d0, d1: a + // d2, d3: b + LOAD2_DATA32_LANE(v0, v1, 0) + LOAD2_DATA32_LANE(v0, v1, 1) + LOAD2_DATA32_LANE(v0, v1, 2) + LOAD2_DATA32_LANE(v0, v1, 3) + "shrn v2.4h, v5.4s, #9 \n" + "and v2.8b, v2.8b, v4.8b \n" + "dup v16.8b, v2.b[0] \n" + "dup v17.8b, v2.b[2] \n" + "dup v18.8b, v2.b[4] \n" + "dup v19.8b, v2.b[6] \n" + "ext v2.8b, v16.8b, v17.8b, #4 \n" + "ext v17.8b, v18.8b, v19.8b, #4 \n" + "ins v2.d[1], v17.d[0] \n" // f + "eor v7.16b, v2.16b, v3.16b \n" // 0x7f ^ f + "umull v16.8h, v0.8b, v7.8b \n" + "umull2 v17.8h, v0.16b, v7.16b \n" + "umull v18.8h, v1.8b, v2.8b \n" + "umull2 v19.8h, v1.16b, v2.16b \n" + "add v16.8h, v16.8h, v18.8h \n" + "add v17.8h, v17.8h, v19.8h \n" + "shrn v0.8b, v16.8h, #7 \n" + "shrn2 v0.16b, v17.8h, #7 \n" + + "st1 {v0.4s}, [%0], #16 \n" // store pixels + "add v5.4s, v5.4s, v6.4s \n" + "subs %w2, %w2, #4 \n" // 4 processed per loop + "b.gt 1b \n" + : "+r"(dst_argb), // %0 + "+r"(src_argb), // %1 + "+r"(dst_width), // %2 + "+r"(x64), // %3 + "+r"(dx64), // %4 + "+r"(tmp), // %5 + "+r"(src_tmp) // %6 + : + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", + "v6", "v7", "v16", "v17", "v18", "v19" + ); +} + +#undef LOAD2_DATA32_LANE + +// Read 16x2 average down and write 8x1. 
+void ScaleRowDown2Box_16_NEON(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width) { + asm volatile( + // change the stride to row 2 pointer + "add %1, %0, %1, lsl #1 \n" // ptr + stide * 2 + "1: \n" + "ld1 {v0.8h, v1.8h}, [%0], #32 \n" // load row 1 and post inc + "ld1 {v2.8h, v3.8h}, [%1], #32 \n" // load row 2 and post inc + "subs %w3, %w3, #8 \n" // 8 processed per loop + "uaddlp v0.4s, v0.8h \n" // row 1 add adjacent + "uaddlp v1.4s, v1.8h \n" + "uadalp v0.4s, v2.8h \n" // +row 2 add adjacent + "uadalp v1.4s, v3.8h \n" + "rshrn v0.4h, v0.4s, #2 \n" // round and pack + "rshrn2 v0.8h, v1.4s, #2 \n" + "st1 {v0.8h}, [%2], #16 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_stride), // %1 + "+r"(dst), // %2 + "+r"(dst_width) // %3 + : + : "v0", "v1", "v2", "v3" // Clobber List + ); +} + +// Read 8x2 upsample with filtering and write 16x1. +// Actually reads an extra pixel, so 9x2. +void ScaleRowUp2_16_NEON(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width) { + asm volatile( + "add %1, %0, %1, lsl #1 \n" // ptr + stide * 2 + "movi v0.8h, #9 \n" // constants + "movi v1.4s, #3 \n" + + "1: \n" + "ld1 {v3.8h}, [%0], %4 \n" // TL read first 8 + "ld1 {v4.8h}, [%0], %5 \n" // TR read 8 offset by 1 + "ld1 {v5.8h}, [%1], %4 \n" // BL read 8 from next row + "ld1 {v6.8h}, [%1], %5 \n" // BR offset by 1 + "subs %w3, %w3, #16 \n" // 16 dst pixels per loop + "umull v16.4s, v3.4h, v0.4h \n" + "umull2 v7.4s, v3.8h, v0.8h \n" + "umull v18.4s, v4.4h, v0.4h \n" + "umull2 v17.4s, v4.8h, v0.8h \n" + "uaddw v16.4s, v16.4s, v6.4h \n" + "uaddl2 v19.4s, v6.8h, v3.8h \n" + "uaddl v3.4s, v6.4h, v3.4h \n" + "uaddw2 v6.4s, v7.4s, v6.8h \n" + "uaddl2 v7.4s, v5.8h, v4.8h \n" + "uaddl v4.4s, v5.4h, v4.4h \n" + "uaddw v18.4s, v18.4s, v5.4h \n" + "mla v16.4s, v4.4s, v1.4s \n" + "mla v18.4s, v3.4s, v1.4s \n" + "mla v6.4s, v7.4s, v1.4s \n" + "uaddw2 v4.4s, v17.4s, v5.8h \n" + "uqrshrn v16.4h, v16.4s, #4 \n" + "mla v4.4s, v19.4s, v1.4s \n" + "uqrshrn2 v16.8h, v6.4s, #4 \n" + "uqrshrn v17.4h, v18.4s, #4 \n" + "uqrshrn2 v17.8h, v4.4s, #4 \n" + "st2 {v16.8h-v17.8h}, [%2], #32 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_stride), // %1 + "+r"(dst), // %2 + "+r"(dst_width) // %3 + : "r"(2LL), // %4 + "r"(14LL) // %5 + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", + "v19" // Clobber List + ); +} + +#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/media/libyuv/libyuv/source/scale_win.cc b/media/libyuv/libyuv/source/scale_win.cc new file mode 100644 index 0000000000..c5fc86f3e9 --- /dev/null +++ b/media/libyuv/libyuv/source/scale_win.cc @@ -0,0 +1,1391 @@ +/* + * Copyright 2013 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "libyuv/row.h" +#include "libyuv/scale_row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// This module is for 32 bit Visual C x86 and clangcl +#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) + +// Offsets for source bytes 0 to 9 +static const uvec8 kShuf0 = {0, 1, 3, 4, 5, 7, 8, 9, + 128, 128, 128, 128, 128, 128, 128, 128}; + +// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12. +static const uvec8 kShuf1 = {3, 4, 5, 7, 8, 9, 11, 12, + 128, 128, 128, 128, 128, 128, 128, 128}; + +// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. +static const uvec8 kShuf2 = {5, 7, 8, 9, 11, 12, 13, 15, + 128, 128, 128, 128, 128, 128, 128, 128}; + +// Offsets for source bytes 0 to 10 +static const uvec8 kShuf01 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10}; + +// Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13. +static const uvec8 kShuf11 = {2, 3, 4, 5, 5, 6, 6, 7, + 8, 9, 9, 10, 10, 11, 12, 13}; + +// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. +static const uvec8 kShuf21 = {5, 6, 6, 7, 8, 9, 9, 10, + 10, 11, 12, 13, 13, 14, 14, 15}; + +// Coefficients for source bytes 0 to 10 +static const uvec8 kMadd01 = {3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2}; + +// Coefficients for source bytes 10 to 21 +static const uvec8 kMadd11 = {1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1}; + +// Coefficients for source bytes 21 to 31 +static const uvec8 kMadd21 = {2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3}; + +// Coefficients for source bytes 21 to 31 +static const vec16 kRound34 = {2, 2, 2, 2, 2, 2, 2, 2}; + +static const uvec8 kShuf38a = {0, 3, 6, 8, 11, 14, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128}; + +static const uvec8 kShuf38b = {128, 128, 128, 128, 128, 128, 0, 3, + 6, 8, 11, 14, 128, 128, 128, 128}; + +// Arrange words 0,3,6 into 0,1,2 +static const uvec8 kShufAc = {0, 1, 6, 7, 12, 13, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128}; + +// Arrange words 0,3,6 into 3,4,5 +static const uvec8 kShufAc3 = {128, 128, 128, 128, 128, 128, 0, 1, + 6, 7, 12, 13, 128, 128, 128, 128}; + +// Scaling values for boxes of 3x3 and 2x3 +static const uvec16 kScaleAc33 = {65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, + 65536 / 9, 65536 / 6, 0, 0}; + +// Arrange first value for pixels 0,1,2,3,4,5 +static const uvec8 kShufAb0 = {0, 128, 3, 128, 6, 128, 8, 128, + 11, 128, 14, 128, 128, 128, 128, 128}; + +// Arrange second value for pixels 0,1,2,3,4,5 +static const uvec8 kShufAb1 = {1, 128, 4, 128, 7, 128, 9, 128, + 12, 128, 15, 128, 128, 128, 128, 128}; + +// Arrange third value for pixels 0,1,2,3,4,5 +static const uvec8 kShufAb2 = {2, 128, 5, 128, 128, 128, 10, 128, + 13, 128, 128, 128, 128, 128, 128, 128}; + +// Scaling values for boxes of 3x2 and 2x2 +static const uvec16 kScaleAb2 = {65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, + 65536 / 3, 65536 / 2, 0, 0}; + +// Reads 32 pixels, throws half away and writes 16 pixels. +__declspec(naked) void ScaleRowDown2_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + __asm { + mov eax, [esp + 4] // src_ptr + // src_stride ignored + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width + + wloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + lea eax, [eax + 32] + psrlw xmm0, 8 // isolate odd pixels. + psrlw xmm1, 8 + packuswb xmm0, xmm1 + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 16 + jg wloop + + ret + } +} + +// Blends 32x1 rectangle to 16x1. 
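+// Illustrative C sketch (not a libyuv identifier): each output byte is the
+// rounded average of a horizontal pair, which is what the pmaddubsw by
+// 0x0101 followed by pavgw with zero computes below.
+static void ScaleRowDown2Linear_C_sketch(const uint8_t* src_ptr,
+                                         uint8_t* dst_ptr,
+                                         int dst_width) {
+  int x;
+  for (x = 0; x < dst_width; ++x) {
+    dst_ptr[x] = (uint8_t)((src_ptr[2 * x] + src_ptr[2 * x + 1] + 1) >> 1);
+  }
+}
+// The SSSE3 version below writes 16 output pixels per loop iteration.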
+__declspec(naked) void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + __asm { + mov eax, [esp + 4] // src_ptr + // src_stride + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width + + pcmpeqb xmm4, xmm4 // constant 0x0101 + psrlw xmm4, 15 + packuswb xmm4, xmm4 + pxor xmm5, xmm5 // constant 0 + + wloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + lea eax, [eax + 32] + pmaddubsw xmm0, xmm4 // horizontal add + pmaddubsw xmm1, xmm4 + pavgw xmm0, xmm5 // (x + 1) / 2 + pavgw xmm1, xmm5 + packuswb xmm0, xmm1 + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 16 + jg wloop + + ret + } +} + +// Blends 32x2 rectangle to 16x1. +__declspec(naked) void ScaleRowDown2Box_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_ptr + mov esi, [esp + 4 + 8] // src_stride + mov edx, [esp + 4 + 12] // dst_ptr + mov ecx, [esp + 4 + 16] // dst_width + + pcmpeqb xmm4, xmm4 // constant 0x0101 + psrlw xmm4, 15 + packuswb xmm4, xmm4 + pxor xmm5, xmm5 // constant 0 + + wloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + esi] + movdqu xmm3, [eax + esi + 16] + lea eax, [eax + 32] + pmaddubsw xmm0, xmm4 // horizontal add + pmaddubsw xmm1, xmm4 + pmaddubsw xmm2, xmm4 + pmaddubsw xmm3, xmm4 + paddw xmm0, xmm2 // vertical add + paddw xmm1, xmm3 + psrlw xmm0, 1 + psrlw xmm1, 1 + pavgw xmm0, xmm5 // (x + 1) / 2 + pavgw xmm1, xmm5 + packuswb xmm0, xmm1 + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 16 + jg wloop + + pop esi + ret + } +} + +#ifdef HAS_SCALEROWDOWN2_AVX2 +// Reads 64 pixels, throws half away and writes 32 pixels. +__declspec(naked) void ScaleRowDown2_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + __asm { + mov eax, [esp + 4] // src_ptr + // src_stride ignored + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width + + wloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + lea eax, [eax + 64] + vpsrlw ymm0, ymm0, 8 // isolate odd pixels. + vpsrlw ymm1, ymm1, 8 + vpackuswb ymm0, ymm0, ymm1 + vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb + vmovdqu [edx], ymm0 + lea edx, [edx + 32] + sub ecx, 32 + jg wloop + + vzeroupper + ret + } +} + +// Blends 64x1 rectangle to 32x1. +__declspec(naked) void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + __asm { + mov eax, [esp + 4] // src_ptr + // src_stride + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width + + vpcmpeqb ymm4, ymm4, ymm4 // '1' constant, 8b + vpsrlw ymm4, ymm4, 15 + vpackuswb ymm4, ymm4, ymm4 + vpxor ymm5, ymm5, ymm5 // constant 0 + + wloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + lea eax, [eax + 64] + vpmaddubsw ymm0, ymm0, ymm4 // horizontal add + vpmaddubsw ymm1, ymm1, ymm4 + vpavgw ymm0, ymm0, ymm5 // (x + 1) / 2 + vpavgw ymm1, ymm1, ymm5 + vpackuswb ymm0, ymm0, ymm1 + vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb + vmovdqu [edx], ymm0 + lea edx, [edx + 32] + sub ecx, 32 + jg wloop + + vzeroupper + ret + } +} + +// For rounding, average = (sum + 2) / 4 +// becomes average((sum >> 1), 0) +// Blends 64x2 rectangle to 32x1. 
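+// The identity above in plain C (illustrative only, not a libyuv function):
+// the exact box average (a + b + c + d + 2) >> 2 equals
+// ((sum >> 1) + 1) >> 1, which is what vpsrlw plus vpavgw-with-zero compute
+// on the pmaddubsw pair sums.
+static uint8_t Average2x2_sketch(uint8_t a, uint8_t b, uint8_t c, uint8_t d) {
+  unsigned int sum = (unsigned int)a + b + c + d;  // 0..1020
+  unsigned int half = sum >> 1;                    // vpsrlw by 1
+  return (uint8_t)((half + 1) >> 1);               // vpavgw with zero rounds
+}
+// The AVX2 version below applies this to a 64x2 block per iteration.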
+__declspec(naked) void ScaleRowDown2Box_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_ptr + mov esi, [esp + 4 + 8] // src_stride + mov edx, [esp + 4 + 12] // dst_ptr + mov ecx, [esp + 4 + 16] // dst_width + + vpcmpeqb ymm4, ymm4, ymm4 // '1' constant, 8b + vpsrlw ymm4, ymm4, 15 + vpackuswb ymm4, ymm4, ymm4 + vpxor ymm5, ymm5, ymm5 // constant 0 + + wloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + vmovdqu ymm2, [eax + esi] + vmovdqu ymm3, [eax + esi + 32] + lea eax, [eax + 64] + vpmaddubsw ymm0, ymm0, ymm4 // horizontal add + vpmaddubsw ymm1, ymm1, ymm4 + vpmaddubsw ymm2, ymm2, ymm4 + vpmaddubsw ymm3, ymm3, ymm4 + vpaddw ymm0, ymm0, ymm2 // vertical add + vpaddw ymm1, ymm1, ymm3 + vpsrlw ymm0, ymm0, 1 // (x + 2) / 4 = (x / 2 + 1) / 2 + vpsrlw ymm1, ymm1, 1 + vpavgw ymm0, ymm0, ymm5 // (x + 1) / 2 + vpavgw ymm1, ymm1, ymm5 + vpackuswb ymm0, ymm0, ymm1 + vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb + vmovdqu [edx], ymm0 + lea edx, [edx + 32] + sub ecx, 32 + jg wloop + + pop esi + vzeroupper + ret + } +} +#endif // HAS_SCALEROWDOWN2_AVX2 + +// Point samples 32 pixels to 8 pixels. +__declspec(naked) void ScaleRowDown4_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + __asm { + mov eax, [esp + 4] // src_ptr + // src_stride ignored + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width + pcmpeqb xmm5, xmm5 // generate mask 0x00ff0000 + psrld xmm5, 24 + pslld xmm5, 16 + + wloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + lea eax, [eax + 32] + pand xmm0, xmm5 + pand xmm1, xmm5 + packuswb xmm0, xmm1 + psrlw xmm0, 8 + packuswb xmm0, xmm0 + movq qword ptr [edx], xmm0 + lea edx, [edx + 8] + sub ecx, 8 + jg wloop + + ret + } +} + +// Blends 32x4 rectangle to 8x1. +__declspec(naked) void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_ptr + mov esi, [esp + 8 + 8] // src_stride + mov edx, [esp + 8 + 12] // dst_ptr + mov ecx, [esp + 8 + 16] // dst_width + lea edi, [esi + esi * 2] // src_stride * 3 + pcmpeqb xmm4, xmm4 // constant 0x0101 + psrlw xmm4, 15 + movdqa xmm5, xmm4 + packuswb xmm4, xmm4 + psllw xmm5, 3 // constant 0x0008 + + wloop: + movdqu xmm0, [eax] // average rows + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + esi] + movdqu xmm3, [eax + esi + 16] + pmaddubsw xmm0, xmm4 // horizontal add + pmaddubsw xmm1, xmm4 + pmaddubsw xmm2, xmm4 + pmaddubsw xmm3, xmm4 + paddw xmm0, xmm2 // vertical add rows 0, 1 + paddw xmm1, xmm3 + movdqu xmm2, [eax + esi * 2] + movdqu xmm3, [eax + esi * 2 + 16] + pmaddubsw xmm2, xmm4 + pmaddubsw xmm3, xmm4 + paddw xmm0, xmm2 // add row 2 + paddw xmm1, xmm3 + movdqu xmm2, [eax + edi] + movdqu xmm3, [eax + edi + 16] + lea eax, [eax + 32] + pmaddubsw xmm2, xmm4 + pmaddubsw xmm3, xmm4 + paddw xmm0, xmm2 // add row 3 + paddw xmm1, xmm3 + phaddw xmm0, xmm1 + paddw xmm0, xmm5 // + 8 for round + psrlw xmm0, 4 // /16 for average of 4 * 4 + packuswb xmm0, xmm0 + movq qword ptr [edx], xmm0 + lea edx, [edx + 8] + sub ecx, 8 + jg wloop + + pop edi + pop esi + ret + } +} + +#ifdef HAS_SCALEROWDOWN4_AVX2 +// Point samples 64 pixels to 16 pixels. 
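+// Illustrative C sketch (not a libyuv identifier): the point sampler keeps
+// byte 2 of every group of four source bytes, i.e. the lane selected by the
+// 0x00ff0000 mask used here and in the SSSE3 version above.
+static void ScaleRowDown4_C_sketch(const uint8_t* src_ptr,
+                                   uint8_t* dst_ptr,
+                                   int dst_width) {
+  int x;
+  for (x = 0; x < dst_width; ++x) {
+    dst_ptr[x] = src_ptr[4 * x + 2];
+  }
+}
+// The AVX2 version below consumes 64 source bytes (16 outputs) per loop.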
+__declspec(naked) void ScaleRowDown4_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + __asm { + mov eax, [esp + 4] // src_ptr + // src_stride ignored + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff0000 + vpsrld ymm5, ymm5, 24 + vpslld ymm5, ymm5, 16 + + wloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + lea eax, [eax + 64] + vpand ymm0, ymm0, ymm5 + vpand ymm1, ymm1, ymm5 + vpackuswb ymm0, ymm0, ymm1 + vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb + vpsrlw ymm0, ymm0, 8 + vpackuswb ymm0, ymm0, ymm0 + vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb + vmovdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 16 + jg wloop + + vzeroupper + ret + } +} + +// Blends 64x4 rectangle to 16x1. +__declspec(naked) void ScaleRowDown4Box_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_ptr + mov esi, [esp + 8 + 8] // src_stride + mov edx, [esp + 8 + 12] // dst_ptr + mov ecx, [esp + 8 + 16] // dst_width + lea edi, [esi + esi * 2] // src_stride * 3 + vpcmpeqb ymm4, ymm4, ymm4 // constant 0x0101 + vpsrlw ymm4, ymm4, 15 + vpsllw ymm5, ymm4, 3 // constant 0x0008 + vpackuswb ymm4, ymm4, ymm4 + + wloop: + vmovdqu ymm0, [eax] // average rows + vmovdqu ymm1, [eax + 32] + vmovdqu ymm2, [eax + esi] + vmovdqu ymm3, [eax + esi + 32] + vpmaddubsw ymm0, ymm0, ymm4 // horizontal add + vpmaddubsw ymm1, ymm1, ymm4 + vpmaddubsw ymm2, ymm2, ymm4 + vpmaddubsw ymm3, ymm3, ymm4 + vpaddw ymm0, ymm0, ymm2 // vertical add rows 0, 1 + vpaddw ymm1, ymm1, ymm3 + vmovdqu ymm2, [eax + esi * 2] + vmovdqu ymm3, [eax + esi * 2 + 32] + vpmaddubsw ymm2, ymm2, ymm4 + vpmaddubsw ymm3, ymm3, ymm4 + vpaddw ymm0, ymm0, ymm2 // add row 2 + vpaddw ymm1, ymm1, ymm3 + vmovdqu ymm2, [eax + edi] + vmovdqu ymm3, [eax + edi + 32] + lea eax, [eax + 64] + vpmaddubsw ymm2, ymm2, ymm4 + vpmaddubsw ymm3, ymm3, ymm4 + vpaddw ymm0, ymm0, ymm2 // add row 3 + vpaddw ymm1, ymm1, ymm3 + vphaddw ymm0, ymm0, ymm1 // mutates + vpermq ymm0, ymm0, 0xd8 // unmutate vphaddw + vpaddw ymm0, ymm0, ymm5 // + 8 for round + vpsrlw ymm0, ymm0, 4 // /32 for average of 4 * 4 + vpackuswb ymm0, ymm0, ymm0 + vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb + vmovdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 16 + jg wloop + + pop edi + pop esi + vzeroupper + ret + } +} +#endif // HAS_SCALEROWDOWN4_AVX2 + +// Point samples 32 pixels to 24 pixels. +// Produces three 8 byte values. For each 8 bytes, 16 bytes are read. +// Then shuffled to do the scaling. + +__declspec(naked) void ScaleRowDown34_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + __asm { + mov eax, [esp + 4] // src_ptr + // src_stride ignored + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width + movdqa xmm3, xmmword ptr kShuf0 + movdqa xmm4, xmmword ptr kShuf1 + movdqa xmm5, xmmword ptr kShuf2 + + wloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + lea eax, [eax + 32] + movdqa xmm2, xmm1 + palignr xmm1, xmm0, 8 + pshufb xmm0, xmm3 + pshufb xmm1, xmm4 + pshufb xmm2, xmm5 + movq qword ptr [edx], xmm0 + movq qword ptr [edx + 8], xmm1 + movq qword ptr [edx + 16], xmm2 + lea edx, [edx + 24] + sub ecx, 24 + jg wloop + + ret + } +} + +// Blends 32x2 rectangle to 24x1 +// Produces three 8 byte values. For each 8 bytes, 16 bytes are read. +// Then shuffled to do the scaling. 
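+// Illustrative C sketch of the horizontal 4-to-3 filter implied by kShuf01,
+// kMadd01/kMadd11/kMadd21 and kRound34 (the sketch name is not a libyuv
+// identifier): each group of 4 source bytes yields 3 output bytes with
+// 3:1, 2:2 and 1:3 weighting plus rounding.
+static void ScaleRowDown34_C_sketch(const uint8_t* s, uint8_t* d,
+                                    int dst_width) {
+  int x;
+  for (x = 0; x < dst_width; x += 3) {
+    d[0] = (uint8_t)((s[0] * 3 + s[1] * 1 + 2) >> 2);
+    d[1] = (uint8_t)((s[1] * 2 + s[2] * 2 + 2) >> 2);
+    d[2] = (uint8_t)((s[2] * 1 + s[3] * 3 + 2) >> 2);
+    s += 4;
+    d += 3;
+  }
+}
+// The _1_Box variant below first averages the two source rows equally
+// (pavgb); the _0_Box variant weights them roughly 3:1 by averaging twice.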
+ +// Register usage: +// xmm0 src_row 0 +// xmm1 src_row 1 +// xmm2 shuf 0 +// xmm3 shuf 1 +// xmm4 shuf 2 +// xmm5 madd 0 +// xmm6 madd 1 +// xmm7 kRound34 + +// Note that movdqa+palign may be better than movdqu. +__declspec(naked) void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_ptr + mov esi, [esp + 4 + 8] // src_stride + mov edx, [esp + 4 + 12] // dst_ptr + mov ecx, [esp + 4 + 16] // dst_width + movdqa xmm2, xmmword ptr kShuf01 + movdqa xmm3, xmmword ptr kShuf11 + movdqa xmm4, xmmword ptr kShuf21 + movdqa xmm5, xmmword ptr kMadd01 + movdqa xmm6, xmmword ptr kMadd11 + movdqa xmm7, xmmword ptr kRound34 + + wloop: + movdqu xmm0, [eax] // pixels 0..7 + movdqu xmm1, [eax + esi] + pavgb xmm0, xmm1 + pshufb xmm0, xmm2 + pmaddubsw xmm0, xmm5 + paddsw xmm0, xmm7 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + movq qword ptr [edx], xmm0 + movdqu xmm0, [eax + 8] // pixels 8..15 + movdqu xmm1, [eax + esi + 8] + pavgb xmm0, xmm1 + pshufb xmm0, xmm3 + pmaddubsw xmm0, xmm6 + paddsw xmm0, xmm7 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + movq qword ptr [edx + 8], xmm0 + movdqu xmm0, [eax + 16] // pixels 16..23 + movdqu xmm1, [eax + esi + 16] + lea eax, [eax + 32] + pavgb xmm0, xmm1 + pshufb xmm0, xmm4 + movdqa xmm1, xmmword ptr kMadd21 + pmaddubsw xmm0, xmm1 + paddsw xmm0, xmm7 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + movq qword ptr [edx + 16], xmm0 + lea edx, [edx + 24] + sub ecx, 24 + jg wloop + + pop esi + ret + } +} + +// Note that movdqa+palign may be better than movdqu. +__declspec(naked) void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_ptr + mov esi, [esp + 4 + 8] // src_stride + mov edx, [esp + 4 + 12] // dst_ptr + mov ecx, [esp + 4 + 16] // dst_width + movdqa xmm2, xmmword ptr kShuf01 + movdqa xmm3, xmmword ptr kShuf11 + movdqa xmm4, xmmword ptr kShuf21 + movdqa xmm5, xmmword ptr kMadd01 + movdqa xmm6, xmmword ptr kMadd11 + movdqa xmm7, xmmword ptr kRound34 + + wloop: + movdqu xmm0, [eax] // pixels 0..7 + movdqu xmm1, [eax + esi] + pavgb xmm1, xmm0 + pavgb xmm0, xmm1 + pshufb xmm0, xmm2 + pmaddubsw xmm0, xmm5 + paddsw xmm0, xmm7 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + movq qword ptr [edx], xmm0 + movdqu xmm0, [eax + 8] // pixels 8..15 + movdqu xmm1, [eax + esi + 8] + pavgb xmm1, xmm0 + pavgb xmm0, xmm1 + pshufb xmm0, xmm3 + pmaddubsw xmm0, xmm6 + paddsw xmm0, xmm7 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + movq qword ptr [edx + 8], xmm0 + movdqu xmm0, [eax + 16] // pixels 16..23 + movdqu xmm1, [eax + esi + 16] + lea eax, [eax + 32] + pavgb xmm1, xmm0 + pavgb xmm0, xmm1 + pshufb xmm0, xmm4 + movdqa xmm1, xmmword ptr kMadd21 + pmaddubsw xmm0, xmm1 + paddsw xmm0, xmm7 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + movq qword ptr [edx + 16], xmm0 + lea edx, [edx+24] + sub ecx, 24 + jg wloop + + pop esi + ret + } +} + +// 3/8 point sampler + +// Scale 32 pixels to 12 +__declspec(naked) void ScaleRowDown38_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + __asm { + mov eax, [esp + 4] // src_ptr + // src_stride ignored + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width + movdqa xmm4, xmmword ptr kShuf38a + movdqa xmm5, xmmword ptr kShuf38b + + xloop: + movdqu xmm0, [eax] // 16 pixels -> 0,1,2,3,4,5 + movdqu xmm1, [eax + 16] // 16 pixels -> 6,7,8,9,10,11 + lea eax, [eax + 32] + pshufb xmm0, xmm4 + pshufb xmm1, xmm5 + 
paddusb xmm0, xmm1 + + movq qword ptr [edx], xmm0 // write 12 pixels + movhlps xmm1, xmm0 + movd [edx + 8], xmm1 + lea edx, [edx + 12] + sub ecx, 12 + jg xloop + + ret + } +} + +// Scale 16x3 pixels to 6x1 with interpolation +__declspec(naked) void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_ptr + mov esi, [esp + 4 + 8] // src_stride + mov edx, [esp + 4 + 12] // dst_ptr + mov ecx, [esp + 4 + 16] // dst_width + movdqa xmm2, xmmword ptr kShufAc + movdqa xmm3, xmmword ptr kShufAc3 + movdqa xmm4, xmmword ptr kScaleAc33 + pxor xmm5, xmm5 + + xloop: + movdqu xmm0, [eax] // sum up 3 rows into xmm0/1 + movdqu xmm6, [eax + esi] + movhlps xmm1, xmm0 + movhlps xmm7, xmm6 + punpcklbw xmm0, xmm5 + punpcklbw xmm1, xmm5 + punpcklbw xmm6, xmm5 + punpcklbw xmm7, xmm5 + paddusw xmm0, xmm6 + paddusw xmm1, xmm7 + movdqu xmm6, [eax + esi * 2] + lea eax, [eax + 16] + movhlps xmm7, xmm6 + punpcklbw xmm6, xmm5 + punpcklbw xmm7, xmm5 + paddusw xmm0, xmm6 + paddusw xmm1, xmm7 + + movdqa xmm6, xmm0 // 8 pixels -> 0,1,2 of xmm6 + psrldq xmm0, 2 + paddusw xmm6, xmm0 + psrldq xmm0, 2 + paddusw xmm6, xmm0 + pshufb xmm6, xmm2 + + movdqa xmm7, xmm1 // 8 pixels -> 3,4,5 of xmm6 + psrldq xmm1, 2 + paddusw xmm7, xmm1 + psrldq xmm1, 2 + paddusw xmm7, xmm1 + pshufb xmm7, xmm3 + paddusw xmm6, xmm7 + + pmulhuw xmm6, xmm4 // divide by 9,9,6, 9,9,6 + packuswb xmm6, xmm6 + + movd [edx], xmm6 // write 6 pixels + psrlq xmm6, 16 + movd [edx + 2], xmm6 + lea edx, [edx + 6] + sub ecx, 6 + jg xloop + + pop esi + ret + } +} + +// Scale 16x2 pixels to 6x1 with interpolation +__declspec(naked) void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_ptr + mov esi, [esp + 4 + 8] // src_stride + mov edx, [esp + 4 + 12] // dst_ptr + mov ecx, [esp + 4 + 16] // dst_width + movdqa xmm2, xmmword ptr kShufAb0 + movdqa xmm3, xmmword ptr kShufAb1 + movdqa xmm4, xmmword ptr kShufAb2 + movdqa xmm5, xmmword ptr kScaleAb2 + + xloop: + movdqu xmm0, [eax] // average 2 rows into xmm0 + movdqu xmm1, [eax + esi] + lea eax, [eax + 16] + pavgb xmm0, xmm1 + + movdqa xmm1, xmm0 // 16 pixels -> 0,1,2,3,4,5 of xmm1 + pshufb xmm1, xmm2 + movdqa xmm6, xmm0 + pshufb xmm6, xmm3 + paddusw xmm1, xmm6 + pshufb xmm0, xmm4 + paddusw xmm1, xmm0 + + pmulhuw xmm1, xmm5 // divide by 3,3,2, 3,3,2 + packuswb xmm1, xmm1 + + movd [edx], xmm1 // write 6 pixels + psrlq xmm1, 16 + movd [edx + 2], xmm1 + lea edx, [edx + 6] + sub ecx, 6 + jg xloop + + pop esi + ret + } +} + +// Reads 16 bytes and accumulates to 16 shorts at a time. +__declspec(naked) void ScaleAddRow_SSE2(const uint8_t* src_ptr, + uint16_t* dst_ptr, + int src_width) { + __asm { + mov eax, [esp + 4] // src_ptr + mov edx, [esp + 8] // dst_ptr + mov ecx, [esp + 12] // src_width + pxor xmm5, xmm5 + + // sum rows + xloop: + movdqu xmm3, [eax] // read 16 bytes + lea eax, [eax + 16] + movdqu xmm0, [edx] // read 16 words from destination + movdqu xmm1, [edx + 16] + movdqa xmm2, xmm3 + punpcklbw xmm2, xmm5 + punpckhbw xmm3, xmm5 + paddusw xmm0, xmm2 // sum 16 words + paddusw xmm1, xmm3 + movdqu [edx], xmm0 // write 16 words to destination + movdqu [edx + 16], xmm1 + lea edx, [edx + 32] + sub ecx, 16 + jg xloop + ret + } +} + +#ifdef HAS_SCALEADDROW_AVX2 +// Reads 32 bytes and accumulates to 32 shorts at a time. 
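+// Illustrative C sketch (not a libyuv identifier): each source byte is added
+// into a 16-bit running sum; the SIMD versions use saturating adds.
+static void ScaleAddRow_C_sketch(const uint8_t* src_ptr,
+                                 uint16_t* dst_ptr,
+                                 int src_width) {
+  int x;
+  for (x = 0; x < src_width; ++x) {
+    dst_ptr[x] = (uint16_t)(dst_ptr[x] + src_ptr[x]);
+  }
+}
+// The AVX2 version below accumulates 32 bytes per loop iteration.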
+__declspec(naked) void ScaleAddRow_AVX2(const uint8_t* src_ptr, + uint16_t* dst_ptr, + int src_width) { + __asm { + mov eax, [esp + 4] // src_ptr + mov edx, [esp + 8] // dst_ptr + mov ecx, [esp + 12] // src_width + vpxor ymm5, ymm5, ymm5 + + // sum rows + xloop: + vmovdqu ymm3, [eax] // read 32 bytes + lea eax, [eax + 32] + vpermq ymm3, ymm3, 0xd8 // unmutate for vpunpck + vpunpcklbw ymm2, ymm3, ymm5 + vpunpckhbw ymm3, ymm3, ymm5 + vpaddusw ymm0, ymm2, [edx] // sum 16 words + vpaddusw ymm1, ymm3, [edx + 32] + vmovdqu [edx], ymm0 // write 32 words to destination + vmovdqu [edx + 32], ymm1 + lea edx, [edx + 64] + sub ecx, 32 + jg xloop + + vzeroupper + ret + } +} +#endif // HAS_SCALEADDROW_AVX2 + +// Constant for making pixels signed to avoid pmaddubsw +// saturation. +static const uvec8 kFsub80 = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}; + +// Constant for making pixels unsigned and adding .5 for rounding. +static const uvec16 kFadd40 = {0x4040, 0x4040, 0x4040, 0x4040, + 0x4040, 0x4040, 0x4040, 0x4040}; + +// Bilinear column filtering. SSSE3 version. +__declspec(naked) void ScaleFilterCols_SSSE3(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx) { + __asm { + push ebx + push esi + push edi + mov edi, [esp + 12 + 4] // dst_ptr + mov esi, [esp + 12 + 8] // src_ptr + mov ecx, [esp + 12 + 12] // dst_width + movd xmm2, [esp + 12 + 16] // x + movd xmm3, [esp + 12 + 20] // dx + mov eax, 0x04040000 // shuffle to line up fractions with pixel. + movd xmm5, eax + pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction. + psrlw xmm6, 9 + pcmpeqb xmm7, xmm7 // generate 0x0001 + psrlw xmm7, 15 + pextrw eax, xmm2, 1 // get x0 integer. preroll + sub ecx, 2 + jl xloop29 + + movdqa xmm0, xmm2 // x1 = x0 + dx + paddd xmm0, xmm3 + punpckldq xmm2, xmm0 // x0 x1 + punpckldq xmm3, xmm3 // dx dx + paddd xmm3, xmm3 // dx * 2, dx * 2 + pextrw edx, xmm2, 3 // get x1 integer. preroll + + // 2 Pixel loop. + xloop2: + movdqa xmm1, xmm2 // x0, x1 fractions. + paddd xmm2, xmm3 // x += dx + movzx ebx, word ptr [esi + eax] // 2 source x0 pixels + movd xmm0, ebx + psrlw xmm1, 9 // 7 bit fractions. + movzx ebx, word ptr [esi + edx] // 2 source x1 pixels + movd xmm4, ebx + pshufb xmm1, xmm5 // 0011 + punpcklwd xmm0, xmm4 + psubb xmm0, xmmword ptr kFsub80 // make pixels signed. + pxor xmm1, xmm6 // 0..7f and 7f..0 + paddusb xmm1, xmm7 // +1 so 0..7f and 80..1 + pmaddubsw xmm1, xmm0 // 16 bit, 2 pixels. + pextrw eax, xmm2, 1 // get x0 integer. next iteration. + pextrw edx, xmm2, 3 // get x1 integer. next iteration. + paddw xmm1, xmmword ptr kFadd40 // make pixels unsigned and round. + psrlw xmm1, 7 // 8.7 fixed point to low 8 bits. + packuswb xmm1, xmm1 // 8 bits, 2 pixels. + movd ebx, xmm1 + mov [edi], bx + lea edi, [edi + 2] + sub ecx, 2 // 2 pixels + jge xloop2 + + xloop29: + add ecx, 2 - 1 + jl xloop99 + + // 1 pixel remainder + movzx ebx, word ptr [esi + eax] // 2 source x0 pixels + movd xmm0, ebx + psrlw xmm2, 9 // 7 bit fractions. + pshufb xmm2, xmm5 // 0011 + psubb xmm0, xmmword ptr kFsub80 // make pixels signed. + pxor xmm2, xmm6 // 0..7f and 7f..0 + paddusb xmm2, xmm7 // +1 so 0..7f and 80..1 + pmaddubsw xmm2, xmm0 // 16 bit + paddw xmm2, xmmword ptr kFadd40 // make pixels unsigned and round. + psrlw xmm2, 7 // 8.7 fixed point to low 8 bits. + packuswb xmm2, xmm2 // 8 bits + movd ebx, xmm2 + mov [edi], bl + + xloop99: + + pop edi + pop esi + pop ebx + ret + } +} + +// Reads 16 pixels, duplicates them and writes 32 pixels. 
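+// Illustrative C sketch (not a libyuv identifier): every source pixel is
+// written twice with no filtering, as the punpcklbw/punpckhbw pair does.
+static void ScaleColsUp2_C_sketch(uint8_t* dst_ptr,
+                                  const uint8_t* src_ptr,
+                                  int dst_width) {
+  int x;
+  for (x = 0; x < dst_width; x += 2) {
+    dst_ptr[x] = dst_ptr[x + 1] = src_ptr[x / 2];
+  }
+}
+// The SSE2 version below duplicates 16 source pixels per loop iteration.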
+__declspec(naked) void ScaleColsUp2_SSE2(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx) { + __asm { + mov edx, [esp + 4] // dst_ptr + mov eax, [esp + 8] // src_ptr + mov ecx, [esp + 12] // dst_width + + wloop: + movdqu xmm0, [eax] + lea eax, [eax + 16] + movdqa xmm1, xmm0 + punpcklbw xmm0, xmm0 + punpckhbw xmm1, xmm1 + movdqu [edx], xmm0 + movdqu [edx + 16], xmm1 + lea edx, [edx + 32] + sub ecx, 32 + jg wloop + + ret + } +} + +// Reads 8 pixels, throws half away and writes 4 even pixels (0, 2, 4, 6) +__declspec(naked) void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width) { + __asm { + mov eax, [esp + 4] // src_argb + // src_stride ignored + mov edx, [esp + 12] // dst_argb + mov ecx, [esp + 16] // dst_width + + wloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + lea eax, [eax + 32] + shufps xmm0, xmm1, 0xdd + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 4 + jg wloop + + ret + } +} + +// Blends 8x1 rectangle to 4x1. +__declspec(naked) void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width) { + __asm { + mov eax, [esp + 4] // src_argb + // src_stride ignored + mov edx, [esp + 12] // dst_argb + mov ecx, [esp + 16] // dst_width + + wloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + lea eax, [eax + 32] + movdqa xmm2, xmm0 + shufps xmm0, xmm1, 0x88 // even pixels + shufps xmm2, xmm1, 0xdd // odd pixels + pavgb xmm0, xmm2 + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 4 + jg wloop + + ret + } +} + +// Blends 8x2 rectangle to 4x1. +__declspec(naked) void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_argb + mov esi, [esp + 4 + 8] // src_stride + mov edx, [esp + 4 + 12] // dst_argb + mov ecx, [esp + 4 + 16] // dst_width + + wloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + esi] + movdqu xmm3, [eax + esi + 16] + lea eax, [eax + 32] + pavgb xmm0, xmm2 // average rows + pavgb xmm1, xmm3 + movdqa xmm2, xmm0 // average columns (8 to 4 pixels) + shufps xmm0, xmm1, 0x88 // even pixels + shufps xmm2, xmm1, 0xdd // odd pixels + pavgb xmm0, xmm2 + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 4 + jg wloop + + pop esi + ret + } +} + +// Reads 4 pixels at a time. +__declspec(naked) void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_argb, + int dst_width) { + __asm { + push ebx + push edi + mov eax, [esp + 8 + 4] // src_argb + // src_stride ignored + mov ebx, [esp + 8 + 12] // src_stepx + mov edx, [esp + 8 + 16] // dst_argb + mov ecx, [esp + 8 + 20] // dst_width + lea ebx, [ebx * 4] + lea edi, [ebx + ebx * 2] + + wloop: + movd xmm0, [eax] + movd xmm1, [eax + ebx] + punpckldq xmm0, xmm1 + movd xmm2, [eax + ebx * 2] + movd xmm3, [eax + edi] + lea eax, [eax + ebx * 4] + punpckldq xmm2, xmm3 + punpcklqdq xmm0, xmm2 + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 4 + jg wloop + + pop edi + pop ebx + ret + } +} + +// Blends four 2x2 to 4x1. 
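+// Illustrative C sketch (not a libyuv identifier): each output ARGB pixel is
+// the per-channel average of a 2x2 block, with blocks spaced src_stepx
+// pixels apart. The SSE2 version below approximates the (sum + 2) >> 2
+// rounding with two rounded pavgb averages.
+static void ScaleARGBRowDownEvenBox_C_sketch(const uint8_t* src_argb,
+                                             ptrdiff_t src_stride,
+                                             int src_stepx,
+                                             uint8_t* dst_argb,
+                                             int dst_width) {
+  const uint8_t* row1 = src_argb + src_stride;
+  int x, c;
+  for (x = 0; x < dst_width; ++x) {
+    for (c = 0; c < 4; ++c) {
+      dst_argb[4 * x + c] = (uint8_t)((src_argb[c] + src_argb[4 + c] +
+                                       row1[c] + row1[4 + c] + 2) >> 2);
+    }
+    src_argb += 4 * src_stepx;
+    row1 += 4 * src_stepx;
+  }
+}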
+__declspec(naked) void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_argb, + int dst_width) { + __asm { + push ebx + push esi + push edi + mov eax, [esp + 12 + 4] // src_argb + mov esi, [esp + 12 + 8] // src_stride + mov ebx, [esp + 12 + 12] // src_stepx + mov edx, [esp + 12 + 16] // dst_argb + mov ecx, [esp + 12 + 20] // dst_width + lea esi, [eax + esi] // row1 pointer + lea ebx, [ebx * 4] + lea edi, [ebx + ebx * 2] + + wloop: + movq xmm0, qword ptr [eax] // row0 4 pairs + movhps xmm0, qword ptr [eax + ebx] + movq xmm1, qword ptr [eax + ebx * 2] + movhps xmm1, qword ptr [eax + edi] + lea eax, [eax + ebx * 4] + movq xmm2, qword ptr [esi] // row1 4 pairs + movhps xmm2, qword ptr [esi + ebx] + movq xmm3, qword ptr [esi + ebx * 2] + movhps xmm3, qword ptr [esi + edi] + lea esi, [esi + ebx * 4] + pavgb xmm0, xmm2 // average rows + pavgb xmm1, xmm3 + movdqa xmm2, xmm0 // average columns (8 to 4 pixels) + shufps xmm0, xmm1, 0x88 // even pixels + shufps xmm2, xmm1, 0xdd // odd pixels + pavgb xmm0, xmm2 + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 4 + jg wloop + + pop edi + pop esi + pop ebx + ret + } +} + +// Column scaling unfiltered. SSE2 version. +__declspec(naked) void ScaleARGBCols_SSE2(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx) { + __asm { + push edi + push esi + mov edi, [esp + 8 + 4] // dst_argb + mov esi, [esp + 8 + 8] // src_argb + mov ecx, [esp + 8 + 12] // dst_width + movd xmm2, [esp + 8 + 16] // x + movd xmm3, [esp + 8 + 20] // dx + + pshufd xmm2, xmm2, 0 // x0 x0 x0 x0 + pshufd xmm0, xmm3, 0x11 // dx 0 dx 0 + paddd xmm2, xmm0 + paddd xmm3, xmm3 // 0, 0, 0, dx * 2 + pshufd xmm0, xmm3, 0x05 // dx * 2, dx * 2, 0, 0 + paddd xmm2, xmm0 // x3 x2 x1 x0 + paddd xmm3, xmm3 // 0, 0, 0, dx * 4 + pshufd xmm3, xmm3, 0 // dx * 4, dx * 4, dx * 4, dx * 4 + + pextrw eax, xmm2, 1 // get x0 integer. + pextrw edx, xmm2, 3 // get x1 integer. + + cmp ecx, 0 + jle xloop99 + sub ecx, 4 + jl xloop49 + + // 4 Pixel loop. + xloop4: + movd xmm0, [esi + eax * 4] // 1 source x0 pixels + movd xmm1, [esi + edx * 4] // 1 source x1 pixels + pextrw eax, xmm2, 5 // get x2 integer. + pextrw edx, xmm2, 7 // get x3 integer. + paddd xmm2, xmm3 // x += dx + punpckldq xmm0, xmm1 // x0 x1 + + movd xmm1, [esi + eax * 4] // 1 source x2 pixels + movd xmm4, [esi + edx * 4] // 1 source x3 pixels + pextrw eax, xmm2, 1 // get x0 integer. next iteration. + pextrw edx, xmm2, 3 // get x1 integer. next iteration. + punpckldq xmm1, xmm4 // x2 x3 + punpcklqdq xmm0, xmm1 // x0 x1 x2 x3 + movdqu [edi], xmm0 + lea edi, [edi + 16] + sub ecx, 4 // 4 pixels + jge xloop4 + + xloop49: + test ecx, 2 + je xloop29 + + // 2 Pixels. + movd xmm0, [esi + eax * 4] // 1 source x0 pixels + movd xmm1, [esi + edx * 4] // 1 source x1 pixels + pextrw eax, xmm2, 5 // get x2 integer. + punpckldq xmm0, xmm1 // x0 x1 + + movq qword ptr [edi], xmm0 + lea edi, [edi + 8] + + xloop29: + test ecx, 1 + je xloop99 + + // 1 Pixels. + movd xmm0, [esi + eax * 4] // 1 source x2 pixels + movd dword ptr [edi], xmm0 + xloop99: + + pop esi + pop edi + ret + } +} + +// Bilinear row filtering combines 2x1 -> 1x1. SSSE3 version. 
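+// Illustrative C sketch (not a libyuv identifier): x and dx are 16.16 fixed
+// point; the blend fraction is the top 7 bits of the fraction (x >> 9), and
+// the 0x7f xor below supplies the complementary weight, so each channel is
+// (a * (127 - f) + b * f) >> 7.
+static void ScaleARGBFilterCols_C_sketch(uint8_t* dst_argb,
+                                         const uint8_t* src_argb,
+                                         int dst_width,
+                                         int x,
+                                         int dx) {
+  int j, c;
+  for (j = 0; j < dst_width; ++j) {
+    int xi = x >> 16;                      // integer source position
+    int f = (x >> 9) & 0x7f;               // 7 bit fraction
+    const uint8_t* a = src_argb + xi * 4;  // left pixel
+    const uint8_t* b = a + 4;              // right pixel
+    for (c = 0; c < 4; ++c) {
+      dst_argb[4 * j + c] = (uint8_t)((a[c] * (127 - f) + b[c] * f) >> 7);
+    }
+    x += dx;
+  }
+}
+// The SSSE3 version below uses kShuffleColARGB and kShuffleFractions to
+// produce two output pixels per iteration.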
+// TODO(fbarchard): Port to Neon + +// Shuffle table for arranging 2 pixels into pairs for pmaddubsw +static const uvec8 kShuffleColARGB = { + 0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel + 8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel +}; + +// Shuffle table for duplicating 2 fractions into 8 bytes each +static const uvec8 kShuffleFractions = { + 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, +}; + +__declspec(naked) void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx) { + __asm { + push esi + push edi + mov edi, [esp + 8 + 4] // dst_argb + mov esi, [esp + 8 + 8] // src_argb + mov ecx, [esp + 8 + 12] // dst_width + movd xmm2, [esp + 8 + 16] // x + movd xmm3, [esp + 8 + 20] // dx + movdqa xmm4, xmmword ptr kShuffleColARGB + movdqa xmm5, xmmword ptr kShuffleFractions + pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction. + psrlw xmm6, 9 + pextrw eax, xmm2, 1 // get x0 integer. preroll + sub ecx, 2 + jl xloop29 + + movdqa xmm0, xmm2 // x1 = x0 + dx + paddd xmm0, xmm3 + punpckldq xmm2, xmm0 // x0 x1 + punpckldq xmm3, xmm3 // dx dx + paddd xmm3, xmm3 // dx * 2, dx * 2 + pextrw edx, xmm2, 3 // get x1 integer. preroll + + // 2 Pixel loop. + xloop2: + movdqa xmm1, xmm2 // x0, x1 fractions. + paddd xmm2, xmm3 // x += dx + movq xmm0, qword ptr [esi + eax * 4] // 2 source x0 pixels + psrlw xmm1, 9 // 7 bit fractions. + movhps xmm0, qword ptr [esi + edx * 4] // 2 source x1 pixels + pshufb xmm1, xmm5 // 0000000011111111 + pshufb xmm0, xmm4 // arrange pixels into pairs + pxor xmm1, xmm6 // 0..7f and 7f..0 + pmaddubsw xmm0, xmm1 // argb_argb 16 bit, 2 pixels. + pextrw eax, xmm2, 1 // get x0 integer. next iteration. + pextrw edx, xmm2, 3 // get x1 integer. next iteration. + psrlw xmm0, 7 // argb 8.7 fixed point to low 8 bits. + packuswb xmm0, xmm0 // argb_argb 8 bits, 2 pixels. + movq qword ptr [edi], xmm0 + lea edi, [edi + 8] + sub ecx, 2 // 2 pixels + jge xloop2 + + xloop29: + + add ecx, 2 - 1 + jl xloop99 + + // 1 pixel remainder + psrlw xmm2, 9 // 7 bit fractions. + movq xmm0, qword ptr [esi + eax * 4] // 2 source x0 pixels + pshufb xmm2, xmm5 // 00000000 + pshufb xmm0, xmm4 // arrange pixels into pairs + pxor xmm2, xmm6 // 0..7f and 7f..0 + pmaddubsw xmm0, xmm2 // argb 16 bit, 1 pixel. + psrlw xmm0, 7 + packuswb xmm0, xmm0 // argb 8 bits, 1 pixel. + movd [edi], xmm0 + + xloop99: + + pop edi + pop esi + ret + } +} + +// Reads 4 pixels, duplicates them and writes 8 pixels. +__declspec(naked) void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx) { + __asm { + mov edx, [esp + 4] // dst_argb + mov eax, [esp + 8] // src_argb + mov ecx, [esp + 12] // dst_width + + wloop: + movdqu xmm0, [eax] + lea eax, [eax + 16] + movdqa xmm1, xmm0 + punpckldq xmm0, xmm0 + punpckhdq xmm1, xmm1 + movdqu [edx], xmm0 + movdqu [edx + 16], xmm1 + lea edx, [edx + 32] + sub ecx, 8 + jg wloop + + ret + } +} + +// Divide num by div and return as 16.16 fixed point result. +__declspec(naked) int FixedDiv_X86(int num, int div) { + __asm { + mov eax, [esp + 4] // num + cdq // extend num to 64 bits + shld edx, eax, 16 // 32.16 + shl eax, 16 + idiv dword ptr [esp + 8] + ret + } +} + +// Divide num by div and return as 16.16 fixed point result. 
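+// Illustrative C equivalents (names are not libyuv identifiers):
+// FixedDiv_X86 above computes (num << 16) / div; FixedDiv1_X86 below computes
+// ((num << 16) - 0x00010001) / (div - 1).
+static int FixedDiv_C_sketch(int num, int div) {
+  return (int)((((int64_t)num) << 16) / div);
+}
+static int FixedDiv1_C_sketch(int num, int div) {
+  return (int)(((((int64_t)num) << 16) - 0x00010001) / (div - 1));
+}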
+__declspec(naked) int FixedDiv1_X86(int num, int div) { + __asm { + mov eax, [esp + 4] // num + mov ecx, [esp + 8] // denom + cdq // extend num to 64 bits + shld edx, eax, 16 // 32.16 + shl eax, 16 + sub eax, 0x00010001 + sbb edx, 0 + sub ecx, 1 + idiv ecx + ret + } +} +#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/media/libyuv/libyuv/source/video_common.cc b/media/libyuv/libyuv/source/video_common.cc new file mode 100644 index 0000000000..92384c050c --- /dev/null +++ b/media/libyuv/libyuv/source/video_common.cc @@ -0,0 +1,62 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/video_common.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +struct FourCCAliasEntry { + uint32_t alias; + uint32_t canonical; +}; + +#define NUM_ALIASES 18 +static const struct FourCCAliasEntry kFourCCAliases[NUM_ALIASES] = { + {FOURCC_IYUV, FOURCC_I420}, + {FOURCC_YU12, FOURCC_I420}, + {FOURCC_YU16, FOURCC_I422}, + {FOURCC_YU24, FOURCC_I444}, + {FOURCC_YUYV, FOURCC_YUY2}, + {FOURCC_YUVS, FOURCC_YUY2}, // kCMPixelFormat_422YpCbCr8_yuvs + {FOURCC_HDYC, FOURCC_UYVY}, + {FOURCC_2VUY, FOURCC_UYVY}, // kCMPixelFormat_422YpCbCr8 + {FOURCC_JPEG, FOURCC_MJPG}, // Note: JPEG has DHT while MJPG does not. + {FOURCC_DMB1, FOURCC_MJPG}, + {FOURCC_BA81, FOURCC_BGGR}, // deprecated. + {FOURCC_RGB3, FOURCC_RAW}, + {FOURCC_BGR3, FOURCC_24BG}, + {FOURCC_CM32, FOURCC_BGRA}, // kCMPixelFormat_32ARGB + {FOURCC_CM24, FOURCC_RAW}, // kCMPixelFormat_24RGB + {FOURCC_L555, FOURCC_RGBO}, // kCMPixelFormat_16LE555 + {FOURCC_L565, FOURCC_RGBP}, // kCMPixelFormat_16LE565 + {FOURCC_5551, FOURCC_RGBO}, // kCMPixelFormat_16LE5551 +}; +// TODO(fbarchard): Consider mapping kCMPixelFormat_32BGRA to FOURCC_ARGB. +// {FOURCC_BGRA, FOURCC_ARGB}, // kCMPixelFormat_32BGRA + +LIBYUV_API +uint32_t CanonicalFourCC(uint32_t fourcc) { + int i; + for (i = 0; i < NUM_ALIASES; ++i) { + if (kFourCCAliases[i].alias == fourcc) { + return kFourCCAliases[i].canonical; + } + } + // Not an alias, so return it as-is. + return fourcc; +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif |
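+// Usage sketch for CanonicalFourCC (values taken from the alias table above):
+//   CanonicalFourCC(FOURCC_YUYV)  -> FOURCC_YUY2
+//   CanonicalFourCC(FOURCC_2VUY)  -> FOURCC_UYVY
+//   CanonicalFourCC(FOURCC_I420)  -> FOURCC_I420  (not an alias, passed through)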