1 files changed, 1374 insertions, 0 deletions
diff --git a/media/libvpx/libvpx/third_party/libyuv/source/scale_gcc.cc b/media/libvpx/libvpx/third_party/libyuv/source/scale_gcc.cc
new file mode 100644
index 0000000000..312236d2df
--- /dev/null
+++ b/media/libvpx/libvpx/third_party/libyuv/source/scale_gcc.cc
@@ -0,0 +1,1374 @@
+/*
+ *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+#include "libyuv/scale_row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC x86 and x64.
+#if !defined(LIBYUV_DISABLE_X86) && \
+    (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
+
+// Offsets for source bytes 0 to 9
+static const uvec8 kShuf0 = {0,   1,   3,   4,   5,   7,   8,   9,
+                             128, 128, 128, 128, 128, 128, 128, 128};
+
+// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
+static const uvec8 kShuf1 = {3,   4,   5,   7,   8,   9,   11,  12,
+                             128, 128, 128, 128, 128, 128, 128, 128};
+
+// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
+static const uvec8 kShuf2 = {5,   7,   8,   9,   11,  12,  13,  15,
+                             128, 128, 128, 128, 128, 128, 128, 128};
+
+// Offsets for source bytes 0 to 10
+static const uvec8 kShuf01 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10};
+
+// Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13.
+static const uvec8 kShuf11 = {2, 3, 4, 5,  5,  6,  6,  7,
+                              8, 9, 9, 10, 10, 11, 12, 13};
+
+// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
+static const uvec8 kShuf21 = {5,  6,  6,  7,  8,  9,  9,  10,
+                              10, 11, 12, 13, 13, 14, 14, 15};
+
+// Coefficients for source bytes 0 to 10
+static const uvec8 kMadd01 = {3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2};
+
+// Coefficients for source bytes 10 to 21
+static const uvec8 kMadd11 = {1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1};
+
+// Coefficients for source bytes 21 to 31
+static const uvec8 kMadd21 = {2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3};
+
+// Coefficients for source bytes 21 to 31
+static const vec16 kRound34 = {2, 2, 2, 2, 2, 2, 2, 2};
+
+static const uvec8 kShuf38a = {0,   3,   6,   8,   11,  14,  128, 128,
+                               128, 128, 128, 128, 128, 128, 128, 128};
+
+static const uvec8 kShuf38b = {128, 128, 128, 128, 128, 128, 0,   3,
+                               6,   8,   11,  14,  128, 128, 128, 128};
+
+// Arrange words 0,3,6 into 0,1,2
+static const uvec8 kShufAc = {0,   1,   6,   7,   12,  13,  128, 128,
+                              128, 128, 128, 128, 128, 128, 128, 128};
+
+// Arrange words 0,3,6 into 3,4,5
+static const uvec8 kShufAc3 = {128, 128, 128, 128, 128, 128, 0,   1,
+                               6,   7,   12,  13,  128, 128, 128, 128};
+
+// Scaling values for boxes of 3x3 and 2x3
+static const uvec16 kScaleAc33 = {65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9,
+                                  65536 / 9, 65536 / 6, 0,         0};
+
+// Arrange first value for pixels 0,1,2,3,4,5
+static const uvec8 kShufAb0 = {0,  128, 3,  128, 6,   128, 8,   128,
+                               11, 128, 14, 128, 128, 128, 128, 128};
+
+// Arrange second value for pixels 0,1,2,3,4,5
+static const uvec8 kShufAb1 = {1,  128, 4,  128, 7,   128, 9,   128,
+                               12, 128, 15, 128, 128, 128, 128, 128};
+
+// Arrange third value for pixels 0,1,2,3,4,5
+static const uvec8 kShufAb2 = {2,  128, 5,   128, 128, 128, 10,  128,
+                               13, 128, 128, 128, 128, 128, 128, 128};
+
+// Scaling values for boxes of 3x2 and 2x2
+static const uvec16 kScaleAb2 = {65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3,
+                                 65536 / 3, 65536 / 2, 0,         0};
+
+// GCC versions of row functions are verbatim conversions from Visual C.
+// Generated using gcc disassembly on Visual C object file:
+// objdump -D yuvscaler.obj >yuvscaler.txt
+
+void ScaleRowDown2_SSSE3(const uint8_t* src_ptr,
+                         ptrdiff_t src_stride,
+                         uint8_t* dst_ptr,
+                         int dst_width) {
+  (void)src_stride;
+  asm volatile(
+      // 16 pixel loop.
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqu    0x10(%0),%%xmm1                 \n"
+      "lea       0x20(%0),%0                     \n"
+      "psrlw     $0x8,%%xmm0                     \n"
+      "psrlw     $0x8,%%xmm1                     \n"
+      "packuswb  %%xmm1,%%xmm0                   \n"
+      "movdqu    %%xmm0,(%1)                     \n"
+      "lea       0x10(%1),%1                     \n"
+      "sub       $0x10,%2                        \n"
+      "jg        1b                              \n"
+      : "+r"(src_ptr),   // %0
+        "+r"(dst_ptr),   // %1
+        "+r"(dst_width)  // %2
+        ::"memory",
+        "cc", "xmm0", "xmm1");
+}
+
+void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8_t* dst_ptr,
+                               int dst_width) {
+  (void)src_stride;
+  asm volatile(
+      "pcmpeqb    %%xmm4,%%xmm4                  \n"
+      "psrlw      $0xf,%%xmm4                    \n"
+      "packuswb   %%xmm4,%%xmm4                  \n"
+      "pxor       %%xmm5,%%xmm5                  \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqu    0x10(%0),%%xmm1                 \n"
+      "lea       0x20(%0),%0                     \n"
+      "pmaddubsw  %%xmm4,%%xmm0                  \n"
+      "pmaddubsw  %%xmm4,%%xmm1                  \n"
+      "pavgw      %%xmm5,%%xmm0                  \n"
+      "pavgw      %%xmm5,%%xmm1                  \n"
+      "packuswb   %%xmm1,%%xmm0                  \n"
+      "movdqu    %%xmm0,(%1)                     \n"
+      "lea       0x10(%1),%1                     \n"
+      "sub       $0x10,%2                        \n"
+      "jg        1b                              \n"
+      : "+r"(src_ptr),   // %0
+        "+r"(dst_ptr),   // %1
+        "+r"(dst_width)  // %2
+        ::"memory",
+        "cc", "xmm0", "xmm1", "xmm4", "xmm5");
+}
+
+void ScaleRowDown2Box_SSSE3(const uint8_t* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint8_t* dst_ptr,
+                            int dst_width) {
+  asm volatile(
+      "pcmpeqb    %%xmm4,%%xmm4                  \n"
+      "psrlw      $0xf,%%xmm4                    \n"
+      "packuswb   %%xmm4,%%xmm4                  \n"
+      "pxor       %%xmm5,%%xmm5                  \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqu    0x10(%0),%%xmm1                 \n"
+      "movdqu    0x00(%0,%3,1),%%xmm2            \n"
+      "movdqu    0x10(%0,%3,1),%%xmm3            \n"
+      "lea       0x20(%0),%0                     \n"
+      "pmaddubsw  %%xmm4,%%xmm0                  \n"
+      "pmaddubsw  %%xmm4,%%xmm1                  \n"
+      "pmaddubsw  %%xmm4,%%xmm2                  \n"
+      "pmaddubsw  %%xmm4,%%xmm3                  \n"
+      "paddw      %%xmm2,%%xmm0                  \n"
+      "paddw      %%xmm3,%%xmm1                  \n"
+      "psrlw      $0x1,%%xmm0                    \n"
+      "psrlw      $0x1,%%xmm1                    \n"
+      "pavgw      %%xmm5,%%xmm0                  \n"
+      "pavgw      %%xmm5,%%xmm1                  \n"
+      "packuswb   %%xmm1,%%xmm0                  \n"
+      "movdqu    %%xmm0,(%1)                     \n"
+      "lea       0x10(%1),%1                     \n"
+      "sub       $0x10,%2                        \n"
+      "jg        1b                              \n"
+      : "+r"(src_ptr),               // %0
+        "+r"(dst_ptr),               // %1
+        "+r"(dst_width)              // %2
+      : "r"((intptr_t)(src_stride))  // %3
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
+}
+
+#ifdef HAS_SCALEROWDOWN2_AVX2
+void ScaleRowDown2_AVX2(const uint8_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        uint8_t* dst_ptr,
+                        int dst_width) {
+  (void)src_stride;
+  asm volatile(
+
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu    (%0),%%ymm0                    \n"
+      "vmovdqu    0x20(%0),%%ymm1                \n"
+      "lea        0x40(%0),%0                    \n"
+      "vpsrlw     $0x8,%%ymm0,%%ymm0             \n"
+      "vpsrlw     $0x8,%%ymm1,%%ymm1             \n"
+      "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
+      "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
+      "vmovdqu    %%ymm0,(%1)                    \n"
+      "lea        0x20(%1),%1                    \n"
+      "sub        $0x20,%2                       \n"
+      "jg         1b                             \n"
+      "vzeroupper                                \n"
+      : "+r"(src_ptr),   // %0
+        "+r"(dst_ptr),   // %1
+        "+r"(dst_width)  // %2
+        ::"memory",
+        "cc", "xmm0", "xmm1");
+}
+
+void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst_ptr,
+                              int dst_width) {
+  (void)src_stride;
+  asm volatile(
+      "vpcmpeqb   %%ymm4,%%ymm4,%%ymm4           \n"
+      "vpsrlw     $0xf,%%ymm4,%%ymm4             \n"
+      "vpackuswb  %%ymm4,%%ymm4,%%ymm4           \n"
+      "vpxor      %%ymm5,%%ymm5,%%ymm5           \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu    (%0),%%ymm0                    \n"
+      "vmovdqu    0x20(%0),%%ymm1                \n"
+      "lea        0x40(%0),%0                    \n"
+      "vpmaddubsw %%ymm4,%%ymm0,%%ymm0           \n"
+      "vpmaddubsw %%ymm4,%%ymm1,%%ymm1           \n"
+      "vpavgw     %%ymm5,%%ymm0,%%ymm0           \n"
+      "vpavgw     %%ymm5,%%ymm1,%%ymm1           \n"
+      "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
+      "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
+      "vmovdqu    %%ymm0,(%1)                    \n"
+      "lea        0x20(%1),%1                    \n"
+      "sub        $0x20,%2                       \n"
+      "jg         1b                             \n"
+      "vzeroupper                                \n"
+      : "+r"(src_ptr),   // %0
+        "+r"(dst_ptr),   // %1
+        "+r"(dst_width)  // %2
+        ::"memory",
+        "cc", "xmm0", "xmm1", "xmm4", "xmm5");
+}
+
+void ScaleRowDown2Box_AVX2(const uint8_t* src_ptr,
+                           ptrdiff_t src_stride,
+                           uint8_t* dst_ptr,
+                           int dst_width) {
+  asm volatile(
+      "vpcmpeqb   %%ymm4,%%ymm4,%%ymm4           \n"
+      "vpsrlw     $0xf,%%ymm4,%%ymm4             \n"
+      "vpackuswb  %%ymm4,%%ymm4,%%ymm4           \n"
+      "vpxor      %%ymm5,%%ymm5,%%ymm5           \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu    (%0),%%ymm0                    \n"
+      "vmovdqu    0x20(%0),%%ymm1                \n"
+      "vmovdqu    0x00(%0,%3,1),%%ymm2           \n"
+      "vmovdqu    0x20(%0,%3,1),%%ymm3           \n"
+      "lea        0x40(%0),%0                    \n"
+      "vpmaddubsw %%ymm4,%%ymm0,%%ymm0           \n"
+      "vpmaddubsw %%ymm4,%%ymm1,%%ymm1           \n"
+      "vpmaddubsw %%ymm4,%%ymm2,%%ymm2           \n"
+      "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
+      "vpaddw     %%ymm2,%%ymm0,%%ymm0           \n"
+      "vpaddw     %%ymm3,%%ymm1,%%ymm1           \n"
+      "vpsrlw     $0x1,%%ymm0,%%ymm0             \n"
+      "vpsrlw     $0x1,%%ymm1,%%ymm1             \n"
+      "vpavgw     %%ymm5,%%ymm0,%%ymm0           \n"
+      "vpavgw     %%ymm5,%%ymm1,%%ymm1           \n"
+      "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
+      "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
+      "vmovdqu    %%ymm0,(%1)                    \n"
+      "lea        0x20(%1),%1                    \n"
+      "sub        $0x20,%2                       \n"
+      "jg         1b                             \n"
+      "vzeroupper                                \n"
+      : "+r"(src_ptr),               // %0
+        "+r"(dst_ptr),               // %1
+        "+r"(dst_width)              // %2
+      : "r"((intptr_t)(src_stride))  // %3
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
+}
+#endif  // HAS_SCALEROWDOWN2_AVX2
+
+void ScaleRowDown4_SSSE3(const uint8_t* src_ptr,
+                         ptrdiff_t src_stride,
+                         uint8_t* dst_ptr,
+                         int dst_width) {
+  (void)src_stride;
+  asm volatile(
+      "pcmpeqb   %%xmm5,%%xmm5                   \n"
+      "psrld     $0x18,%%xmm5                    \n"
+      "pslld     $0x10,%%xmm5                    \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqu    0x10(%0),%%xmm1                 \n"
+      "lea       0x20(%0),%0                     \n"
+      "pand      %%xmm5,%%xmm0                   \n"
+      "pand      %%xmm5,%%xmm1                   \n"
+      "packuswb  %%xmm1,%%xmm0                   \n"
+      "psrlw     $0x8,%%xmm0                     \n"
+      "packuswb  %%xmm0,%%xmm0                   \n"
+      "movq      %%xmm0,(%1)                     \n"
+      "lea       0x8(%1),%1                      \n"
+      "sub       $0x8,%2                         \n"
+      "jg        1b                              \n"
+      : "+r"(src_ptr),   // %0
+        "+r"(dst_ptr),   // %1
+        "+r"(dst_width)  // %2
+        ::"memory",
+        "cc", "xmm0", "xmm1", "xmm5");
+}
+
+void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint8_t* dst_ptr,
+                            int dst_width) {
+  intptr_t stridex3;
+  asm volatile(
+      "pcmpeqb    %%xmm4,%%xmm4                  \n"
+      "psrlw      $0xf,%%xmm4                    \n"
+      "movdqa     %%xmm4,%%xmm5                  \n"
+      "packuswb   %%xmm4,%%xmm4                  \n"
+      "psllw      $0x3,%%xmm5                    \n"
+      "lea       0x00(%4,%4,2),%3                \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqu    0x10(%0),%%xmm1                 \n"
+      "movdqu    0x00(%0,%4,1),%%xmm2            \n"
+      "movdqu    0x10(%0,%4,1),%%xmm3            \n"
+      "pmaddubsw  %%xmm4,%%xmm0                  \n"
+      "pmaddubsw  %%xmm4,%%xmm1                  \n"
+      "pmaddubsw  %%xmm4,%%xmm2                  \n"
+      "pmaddubsw  %%xmm4,%%xmm3                  \n"
+      "paddw      %%xmm2,%%xmm0                  \n"
+      "paddw      %%xmm3,%%xmm1                  \n"
+      "movdqu    0x00(%0,%4,2),%%xmm2            \n"
+      "movdqu    0x10(%0,%4,2),%%xmm3            \n"
+      "pmaddubsw  %%xmm4,%%xmm2                  \n"
+      "pmaddubsw  %%xmm4,%%xmm3                  \n"
+      "paddw      %%xmm2,%%xmm0                  \n"
+      "paddw      %%xmm3,%%xmm1                  \n"
+      "movdqu    0x00(%0,%3,1),%%xmm2            \n"
+      "movdqu    0x10(%0,%3,1),%%xmm3            \n"
+      "lea       0x20(%0),%0                     \n"
+      "pmaddubsw  %%xmm4,%%xmm2                  \n"
+      "pmaddubsw  %%xmm4,%%xmm3                  \n"
+      "paddw      %%xmm2,%%xmm0                  \n"
+      "paddw      %%xmm3,%%xmm1                  \n"
+      "phaddw     %%xmm1,%%xmm0                  \n"
+      "paddw      %%xmm5,%%xmm0                  \n"
+      "psrlw      $0x4,%%xmm0                    \n"
+      "packuswb   %%xmm0,%%xmm0                  \n"
+      "movq      %%xmm0,(%1)                     \n"
+      "lea       0x8(%1),%1                      \n"
+      "sub       $0x8,%2                         \n"
+      "jg        1b                              \n"
+      : "+r"(src_ptr),               // %0
+        "+r"(dst_ptr),               // %1
+        "+r"(dst_width),             // %2
+        "=&r"(stridex3)              // %3
+      : "r"((intptr_t)(src_stride))  // %4
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+
+#ifdef HAS_SCALEROWDOWN4_AVX2
+void ScaleRowDown4_AVX2(const uint8_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        uint8_t* dst_ptr,
+                        int dst_width) {
+  (void)src_stride;
+  asm volatile(
+      "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
+      "vpsrld     $0x18,%%ymm5,%%ymm5            \n"
+      "vpslld     $0x10,%%ymm5,%%ymm5            \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu    (%0),%%ymm0                    \n"
+      "vmovdqu    0x20(%0),%%ymm1                \n"
+      "lea        0x40(%0),%0                    \n"
+      "vpand      %%ymm5,%%ymm0,%%ymm0           \n"
+      "vpand      %%ymm5,%%ymm1,%%ymm1           \n"
+      "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
+      "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
+      "vpsrlw     $0x8,%%ymm0,%%ymm0             \n"
+      "vpackuswb  %%ymm0,%%ymm0,%%ymm0           \n"
+      "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
+      "vmovdqu    %%xmm0,(%1)                    \n"
+      "lea        0x10(%1),%1                    \n"
+      "sub        $0x10,%2                       \n"
+      "jg         1b                             \n"
+      "vzeroupper                                \n"
+      : "+r"(src_ptr),   // %0
+        "+r"(dst_ptr),   // %1
+        "+r"(dst_width)  // %2
+        ::"memory",
+        "cc", "xmm0", "xmm1", "xmm5");
+}
+
+void ScaleRowDown4Box_AVX2(const uint8_t* src_ptr,
+                           ptrdiff_t src_stride,
+                           uint8_t* dst_ptr,
+                           int dst_width) {
+  asm volatile(
+      "vpcmpeqb   %%ymm4,%%ymm4,%%ymm4           \n"
+      "vpsrlw     $0xf,%%ymm4,%%ymm4             \n"
+      "vpsllw     $0x3,%%ymm4,%%ymm5             \n"
+      "vpackuswb  %%ymm4,%%ymm4,%%ymm4           \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu    (%0),%%ymm0                    \n"
+      "vmovdqu    0x20(%0),%%ymm1                \n"
+      "vmovdqu    0x00(%0,%3,1),%%ymm2           \n"
+      "vmovdqu    0x20(%0,%3,1),%%ymm3           \n"
+      "vpmaddubsw %%ymm4,%%ymm0,%%ymm0           \n"
+      "vpmaddubsw %%ymm4,%%ymm1,%%ymm1           \n"
+      "vpmaddubsw %%ymm4,%%ymm2,%%ymm2           \n"
+      "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
+      "vpaddw     %%ymm2,%%ymm0,%%ymm0           \n"
+      "vpaddw     %%ymm3,%%ymm1,%%ymm1           \n"
+      "vmovdqu    0x00(%0,%3,2),%%ymm2           \n"
+      "vmovdqu    0x20(%0,%3,2),%%ymm3           \n"
+      "vpmaddubsw %%ymm4,%%ymm2,%%ymm2           \n"
+      "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
+      "vpaddw     %%ymm2,%%ymm0,%%ymm0           \n"
+      "vpaddw     %%ymm3,%%ymm1,%%ymm1           \n"
+      "vmovdqu    0x00(%0,%4,1),%%ymm2           \n"
+      "vmovdqu    0x20(%0,%4,1),%%ymm3           \n"
+      "lea        0x40(%0),%0                    \n"
+      "vpmaddubsw %%ymm4,%%ymm2,%%ymm2           \n"
+      "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
+      "vpaddw     %%ymm2,%%ymm0,%%ymm0           \n"
+      "vpaddw     %%ymm3,%%ymm1,%%ymm1           \n"
+      "vphaddw    %%ymm1,%%ymm0,%%ymm0           \n"
+      "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
+      "vpaddw     %%ymm5,%%ymm0,%%ymm0           \n"
+      "vpsrlw     $0x4,%%ymm0,%%ymm0             \n"
+      "vpackuswb  %%ymm0,%%ymm0,%%ymm0           \n"
+      "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
+      "vmovdqu    %%xmm0,(%1)                    \n"
+      "lea        0x10(%1),%1                    \n"
+      "sub        $0x10,%2                       \n"
+      "jg         1b                             \n"
+      "vzeroupper                                \n"
+      : "+r"(src_ptr),                   // %0
+        "+r"(dst_ptr),                   // %1
+        "+r"(dst_width)                  // %2
+      : "r"((intptr_t)(src_stride)),     // %3
+        "r"((intptr_t)(src_stride * 3))  // %4
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+#endif  // HAS_SCALEROWDOWN4_AVX2
+
+void ScaleRowDown34_SSSE3(const uint8_t* src_ptr,
+                          ptrdiff_t src_stride,
+                          uint8_t* dst_ptr,
+                          int dst_width) {
+  (void)src_stride;
+  asm volatile(
+      "movdqa    %0,%%xmm3                       \n"
+      "movdqa    %1,%%xmm4                       \n"
+      "movdqa    %2,%%xmm5                       \n"
+      :
+      : "m"(kShuf0),  // %0
+        "m"(kShuf1),  // %1
+        "m"(kShuf2)   // %2
+      );
+  asm volatile(
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqu    0x10(%0),%%xmm2                 \n"
+      "lea       0x20(%0),%0                     \n"
+      "movdqa    %%xmm2,%%xmm1                   \n"
+      "palignr   $0x8,%%xmm0,%%xmm1              \n"
+      "pshufb    %%xmm3,%%xmm0                   \n"
+      "pshufb    %%xmm4,%%xmm1                   \n"
+      "pshufb    %%xmm5,%%xmm2                   \n"
+      "movq      %%xmm0,(%1)                     \n"
+      "movq      %%xmm1,0x8(%1)                  \n"
+      "movq      %%xmm2,0x10(%1)                 \n"
+      "lea       0x18(%1),%1                     \n"
+      "sub       $0x18,%2                        \n"
+      "jg        1b                              \n"
+      : "+r"(src_ptr),   // %0
+        "+r"(dst_ptr),   // %1
+        "+r"(dst_width)  // %2
+        ::"memory",
+        "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+
+void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr,
+                                ptrdiff_t src_stride,
+                                uint8_t* dst_ptr,
+                                int dst_width) {
+  asm volatile(
+      "movdqa    %0,%%xmm2                       \n"  // kShuf01
+      "movdqa    %1,%%xmm3                       \n"  // kShuf11
+      "movdqa    %2,%%xmm4                       \n"  // kShuf21
+      :
+      : "m"(kShuf01),  // %0
+        "m"(kShuf11),  // %1
+        "m"(kShuf21)   // %2
+      );
+  asm volatile(
+      "movdqa    %0,%%xmm5                       \n"  // kMadd01
+      "movdqa    %1,%%xmm0                       \n"  // kMadd11
+      "movdqa    %2,%%xmm1                       \n"  // kRound34
+      :
+      : "m"(kMadd01),  // %0
+        "m"(kMadd11),  // %1
+        "m"(kRound34)  // %2
+      );
+  asm volatile(
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm6                     \n"
+      "movdqu    0x00(%0,%3,1),%%xmm7            \n"
+      "pavgb     %%xmm7,%%xmm6                   \n"
+      "pshufb    %%xmm2,%%xmm6                   \n"
+      "pmaddubsw %%xmm5,%%xmm6                   \n"
+      "paddsw    %%xmm1,%%xmm6                   \n"
+      "psrlw     $0x2,%%xmm6                     \n"
+      "packuswb  %%xmm6,%%xmm6                   \n"
+      "movq      %%xmm6,(%1)                     \n"
+      "movdqu    0x8(%0),%%xmm6                  \n"
+      "movdqu    0x8(%0,%3,1),%%xmm7             \n"
+      "pavgb     %%xmm7,%%xmm6                   \n"
+      "pshufb    %%xmm3,%%xmm6                   \n"
+      "pmaddubsw %%xmm0,%%xmm6                   \n"
+      "paddsw    %%xmm1,%%xmm6                   \n"
+      "psrlw     $0x2,%%xmm6                     \n"
+      "packuswb  %%xmm6,%%xmm6                   \n"
+      "movq      %%xmm6,0x8(%1)                  \n"
+      "movdqu    0x10(%0),%%xmm6                 \n"
+      "movdqu    0x10(%0,%3,1),%%xmm7            \n"
+      "lea       0x20(%0),%0                     \n"
+      "pavgb     %%xmm7,%%xmm6                   \n"
+      "pshufb    %%xmm4,%%xmm6                   \n"
+      "pmaddubsw %4,%%xmm6                       \n"
+      "paddsw    %%xmm1,%%xmm6                   \n"
+      "psrlw     $0x2,%%xmm6                     \n"
+      "packuswb  %%xmm6,%%xmm6                   \n"
+      "movq      %%xmm6,0x10(%1)                 \n"
+      "lea       0x18(%1),%1                     \n"
+      "sub       $0x18,%2                        \n"
+      "jg        1b                              \n"
+      : "+r"(src_ptr),                // %0
+        "+r"(dst_ptr),                // %1
+        "+r"(dst_width)               // %2
+      : "r"((intptr_t)(src_stride)),  // %3
+        "m"(kMadd21)                  // %4
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+        "xmm7");
+}
+
+void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr,
+                                ptrdiff_t src_stride,
+                                uint8_t* dst_ptr,
+                                int dst_width) {
+  asm volatile(
+      "movdqa    %0,%%xmm2                       \n"  // kShuf01
+      "movdqa    %1,%%xmm3                       \n"  // kShuf11
+      "movdqa    %2,%%xmm4                       \n"  // kShuf21
+      :
+      : "m"(kShuf01),  // %0
+        "m"(kShuf11),  // %1
+        "m"(kShuf21)   // %2
+      );
+  asm volatile(
+      "movdqa    %0,%%xmm5                       \n"  // kMadd01
+      "movdqa    %1,%%xmm0                       \n"  // kMadd11
+      "movdqa    %2,%%xmm1                       \n"  // kRound34
+      :
+      : "m"(kMadd01),  // %0
+        "m"(kMadd11),  // %1
+        "m"(kRound34)  // %2
+      );
+
+  asm volatile(
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm6                     \n"
+      "movdqu    0x00(%0,%3,1),%%xmm7            \n"
+      "pavgb     %%xmm6,%%xmm7                   \n"
+      "pavgb     %%xmm7,%%xmm6                   \n"
+      "pshufb    %%xmm2,%%xmm6                   \n"
+      "pmaddubsw %%xmm5,%%xmm6                   \n"
+      "paddsw    %%xmm1,%%xmm6                   \n"
+      "psrlw     $0x2,%%xmm6                     \n"
+      "packuswb  %%xmm6,%%xmm6                   \n"
+      "movq      %%xmm6,(%1)                     \n"
+      "movdqu    0x8(%0),%%xmm6                  \n"
+      "movdqu    0x8(%0,%3,1),%%xmm7             \n"
+      "pavgb     %%xmm6,%%xmm7                   \n"
+      "pavgb     %%xmm7,%%xmm6                   \n"
+      "pshufb    %%xmm3,%%xmm6                   \n"
+      "pmaddubsw %%xmm0,%%xmm6                   \n"
+      "paddsw    %%xmm1,%%xmm6                   \n"
+      "psrlw     $0x2,%%xmm6                     \n"
+      "packuswb  %%xmm6,%%xmm6                   \n"
+      "movq      %%xmm6,0x8(%1)                  \n"
+      "movdqu    0x10(%0),%%xmm6                 \n"
+      "movdqu    0x10(%0,%3,1),%%xmm7            \n"
+      "lea       0x20(%0),%0                     \n"
+      "pavgb     %%xmm6,%%xmm7                   \n"
+      "pavgb     %%xmm7,%%xmm6                   \n"
+      "pshufb    %%xmm4,%%xmm6                   \n"
+      "pmaddubsw %4,%%xmm6                       \n"
+      "paddsw    %%xmm1,%%xmm6                   \n"
+      "psrlw     $0x2,%%xmm6                     \n"
+      "packuswb  %%xmm6,%%xmm6                   \n"
+      "movq      %%xmm6,0x10(%1)                 \n"
+      "lea       0x18(%1),%1                     \n"
+      "sub       $0x18,%2                        \n"
+      "jg        1b                              \n"
+      : "+r"(src_ptr),                // %0
+        "+r"(dst_ptr),                // %1
+        "+r"(dst_width)               // %2
+      : "r"((intptr_t)(src_stride)),  // %3
+        "m"(kMadd21)                  // %4
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+        "xmm7");
+}
+
+void ScaleRowDown38_SSSE3(const uint8_t* src_ptr,
+                          ptrdiff_t src_stride,
+                          uint8_t* dst_ptr,
+                          int dst_width) {
+  (void)src_stride;
+  asm volatile(
+      "movdqa    %3,%%xmm4                       \n"
+      "movdqa    %4,%%xmm5                       \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqu    0x10(%0),%%xmm1                 \n"
+      "lea       0x20(%0),%0                     \n"
+      "pshufb    %%xmm4,%%xmm0                   \n"
+      "pshufb    %%xmm5,%%xmm1                   \n"
+      "paddusb   %%xmm1,%%xmm0                   \n"
+      "movq      %%xmm0,(%1)                     \n"
+      "movhlps   %%xmm0,%%xmm1                   \n"
+      "movd      %%xmm1,0x8(%1)                  \n"
+      "lea       0xc(%1),%1                      \n"
+      "sub       $0xc,%2                         \n"
+      "jg        1b                              \n"
+      : "+r"(src_ptr),   // %0
+        "+r"(dst_ptr),   // %1
+        "+r"(dst_width)  // %2
+      : "m"(kShuf38a),   // %3
+        "m"(kShuf38b)    // %4
+      : "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5");
+}
+
+void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr,
+                                ptrdiff_t src_stride,
+                                uint8_t* dst_ptr,
+                                int dst_width) {
+  asm volatile(
+      "movdqa    %0,%%xmm2                       \n"
+      "movdqa    %1,%%xmm3                       \n"
+      "movdqa    %2,%%xmm4                       \n"
+      "movdqa    %3,%%xmm5                       \n"
+      :
+      : "m"(kShufAb0),  // %0
+        "m"(kShufAb1),  // %1
+        "m"(kShufAb2),  // %2
+        "m"(kScaleAb2)  // %3
+      );
+  asm volatile(
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqu    0x00(%0,%3,1),%%xmm1            \n"
+      "lea       0x10(%0),%0                     \n"
+      "pavgb     %%xmm1,%%xmm0                   \n"
+      "movdqa    %%xmm0,%%xmm1                   \n"
+      "pshufb    %%xmm2,%%xmm1                   \n"
+      "movdqa    %%xmm0,%%xmm6                   \n"
+      "pshufb    %%xmm3,%%xmm6                   \n"
+      "paddusw   %%xmm6,%%xmm1                   \n"
+      "pshufb    %%xmm4,%%xmm0                   \n"
+      "paddusw   %%xmm0,%%xmm1                   \n"
+      "pmulhuw   %%xmm5,%%xmm1                   \n"
+      "packuswb  %%xmm1,%%xmm1                   \n"
+      "movd      %%xmm1,(%1)                     \n"
+      "psrlq     $0x10,%%xmm1                    \n"
+      "movd      %%xmm1,0x2(%1)                  \n"
+      "lea       0x6(%1),%1                      \n"
+      "sub       $0x6,%2                         \n"
+      "jg        1b                              \n"
+      : "+r"(src_ptr),               // %0
+        "+r"(dst_ptr),               // %1
+        "+r"(dst_width)              // %2
+      : "r"((intptr_t)(src_stride))  // %3
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+
+void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr,
+                                ptrdiff_t src_stride,
+                                uint8_t* dst_ptr,
+                                int dst_width) {
+  asm volatile(
+      "movdqa    %0,%%xmm2                       \n"
+      "movdqa    %1,%%xmm3                       \n"
+      "movdqa    %2,%%xmm4                       \n"
+      "pxor      %%xmm5,%%xmm5                   \n"
+      :
+      : "m"(kShufAc),    // %0
+        "m"(kShufAc3),   // %1
+        "m"(kScaleAc33)  // %2
+      );
+  asm volatile(
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqu    0x00(%0,%3,1),%%xmm6            \n"
+      "movhlps   %%xmm0,%%xmm1                   \n"
+      "movhlps   %%xmm6,%%xmm7                   \n"
+      "punpcklbw %%xmm5,%%xmm0                   \n"
+      "punpcklbw %%xmm5,%%xmm1                   \n"
+      "punpcklbw %%xmm5,%%xmm6                   \n"
+      "punpcklbw %%xmm5,%%xmm7                   \n"
+      "paddusw   %%xmm6,%%xmm0                   \n"
+      "paddusw   %%xmm7,%%xmm1                   \n"
+      "movdqu    0x00(%0,%3,2),%%xmm6            \n"
+      "lea       0x10(%0),%0                     \n"
+      "movhlps   %%xmm6,%%xmm7                   \n"
+      "punpcklbw %%xmm5,%%xmm6                   \n"
+      "punpcklbw %%xmm5,%%xmm7                   \n"
+      "paddusw   %%xmm6,%%xmm0                   \n"
+      "paddusw   %%xmm7,%%xmm1                   \n"
+      "movdqa    %%xmm0,%%xmm6                   \n"
+      "psrldq    $0x2,%%xmm0                     \n"
+      "paddusw   %%xmm0,%%xmm6                   \n"
+      "psrldq    $0x2,%%xmm0                     \n"
+      "paddusw   %%xmm0,%%xmm6                   \n"
+      "pshufb    %%xmm2,%%xmm6                   \n"
+      "movdqa    %%xmm1,%%xmm7                   \n"
+      "psrldq    $0x2,%%xmm1                     \n"
+      "paddusw   %%xmm1,%%xmm7                   \n"
+      "psrldq    $0x2,%%xmm1                     \n"
+      "paddusw   %%xmm1,%%xmm7                   \n"
+      "pshufb    %%xmm3,%%xmm7                   \n"
+      "paddusw   %%xmm7,%%xmm6                   \n"
+      "pmulhuw   %%xmm4,%%xmm6                   \n"
+      "packuswb  %%xmm6,%%xmm6                   \n"
+      "movd      %%xmm6,(%1)                     \n"
+      "psrlq     $0x10,%%xmm6                    \n"
+      "movd      %%xmm6,0x2(%1)                  \n"
+      "lea       0x6(%1),%1                      \n"
+      "sub       $0x6,%2                         \n"
+      "jg        1b                              \n"
+      : "+r"(src_ptr),               // %0
+        "+r"(dst_ptr),               // %1
+        "+r"(dst_width)              // %2
+      : "r"((intptr_t)(src_stride))  // %3
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+        "xmm7");
+}
+
+// Reads 16xN bytes and produces 16 shorts at a time.
+void ScaleAddRow_SSE2(const uint8_t* src_ptr,
+                      uint16_t* dst_ptr,
+                      int src_width) {
+  asm volatile(
+
+      "pxor      %%xmm5,%%xmm5                   \n"
+
+      // 16 pixel loop.
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm3                     \n"
+      "lea       0x10(%0),%0                     \n"  // src_ptr += 16
+      "movdqu    (%1),%%xmm0                     \n"
+      "movdqu    0x10(%1),%%xmm1                 \n"
+      "movdqa    %%xmm3,%%xmm2                   \n"
+      "punpcklbw %%xmm5,%%xmm2                   \n"
+      "punpckhbw %%xmm5,%%xmm3                   \n"
+      "paddusw   %%xmm2,%%xmm0                   \n"
+      "paddusw   %%xmm3,%%xmm1                   \n"
+      "movdqu    %%xmm0,(%1)                     \n"
+      "movdqu    %%xmm1,0x10(%1)                 \n"
+      "lea       0x20(%1),%1                     \n"
+      "sub       $0x10,%2                        \n"
+      "jg        1b                              \n"
+      : "+r"(src_ptr),   // %0
+        "+r"(dst_ptr),   // %1
+        "+r"(src_width)  // %2
+      :
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
+}
+
+#ifdef HAS_SCALEADDROW_AVX2
+// Reads 32 bytes and accumulates to 32 shorts at a time.
+void ScaleAddRow_AVX2(const uint8_t* src_ptr,
+                      uint16_t* dst_ptr,
+                      int src_width) {
+  asm volatile(
+
+      "vpxor      %%ymm5,%%ymm5,%%ymm5           \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu    (%0),%%ymm3                    \n"
+      "lea        0x20(%0),%0                    \n"  // src_ptr += 32
+      "vpermq     $0xd8,%%ymm3,%%ymm3            \n"
+      "vpunpcklbw %%ymm5,%%ymm3,%%ymm2           \n"
+      "vpunpckhbw %%ymm5,%%ymm3,%%ymm3           \n"
+      "vpaddusw   (%1),%%ymm2,%%ymm0             \n"
+      "vpaddusw   0x20(%1),%%ymm3,%%ymm1         \n"
+      "vmovdqu    %%ymm0,(%1)                    \n"
+      "vmovdqu    %%ymm1,0x20(%1)                \n"
+      "lea       0x40(%1),%1                     \n"
+      "sub       $0x20,%2                        \n"
+      "jg        1b                              \n"
+      "vzeroupper                                \n"
+      : "+r"(src_ptr),   // %0
+        "+r"(dst_ptr),   // %1
+        "+r"(src_width)  // %2
+      :
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
+}
+#endif  // HAS_SCALEADDROW_AVX2
+
+// Constant for making pixels signed to avoid pmaddubsw
+// saturation.
+static const uvec8 kFsub80 = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+                              0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};
+
+// Constant for making pixels unsigned and adding .5 for rounding.
+static const uvec16 kFadd40 = {0x4040, 0x4040, 0x4040, 0x4040,
+                               0x4040, 0x4040, 0x4040, 0x4040};
+
+// Bilinear column filtering. SSSE3 version.
+void ScaleFilterCols_SSSE3(uint8_t* dst_ptr,
+                           const uint8_t* src_ptr,
+                           int dst_width,
+                           int x,
+                           int dx) {
+  intptr_t x0, x1, temp_pixel;
+  asm volatile(
+      "movd      %6,%%xmm2                       \n"
+      "movd      %7,%%xmm3                       \n"
+      "movl      $0x04040000,%k2                 \n"
+      "movd      %k2,%%xmm5                      \n"
+      "pcmpeqb   %%xmm6,%%xmm6                   \n"
+      "psrlw     $0x9,%%xmm6                     \n"  // 0x007f007f
+      "pcmpeqb   %%xmm7,%%xmm7                   \n"
+      "psrlw     $15,%%xmm7                      \n"  // 0x00010001
+
+      "pextrw    $0x1,%%xmm2,%k3                 \n"
+      "subl      $0x2,%5                         \n"
+      "jl        29f                             \n"
+      "movdqa    %%xmm2,%%xmm0                   \n"
+      "paddd     %%xmm3,%%xmm0                   \n"
+      "punpckldq %%xmm0,%%xmm2                   \n"
+      "punpckldq %%xmm3,%%xmm3                   \n"
+      "paddd     %%xmm3,%%xmm3                   \n"
+      "pextrw    $0x3,%%xmm2,%k4                 \n"
+
+      LABELALIGN
+      "2:                                        \n"
+      "movdqa    %%xmm2,%%xmm1                   \n"
+      "paddd     %%xmm3,%%xmm2                   \n"
+      "movzwl    0x00(%1,%3,1),%k2               \n"
+      "movd      %k2,%%xmm0                      \n"
+      "psrlw     $0x9,%%xmm1                     \n"
+      "movzwl    0x00(%1,%4,1),%k2               \n"
+      "movd      %k2,%%xmm4                      \n"
+      "pshufb    %%xmm5,%%xmm1                   \n"
+      "punpcklwd %%xmm4,%%xmm0                   \n"
+      "psubb     %8,%%xmm0                       \n"  // make pixels signed.
+      "pxor      %%xmm6,%%xmm1                   \n"  // 128 - f = (f ^ 127 ) +
+                                                      // 1
+      "paddusb   %%xmm7,%%xmm1                   \n"
+      "pmaddubsw %%xmm0,%%xmm1                   \n"
+      "pextrw    $0x1,%%xmm2,%k3                 \n"
+      "pextrw    $0x3,%%xmm2,%k4                 \n"
+      "paddw     %9,%%xmm1                       \n"  // make pixels unsigned.
+      "psrlw     $0x7,%%xmm1                     \n"
+      "packuswb  %%xmm1,%%xmm1                   \n"
+      "movd      %%xmm1,%k2                      \n"
+      "mov       %w2,(%0)                        \n"
+      "lea       0x2(%0),%0                      \n"
+      "subl      $0x2,%5                         \n"
+      "jge       2b                              \n"
+
+      LABELALIGN
+      "29:                                       \n"
+      "addl      $0x1,%5                         \n"
+      "jl        99f                             \n"
+      "movzwl    0x00(%1,%3,1),%k2               \n"
+      "movd      %k2,%%xmm0                      \n"
+      "psrlw     $0x9,%%xmm2                     \n"
+      "pshufb    %%xmm5,%%xmm2                   \n"
+      "psubb     %8,%%xmm0                       \n"  // make pixels signed.
+      "pxor      %%xmm6,%%xmm2                   \n"
+      "paddusb   %%xmm7,%%xmm2                   \n"
+      "pmaddubsw %%xmm0,%%xmm2                   \n"
+      "paddw     %9,%%xmm2                       \n"  // make pixels unsigned.
+      "psrlw     $0x7,%%xmm2                     \n"
+      "packuswb  %%xmm2,%%xmm2                   \n"
+      "movd      %%xmm2,%k2                      \n"
+      "mov       %b2,(%0)                        \n"
+      "99:                                       \n"
+      : "+r"(dst_ptr),      // %0
+        "+r"(src_ptr),      // %1
+        "=&a"(temp_pixel),  // %2
+        "=&r"(x0),          // %3
+        "=&r"(x1),          // %4
+#if defined(__x86_64__)
+        "+rm"(dst_width)  // %5
+#else
+        "+m"(dst_width)  // %5
+#endif
+      : "rm"(x),   // %6
+        "rm"(dx),  // %7
+#if defined(__x86_64__)
+        "x"(kFsub80),  // %8
+        "x"(kFadd40)   // %9
+#else
+        "m"(kFsub80),    // %8
+        "m"(kFadd40)     // %9
+#endif
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+        "xmm7");
+}
+
+// Reads 4 pixels, duplicates them and writes 8 pixels.
+// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
+void ScaleColsUp2_SSE2(uint8_t* dst_ptr,
+                       const uint8_t* src_ptr,
+                       int dst_width,
+                       int x,
+                       int dx) {
+  (void)x;
+  (void)dx;
+  asm volatile(
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%1),%%xmm0                     \n"
+      "lea       0x10(%1),%1                     \n"
+      "movdqa    %%xmm0,%%xmm1                   \n"
+      "punpcklbw %%xmm0,%%xmm0                   \n"
+      "punpckhbw %%xmm1,%%xmm1                   \n"
+      "movdqu    %%xmm0,(%0)                     \n"
+      "movdqu    %%xmm1,0x10(%0)                 \n"
+      "lea       0x20(%0),%0                     \n"
+      "sub       $0x20,%2                        \n"
+      "jg        1b                              \n"
+
+      : "+r"(dst_ptr),   // %0
+        "+r"(src_ptr),   // %1
+        "+r"(dst_width)  // %2
+        ::"memory",
+        "cc", "xmm0", "xmm1");
+}
+
+void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb,
+                            ptrdiff_t src_stride,
+                            uint8_t* dst_argb,
+                            int dst_width) {
+  (void)src_stride;
+  asm volatile(
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqu    0x10(%0),%%xmm1                 \n"
+      "lea       0x20(%0),%0                     \n"
+      "shufps    $0xdd,%%xmm1,%%xmm0             \n"
+      "movdqu    %%xmm0,(%1)                     \n"
+      "lea       0x10(%1),%1                     \n"
+      "sub       $0x4,%2                         \n"
+      "jg        1b                              \n"
+      : "+r"(src_argb),  // %0
+        "+r"(dst_argb),  // %1
+        "+r"(dst_width)  // %2
+        ::"memory",
+        "cc", "xmm0", "xmm1");
+}
+
+void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb,
+                                  ptrdiff_t src_stride,
+                                  uint8_t* dst_argb,
+                                  int dst_width) {
+  (void)src_stride;
+  asm volatile(
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqu    0x10(%0),%%xmm1                 \n"
+      "lea       0x20(%0),%0                     \n"
+      "movdqa    %%xmm0,%%xmm2                   \n"
+      "shufps    $0x88,%%xmm1,%%xmm0             \n"
+      "shufps    $0xdd,%%xmm1,%%xmm2             \n"
+      "pavgb     %%xmm2,%%xmm0                   \n"
+      "movdqu    %%xmm0,(%1)                     \n"
+      "lea       0x10(%1),%1                     \n"
+      "sub       $0x4,%2                         \n"
+      "jg        1b                              \n"
+      : "+r"(src_argb),  // %0
+        "+r"(dst_argb),  // %1
+        "+r"(dst_width)  // %2
+        ::"memory",
+        "cc", "xmm0", "xmm1");
+}
+
+void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb,
+                               ptrdiff_t src_stride,
+                               uint8_t* dst_argb,
+                               int dst_width) {
+  asm volatile(
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqu    0x10(%0),%%xmm1                 \n"
+      "movdqu    0x00(%0,%3,1),%%xmm2            \n"
+      "movdqu    0x10(%0,%3,1),%%xmm3            \n"
+      "lea       0x20(%0),%0                     \n"
+      "pavgb     %%xmm2,%%xmm0                   \n"
+      "pavgb     %%xmm3,%%xmm1                   \n"
+      "movdqa    %%xmm0,%%xmm2                   \n"
+      "shufps    $0x88,%%xmm1,%%xmm0             \n"
+      "shufps    $0xdd,%%xmm1,%%xmm2             \n"
+      "pavgb     %%xmm2,%%xmm0                   \n"
+      "movdqu    %%xmm0,(%1)                     \n"
+      "lea       0x10(%1),%1                     \n"
+      "sub       $0x4,%2                         \n"
+      "jg        1b                              \n"
+      : "+r"(src_argb),              // %0
+        "+r"(dst_argb),              // %1
+        "+r"(dst_width)              // %2
+      : "r"((intptr_t)(src_stride))  // %3
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
+}
+
+// Reads 4 pixels at a time.
+// Alignment requirement: dst_argb 16 byte aligned.
+void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb,
+                               ptrdiff_t src_stride,
+                               int src_stepx,
+                               uint8_t* dst_argb,
+                               int dst_width) {
+  intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
+  intptr_t src_stepx_x12;
+  (void)src_stride;
+  asm volatile(
+      "lea       0x00(,%1,4),%1                  \n"
+      "lea       0x00(%1,%1,2),%4                \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movd      (%0),%%xmm0                     \n"
+      "movd      0x00(%0,%1,1),%%xmm1            \n"
+      "punpckldq %%xmm1,%%xmm0                   \n"
+      "movd      0x00(%0,%1,2),%%xmm2            \n"
+      "movd      0x00(%0,%4,1),%%xmm3            \n"
+      "lea       0x00(%0,%1,4),%0                \n"
+      "punpckldq %%xmm3,%%xmm2                   \n"
+      "punpcklqdq %%xmm2,%%xmm0                  \n"
+      "movdqu    %%xmm0,(%2)                     \n"
+      "lea       0x10(%2),%2                     \n"
+      "sub       $0x4,%3                         \n"
+      "jg        1b                              \n"
+      : "+r"(src_argb),       // %0
+        "+r"(src_stepx_x4),   // %1
+        "+r"(dst_argb),       // %2
+        "+r"(dst_width),      // %3
+        "=&r"(src_stepx_x12)  // %4
+        ::"memory",
+        "cc", "xmm0", "xmm1", "xmm2", "xmm3");
+}
+
+// Blends four 2x2 to 4x1.
+// Alignment requirement: dst_argb 16 byte aligned.
+void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb,
+                                  ptrdiff_t src_stride,
+                                  int src_stepx,
+                                  uint8_t* dst_argb,
+                                  int dst_width) {
+  intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
+  intptr_t src_stepx_x12;
+  intptr_t row1 = (intptr_t)(src_stride);
+  asm volatile(
+      "lea       0x00(,%1,4),%1                  \n"
+      "lea       0x00(%1,%1,2),%4                \n"
+      "lea       0x00(%0,%5,1),%5                \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movq      (%0),%%xmm0                     \n"
+      "movhps    0x00(%0,%1,1),%%xmm0            \n"
+      "movq      0x00(%0,%1,2),%%xmm1            \n"
+      "movhps    0x00(%0,%4,1),%%xmm1            \n"
+      "lea       0x00(%0,%1,4),%0                \n"
+      "movq      (%5),%%xmm2                     \n"
+      "movhps    0x00(%5,%1,1),%%xmm2            \n"
+      "movq      0x00(%5,%1,2),%%xmm3            \n"
+      "movhps    0x00(%5,%4,1),%%xmm3            \n"
+      "lea       0x00(%5,%1,4),%5                \n"
+      "pavgb     %%xmm2,%%xmm0                   \n"
+      "pavgb     %%xmm3,%%xmm1                   \n"
+      "movdqa    %%xmm0,%%xmm2                   \n"
+      "shufps    $0x88,%%xmm1,%%xmm0             \n"
+      "shufps    $0xdd,%%xmm1,%%xmm2             \n"
+      "pavgb     %%xmm2,%%xmm0                   \n"
+      "movdqu    %%xmm0,(%2)                     \n"
+      "lea       0x10(%2),%2                     \n"
+      "sub       $0x4,%3                         \n"
+      "jg        1b                              \n"
+      : "+r"(src_argb),        // %0
+        "+r"(src_stepx_x4),    // %1
+        "+r"(dst_argb),        // %2
+        "+rm"(dst_width),      // %3
+        "=&r"(src_stepx_x12),  // %4
+        "+r"(row1)             // %5
+        ::"memory",
+        "cc", "xmm0", "xmm1", "xmm2", "xmm3");
+}
+
+void ScaleARGBCols_SSE2(uint8_t* dst_argb,
+                        const uint8_t* src_argb,
+                        int dst_width,
+                        int x,
+                        int dx) {
+  intptr_t x0, x1;
+  asm volatile(
+      "movd      %5,%%xmm2                       \n"
+      "movd      %6,%%xmm3                       \n"
+      "pshufd    $0x0,%%xmm2,%%xmm2              \n"
+      "pshufd    $0x11,%%xmm3,%%xmm0             \n"
+      "paddd     %%xmm0,%%xmm2                   \n"
+      "paddd     %%xmm3,%%xmm3                   \n"
+      "pshufd    $0x5,%%xmm3,%%xmm0              \n"
+      "paddd     %%xmm0,%%xmm2                   \n"
+      "paddd     %%xmm3,%%xmm3                   \n"
+      "pshufd    $0x0,%%xmm3,%%xmm3              \n"
+      "pextrw    $0x1,%%xmm2,%k0                 \n"
+      "pextrw    $0x3,%%xmm2,%k1                 \n"
+      "cmp       $0x0,%4                         \n"
+      "jl        99f                             \n"
+      "sub       $0x4,%4                         \n"
+      "jl        49f                             \n"
+
+      LABELALIGN
+      "40:                                       \n"
+      "movd      0x00(%3,%0,4),%%xmm0            \n"
+      "movd      0x00(%3,%1,4),%%xmm1            \n"
+      "pextrw    $0x5,%%xmm2,%k0                 \n"
+      "pextrw    $0x7,%%xmm2,%k1                 \n"
+      "paddd     %%xmm3,%%xmm2                   \n"
+      "punpckldq %%xmm1,%%xmm0                   \n"
+      "movd      0x00(%3,%0,4),%%xmm1            \n"
+      "movd      0x00(%3,%1,4),%%xmm4            \n"
+      "pextrw    $0x1,%%xmm2,%k0                 \n"
+      "pextrw    $0x3,%%xmm2,%k1                 \n"
+      "punpckldq %%xmm4,%%xmm1                   \n"
+      "punpcklqdq %%xmm1,%%xmm0                  \n"
+      "movdqu    %%xmm0,(%2)                     \n"
+      "lea       0x10(%2),%2                     \n"
+      "sub       $0x4,%4                         \n"
+      "jge       40b                             \n"
+
+      "49:                                       \n"
+      "test      $0x2,%4                         \n"
+      "je        29f                             \n"
+      "movd      0x00(%3,%0,4),%%xmm0            \n"
+      "movd      0x00(%3,%1,4),%%xmm1            \n"
+      "pextrw    $0x5,%%xmm2,%k0                 \n"
+      "punpckldq %%xmm1,%%xmm0                   \n"
+      "movq      %%xmm0,(%2)                     \n"
+      "lea       0x8(%2),%2                      \n"
+      "29:                                       \n"
+      "test      $0x1,%4                         \n"
+      "je        99f                             \n"
+      "movd      0x00(%3,%0,4),%%xmm0            \n"
+      "movd      %%xmm0,(%2)                     \n"
+      "99:                                       \n"
+      : "=&a"(x0),       // %0
+        "=&d"(x1),       // %1
+        "+r"(dst_argb),  // %2
+        "+r"(src_argb),  // %3
+        "+r"(dst_width)  // %4
+      : "rm"(x),         // %5
+        "rm"(dx)         // %6
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
+}
+
+// Reads 4 pixels, duplicates them and writes 8 pixels.
+// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
+void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb,
+                           const uint8_t* src_argb,
+                           int dst_width,
+                           int x,
+                           int dx) {
+  (void)x;
+  (void)dx;
+  asm volatile(
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%1),%%xmm0                     \n"
+      "lea       0x10(%1),%1                     \n"
+      "movdqa    %%xmm0,%%xmm1                   \n"
+      "punpckldq %%xmm0,%%xmm0                   \n"
+      "punpckhdq %%xmm1,%%xmm1                   \n"
+      "movdqu    %%xmm0,(%0)                     \n"
+      "movdqu    %%xmm1,0x10(%0)                 \n"
+      "lea       0x20(%0),%0                     \n"
+      "sub       $0x8,%2                         \n"
+      "jg        1b                              \n"
+
+      : "+r"(dst_argb),  // %0
+        "+r"(src_argb),  // %1
+        "+r"(dst_width)  // %2
+        ::"memory",
+        "cc", "xmm0", "xmm1");
+}
+
+// Shuffle table for arranging 2 pixels into pairs for pmaddubsw
+static const uvec8 kShuffleColARGB = {
+    0u, 4u,  1u, 5u,  2u,  6u,  3u,  7u,  // bbggrraa 1st pixel
+    8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u  // bbggrraa 2nd pixel
+};
+
+// Shuffle table for duplicating 2 fractions into 8 bytes each
+static const uvec8 kShuffleFractions = {
+    0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
+};
+
+// Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version
+void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb,
+                               const uint8_t* src_argb,
+                               int dst_width,
+                               int x,
+                               int dx) {
+  intptr_t x0, x1;
+  asm volatile(
+      "movdqa    %0,%%xmm4                       \n"
+      "movdqa    %1,%%xmm5                       \n"
+      :
+      : "m"(kShuffleColARGB),   // %0
+        "m"(kShuffleFractions)  // %1
+      );
+
+  asm volatile(
+      "movd      %5,%%xmm2                       \n"
+      "movd      %6,%%xmm3                       \n"
+      "pcmpeqb   %%xmm6,%%xmm6                   \n"
+      "psrlw     $0x9,%%xmm6                     \n"
+      "pextrw    $0x1,%%xmm2,%k3                 \n"
+      "sub       $0x2,%2                         \n"
+      "jl        29f                             \n"
+      "movdqa    %%xmm2,%%xmm0                   \n"
+      "paddd     %%xmm3,%%xmm0                   \n"
+      "punpckldq %%xmm0,%%xmm2                   \n"
+      "punpckldq %%xmm3,%%xmm3                   \n"
+      "paddd     %%xmm3,%%xmm3                   \n"
+      "pextrw    $0x3,%%xmm2,%k4                 \n"
+
+      LABELALIGN
+      "2:                                        \n"
+      "movdqa    %%xmm2,%%xmm1                   \n"
+      "paddd     %%xmm3,%%xmm2                   \n"
+      "movq      0x00(%1,%3,4),%%xmm0            \n"
+      "psrlw     $0x9,%%xmm1                     \n"
+      "movhps    0x00(%1,%4,4),%%xmm0            \n"
+      "pshufb    %%xmm5,%%xmm1                   \n"
+      "pshufb    %%xmm4,%%xmm0                   \n"
+      "pxor      %%xmm6,%%xmm1                   \n"
+      "pmaddubsw %%xmm1,%%xmm0                   \n"
+      "psrlw     $0x7,%%xmm0                     \n"
+      "pextrw    $0x1,%%xmm2,%k3                 \n"
+      "pextrw    $0x3,%%xmm2,%k4                 \n"
+      "packuswb  %%xmm0,%%xmm0                   \n"
+      "movq      %%xmm0,(%0)                     \n"
+      "lea       0x8(%0),%0                      \n"
+      "sub       $0x2,%2                         \n"
+      "jge       2b                              \n"
+
+      LABELALIGN
+      "29:                                       \n"
+      "add       $0x1,%2                         \n"
+      "jl        99f                             \n"
+      "psrlw     $0x9,%%xmm2                     \n"
+      "movq      0x00(%1,%3,4),%%xmm0            \n"
+      "pshufb    %%xmm5,%%xmm2                   \n"
+      "pshufb    %%xmm4,%%xmm0                   \n"
+      "pxor      %%xmm6,%%xmm2                   \n"
+      "pmaddubsw %%xmm2,%%xmm0                   \n"
+      "psrlw     $0x7,%%xmm0                     \n"
+      "packuswb  %%xmm0,%%xmm0                   \n"
+      "movd      %%xmm0,(%0)                     \n"
+
+      LABELALIGN "99:                            \n"  // clang-format error.
+
+      : "+r"(dst_argb),    // %0
+        "+r"(src_argb),    // %1
+        "+rm"(dst_width),  // %2
+        "=&r"(x0),         // %3
+        "=&r"(x1)          // %4
+      : "rm"(x),           // %5
+        "rm"(dx)           // %6
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+
+// Divide num by div and return as 16.16 fixed point result.
+int FixedDiv_X86(int num, int div) {
+  asm volatile(
+      "cdq                                       \n"
+      "shld      $0x10,%%eax,%%edx               \n"
+      "shl       $0x10,%%eax                     \n"
+      "idiv      %1                              \n"
+      "mov       %0, %%eax                       \n"
+      : "+a"(num)  // %0
+      : "c"(div)   // %1
+      : "memory", "cc", "edx");
+  return num;
+}
+
+// Divide num - 1 by div - 1 and return as 16.16 fixed point result.
+int FixedDiv1_X86(int num, int div) {
+  asm volatile(
+      "cdq                                       \n"
+      "shld      $0x10,%%eax,%%edx               \n"
+      "shl       $0x10,%%eax                     \n"
+      "sub       $0x10001,%%eax                  \n"
+      "sbb       $0x0,%%edx                      \n"
+      "sub       $0x1,%1                         \n"
+      "idiv      %1                              \n"
+      "mov       %0, %%eax                       \n"
+      : "+a"(num)  // %0
+      : "c"(div)   // %1
+      : "memory", "cc", "edx");
+  return num;
+}
+
+#endif  // defined(__x86_64__) || defined(__i386__)
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif