Adding upstream version 115.7.0esr.upstream/115.7.0esr upstream

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
author: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-07 19:33:14 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-07 19:33:14 +0000
commit: 36d22d82aa202bb199967e9512281e9a53db42c9 (patch)
tree: 105e8c98ddea1c1e4784a60a5a6410fa416be2de /media/libjpeg/simd
parent: Initial commit. (diff)
download: firefox-esr-upstream.tar.xz
firefox-esr-upstream.zip
166 files changed, 69956 insertions, 0 deletions
diff --git a/media/libjpeg/simd/arm/aarch32/jccolext-neon.c b/media/libjpeg/simd/arm/aarch32/jccolext-neon.c
new file mode 100644
index 0000000000..362102d2b2
--- /dev/null
+++ b/media/libjpeg/simd/arm/aarch32/jccolext-neon.c
@@ -0,0 +1,148 @@
+/*
+ * jccolext-neon.c - colorspace conversion (32-bit Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ * Copyright (C) 2020, D. R. Commander.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jccolor-neon.c */
+
+
+/* RGB -> YCbCr conversion is defined by the following equations:
+ *    Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
+ *    Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B  + 128
+ *    Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B  + 128
+ *
+ * Avoid floating point arithmetic by using shifted integer constants:
+ *    0.29899597 = 19595 * 2^-16
+ *    0.58700561 = 38470 * 2^-16
+ *    0.11399841 =  7471 * 2^-16
+ *    0.16874695 = 11059 * 2^-16
+ *    0.33125305 = 21709 * 2^-16
+ *    0.50000000 = 32768 * 2^-16
+ *    0.41868592 = 27439 * 2^-16
+ *    0.08131409 =  5329 * 2^-16
+ * These constants are defined in jccolor-neon.c
+ *
+ * We add the fixed-point equivalent of 0.5 to Cb and Cr, which effectively
+ * rounds up or down the result via integer truncation.
+ */
+
+void jsimd_rgb_ycc_convert_neon(JDIMENSION image_width, JSAMPARRAY input_buf,
+                                JSAMPIMAGE output_buf, JDIMENSION output_row,
+                                int num_rows)
+{
+  /* Pointer to RGB(X/A) input data */
+  JSAMPROW inptr;
+  /* Pointers to Y, Cb, and Cr output data */
+  JSAMPROW outptr0, outptr1, outptr2;
+  /* Allocate temporary buffer for final (image_width % 8) pixels in row. */
+  ALIGN(16) uint8_t tmp_buf[8 * RGB_PIXELSIZE];
+
+  /* Set up conversion constants. */
+#ifdef HAVE_VLD1_U16_X2
+  const uint16x4x2_t consts = vld1_u16_x2(jsimd_rgb_ycc_neon_consts);
+#else
+  /* GCC does not currently support the intrinsic vld1_<type>_x2(). */
+  const uint16x4_t consts1 = vld1_u16(jsimd_rgb_ycc_neon_consts);
+  const uint16x4_t consts2 = vld1_u16(jsimd_rgb_ycc_neon_consts + 4);
+  const uint16x4x2_t consts = { { consts1, consts2 } };
+#endif
+  const uint32x4_t scaled_128_5 = vdupq_n_u32((128 << 16) + 32767);
+
+  while (--num_rows >= 0) {
+    inptr = *input_buf++;
+    outptr0 = output_buf[0][output_row];
+    outptr1 = output_buf[1][output_row];
+    outptr2 = output_buf[2][output_row];
+    output_row++;
+
+    int cols_remaining = image_width;
+    for (; cols_remaining > 0; cols_remaining -= 8) {
+
+      /* To prevent buffer overread by the vector load instructions, the last
+       * (image_width % 8) columns of data are first memcopied to a temporary
+       * buffer large enough to accommodate the vector load.
+       */
+      if (cols_remaining < 8) {
+        memcpy(tmp_buf, inptr, cols_remaining * RGB_PIXELSIZE);
+        inptr = tmp_buf;
+      }
+
+#if RGB_PIXELSIZE == 4
+      uint8x8x4_t input_pixels = vld4_u8(inptr);
+#else
+      uint8x8x3_t input_pixels = vld3_u8(inptr);
+#endif
+      uint16x8_t r = vmovl_u8(input_pixels.val[RGB_RED]);
+      uint16x8_t g = vmovl_u8(input_pixels.val[RGB_GREEN]);
+      uint16x8_t b = vmovl_u8(input_pixels.val[RGB_BLUE]);
+
+      /* Compute Y = 0.29900 * R + 0.58700 * G + 0.11400 * B */
+      uint32x4_t y_low = vmull_lane_u16(vget_low_u16(r), consts.val[0], 0);
+      y_low = vmlal_lane_u16(y_low, vget_low_u16(g), consts.val[0], 1);
+      y_low = vmlal_lane_u16(y_low, vget_low_u16(b), consts.val[0], 2);
+      uint32x4_t y_high = vmull_lane_u16(vget_high_u16(r), consts.val[0], 0);
+      y_high = vmlal_lane_u16(y_high, vget_high_u16(g), consts.val[0], 1);
+      y_high = vmlal_lane_u16(y_high, vget_high_u16(b), consts.val[0], 2);
+
+      /* Compute Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B  + 128 */
+      uint32x4_t cb_low = scaled_128_5;
+      cb_low = vmlsl_lane_u16(cb_low, vget_low_u16(r), consts.val[0], 3);
+      cb_low = vmlsl_lane_u16(cb_low, vget_low_u16(g), consts.val[1], 0);
+      cb_low = vmlal_lane_u16(cb_low, vget_low_u16(b), consts.val[1], 1);
+      uint32x4_t cb_high = scaled_128_5;
+      cb_high = vmlsl_lane_u16(cb_high, vget_high_u16(r), consts.val[0], 3);
+      cb_high = vmlsl_lane_u16(cb_high, vget_high_u16(g), consts.val[1], 0);
+      cb_high = vmlal_lane_u16(cb_high, vget_high_u16(b), consts.val[1], 1);
+
+      /* Compute Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B  + 128 */
+      uint32x4_t cr_low = scaled_128_5;
+      cr_low = vmlal_lane_u16(cr_low, vget_low_u16(r), consts.val[1], 1);
+      cr_low = vmlsl_lane_u16(cr_low, vget_low_u16(g), consts.val[1], 2);
+      cr_low = vmlsl_lane_u16(cr_low, vget_low_u16(b), consts.val[1], 3);
+      uint32x4_t cr_high = scaled_128_5;
+      cr_high = vmlal_lane_u16(cr_high, vget_high_u16(r), consts.val[1], 1);
+      cr_high = vmlsl_lane_u16(cr_high, vget_high_u16(g), consts.val[1], 2);
+      cr_high = vmlsl_lane_u16(cr_high, vget_high_u16(b), consts.val[1], 3);
+
+      /* Descale Y values (rounding right shift) and narrow to 16-bit. */
+      uint16x8_t y_u16 = vcombine_u16(vrshrn_n_u32(y_low, 16),
+                                      vrshrn_n_u32(y_high, 16));
+      /* Descale Cb values (right shift) and narrow to 16-bit. */
+      uint16x8_t cb_u16 = vcombine_u16(vshrn_n_u32(cb_low, 16),
+                                       vshrn_n_u32(cb_high, 16));
+      /* Descale Cr values (right shift) and narrow to 16-bit. */
+      uint16x8_t cr_u16 = vcombine_u16(vshrn_n_u32(cr_low, 16),
+                                       vshrn_n_u32(cr_high, 16));
+      /* Narrow Y, Cb, and Cr values to 8-bit and store to memory.  Buffer
+       * overwrite is permitted up to the next multiple of ALIGN_SIZE bytes.
+       */
+      vst1_u8(outptr0, vmovn_u16(y_u16));
+      vst1_u8(outptr1, vmovn_u16(cb_u16));
+      vst1_u8(outptr2, vmovn_u16(cr_u16));
+
+      /* Increment pointers. */
+      inptr += (8 * RGB_PIXELSIZE);
+      outptr0 += 8;
+      outptr1 += 8;
+      outptr2 += 8;
+    }
+  }
+}
diff --git a/media/libjpeg/simd/arm/aarch32/jchuff-neon.c b/media/libjpeg/simd/arm/aarch32/jchuff-neon.c
new file mode 100644
index 0000000000..19d94f720d
--- /dev/null
+++ b/media/libjpeg/simd/arm/aarch32/jchuff-neon.c
@@ -0,0 +1,334 @@
+/*
+ * jchuff-neon.c - Huffman entropy encoding (32-bit Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ *
+ * NOTE: All referenced figures are from
+ * Recommendation ITU-T T.81 (1992) | ISO/IEC 10918-1:1994.
+ */
+
+#define JPEG_INTERNALS
+#include "../../../jinclude.h"
+#include "../../../jpeglib.h"
+#include "../../../jsimd.h"
+#include "../../../jdct.h"
+#include "../../../jsimddct.h"
+#include "../../jsimd.h"
+#include "../jchuff.h"
+#include "neon-compat.h"
+
+#include <limits.h>
+
+#include <arm_neon.h>
+
+
+JOCTET *jsimd_huff_encode_one_block_neon(void *state, JOCTET *buffer,
+                                         JCOEFPTR block, int last_dc_val,
+                                         c_derived_tbl *dctbl,
+                                         c_derived_tbl *actbl)
+{
+  uint8_t block_nbits[DCTSIZE2];
+  uint16_t block_diff[DCTSIZE2];
+
+  /* Load rows of coefficients from DCT block in zig-zag order. */
+
+  /* Compute DC coefficient difference value. (F.1.1.5.1) */
+  int16x8_t row0 = vdupq_n_s16(block[0] - last_dc_val);
+  row0 = vld1q_lane_s16(block +  1, row0, 1);
+  row0 = vld1q_lane_s16(block +  8, row0, 2);
+  row0 = vld1q_lane_s16(block + 16, row0, 3);
+  row0 = vld1q_lane_s16(block +  9, row0, 4);
+  row0 = vld1q_lane_s16(block +  2, row0, 5);
+  row0 = vld1q_lane_s16(block +  3, row0, 6);
+  row0 = vld1q_lane_s16(block + 10, row0, 7);
+
+  int16x8_t row1 = vld1q_dup_s16(block + 17);
+  row1 = vld1q_lane_s16(block + 24, row1, 1);
+  row1 = vld1q_lane_s16(block + 32, row1, 2);
+  row1 = vld1q_lane_s16(block + 25, row1, 3);
+  row1 = vld1q_lane_s16(block + 18, row1, 4);
+  row1 = vld1q_lane_s16(block + 11, row1, 5);
+  row1 = vld1q_lane_s16(block +  4, row1, 6);
+  row1 = vld1q_lane_s16(block +  5, row1, 7);
+
+  int16x8_t row2 = vld1q_dup_s16(block + 12);
+  row2 = vld1q_lane_s16(block + 19, row2, 1);
+  row2 = vld1q_lane_s16(block + 26, row2, 2);
+  row2 = vld1q_lane_s16(block + 33, row2, 3);
+  row2 = vld1q_lane_s16(block + 40, row2, 4);
+  row2 = vld1q_lane_s16(block + 48, row2, 5);
+  row2 = vld1q_lane_s16(block + 41, row2, 6);
+  row2 = vld1q_lane_s16(block + 34, row2, 7);
+
+  int16x8_t row3 = vld1q_dup_s16(block + 27);
+  row3 = vld1q_lane_s16(block + 20, row3, 1);
+  row3 = vld1q_lane_s16(block + 13, row3, 2);
+  row3 = vld1q_lane_s16(block +  6, row3, 3);
+  row3 = vld1q_lane_s16(block +  7, row3, 4);
+  row3 = vld1q_lane_s16(block + 14, row3, 5);
+  row3 = vld1q_lane_s16(block + 21, row3, 6);
+  row3 = vld1q_lane_s16(block + 28, row3, 7);
+
+  int16x8_t abs_row0 = vabsq_s16(row0);
+  int16x8_t abs_row1 = vabsq_s16(row1);
+  int16x8_t abs_row2 = vabsq_s16(row2);
+  int16x8_t abs_row3 = vabsq_s16(row3);
+
+  int16x8_t row0_lz = vclzq_s16(abs_row0);
+  int16x8_t row1_lz = vclzq_s16(abs_row1);
+  int16x8_t row2_lz = vclzq_s16(abs_row2);
+  int16x8_t row3_lz = vclzq_s16(abs_row3);
+
+  /* Compute number of bits required to represent each coefficient. */
+  uint8x8_t row0_nbits = vsub_u8(vdup_n_u8(16),
+                                 vmovn_u16(vreinterpretq_u16_s16(row0_lz)));
+  uint8x8_t row1_nbits = vsub_u8(vdup_n_u8(16),
+                                 vmovn_u16(vreinterpretq_u16_s16(row1_lz)));
+  uint8x8_t row2_nbits = vsub_u8(vdup_n_u8(16),
+                                 vmovn_u16(vreinterpretq_u16_s16(row2_lz)));
+  uint8x8_t row3_nbits = vsub_u8(vdup_n_u8(16),
+                                 vmovn_u16(vreinterpretq_u16_s16(row3_lz)));
+
+  vst1_u8(block_nbits + 0 * DCTSIZE, row0_nbits);
+  vst1_u8(block_nbits + 1 * DCTSIZE, row1_nbits);
+  vst1_u8(block_nbits + 2 * DCTSIZE, row2_nbits);
+  vst1_u8(block_nbits + 3 * DCTSIZE, row3_nbits);
+
+  uint16x8_t row0_mask =
+    vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row0, 15)),
+              vnegq_s16(row0_lz));
+  uint16x8_t row1_mask =
+    vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row1, 15)),
+              vnegq_s16(row1_lz));
+  uint16x8_t row2_mask =
+    vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row2, 15)),
+              vnegq_s16(row2_lz));
+  uint16x8_t row3_mask =
+    vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row3, 15)),
+              vnegq_s16(row3_lz));
+
+  uint16x8_t row0_diff = veorq_u16(vreinterpretq_u16_s16(abs_row0), row0_mask);
+  uint16x8_t row1_diff = veorq_u16(vreinterpretq_u16_s16(abs_row1), row1_mask);
+  uint16x8_t row2_diff = veorq_u16(vreinterpretq_u16_s16(abs_row2), row2_mask);
+  uint16x8_t row3_diff = veorq_u16(vreinterpretq_u16_s16(abs_row3), row3_mask);
+
+  /* Store diff values for rows 0, 1, 2, and 3. */
+  vst1q_u16(block_diff + 0 * DCTSIZE, row0_diff);
+  vst1q_u16(block_diff + 1 * DCTSIZE, row1_diff);
+  vst1q_u16(block_diff + 2 * DCTSIZE, row2_diff);
+  vst1q_u16(block_diff + 3 * DCTSIZE, row3_diff);
+
+  /* Load last four rows of coefficients from DCT block in zig-zag order. */
+  int16x8_t row4 = vld1q_dup_s16(block + 35);
+  row4 = vld1q_lane_s16(block + 42, row4, 1);
+  row4 = vld1q_lane_s16(block + 49, row4, 2);
+  row4 = vld1q_lane_s16(block + 56, row4, 3);
+  row4 = vld1q_lane_s16(block + 57, row4, 4);
+  row4 = vld1q_lane_s16(block + 50, row4, 5);
+  row4 = vld1q_lane_s16(block + 43, row4, 6);
+  row4 = vld1q_lane_s16(block + 36, row4, 7);
+
+  int16x8_t row5 = vld1q_dup_s16(block + 29);
+  row5 = vld1q_lane_s16(block + 22, row5, 1);
+  row5 = vld1q_lane_s16(block + 15, row5, 2);
+  row5 = vld1q_lane_s16(block + 23, row5, 3);
+  row5 = vld1q_lane_s16(block + 30, row5, 4);
+  row5 = vld1q_lane_s16(block + 37, row5, 5);
+  row5 = vld1q_lane_s16(block + 44, row5, 6);
+  row5 = vld1q_lane_s16(block + 51, row5, 7);
+
+  int16x8_t row6 = vld1q_dup_s16(block + 58);
+  row6 = vld1q_lane_s16(block + 59, row6, 1);
+  row6 = vld1q_lane_s16(block + 52, row6, 2);
+  row6 = vld1q_lane_s16(block + 45, row6, 3);
+  row6 = vld1q_lane_s16(block + 38, row6, 4);
+  row6 = vld1q_lane_s16(block + 31, row6, 5);
+  row6 = vld1q_lane_s16(block + 39, row6, 6);
+  row6 = vld1q_lane_s16(block + 46, row6, 7);
+
+  int16x8_t row7 = vld1q_dup_s16(block + 53);
+  row7 = vld1q_lane_s16(block + 60, row7, 1);
+  row7 = vld1q_lane_s16(block + 61, row7, 2);
+  row7 = vld1q_lane_s16(block + 54, row7, 3);
+  row7 = vld1q_lane_s16(block + 47, row7, 4);
+  row7 = vld1q_lane_s16(block + 55, row7, 5);
+  row7 = vld1q_lane_s16(block + 62, row7, 6);
+  row7 = vld1q_lane_s16(block + 63, row7, 7);
+
+  int16x8_t abs_row4 = vabsq_s16(row4);
+  int16x8_t abs_row5 = vabsq_s16(row5);
+  int16x8_t abs_row6 = vabsq_s16(row6);
+  int16x8_t abs_row7 = vabsq_s16(row7);
+
+  int16x8_t row4_lz = vclzq_s16(abs_row4);
+  int16x8_t row5_lz = vclzq_s16(abs_row5);
+  int16x8_t row6_lz = vclzq_s16(abs_row6);
+  int16x8_t row7_lz = vclzq_s16(abs_row7);
+
+  /* Compute number of bits required to represent each coefficient. */
+  uint8x8_t row4_nbits = vsub_u8(vdup_n_u8(16),
+                                 vmovn_u16(vreinterpretq_u16_s16(row4_lz)));
+  uint8x8_t row5_nbits = vsub_u8(vdup_n_u8(16),
+                                 vmovn_u16(vreinterpretq_u16_s16(row5_lz)));
+  uint8x8_t row6_nbits = vsub_u8(vdup_n_u8(16),
+                                 vmovn_u16(vreinterpretq_u16_s16(row6_lz)));
+  uint8x8_t row7_nbits = vsub_u8(vdup_n_u8(16),
+                                 vmovn_u16(vreinterpretq_u16_s16(row7_lz)));
+
+  vst1_u8(block_nbits + 4 * DCTSIZE, row4_nbits);
+  vst1_u8(block_nbits + 5 * DCTSIZE, row5_nbits);
+  vst1_u8(block_nbits + 6 * DCTSIZE, row6_nbits);
+  vst1_u8(block_nbits + 7 * DCTSIZE, row7_nbits);
+
+  uint16x8_t row4_mask =
+    vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row4, 15)),
+              vnegq_s16(row4_lz));
+  uint16x8_t row5_mask =
+    vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row5, 15)),
+              vnegq_s16(row5_lz));
+  uint16x8_t row6_mask =
+    vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row6, 15)),
+              vnegq_s16(row6_lz));
+  uint16x8_t row7_mask =
+    vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row7, 15)),
+              vnegq_s16(row7_lz));
+
+  uint16x8_t row4_diff = veorq_u16(vreinterpretq_u16_s16(abs_row4), row4_mask);
+  uint16x8_t row5_diff = veorq_u16(vreinterpretq_u16_s16(abs_row5), row5_mask);
+  uint16x8_t row6_diff = veorq_u16(vreinterpretq_u16_s16(abs_row6), row6_mask);
+  uint16x8_t row7_diff = veorq_u16(vreinterpretq_u16_s16(abs_row7), row7_mask);
+
+  /* Store diff values for rows 4, 5, 6, and 7. */
+  vst1q_u16(block_diff + 4 * DCTSIZE, row4_diff);
+  vst1q_u16(block_diff + 5 * DCTSIZE, row5_diff);
+  vst1q_u16(block_diff + 6 * DCTSIZE, row6_diff);
+  vst1q_u16(block_diff + 7 * DCTSIZE, row7_diff);
+
+  /* Construct bitmap to accelerate encoding of AC coefficients.  A set bit
+   * means that the corresponding coefficient != 0.
+   */
+  uint8x8_t row0_nbits_gt0 = vcgt_u8(row0_nbits, vdup_n_u8(0));
+  uint8x8_t row1_nbits_gt0 = vcgt_u8(row1_nbits, vdup_n_u8(0));
+  uint8x8_t row2_nbits_gt0 = vcgt_u8(row2_nbits, vdup_n_u8(0));
+  uint8x8_t row3_nbits_gt0 = vcgt_u8(row3_nbits, vdup_n_u8(0));
+  uint8x8_t row4_nbits_gt0 = vcgt_u8(row4_nbits, vdup_n_u8(0));
+  uint8x8_t row5_nbits_gt0 = vcgt_u8(row5_nbits, vdup_n_u8(0));
+  uint8x8_t row6_nbits_gt0 = vcgt_u8(row6_nbits, vdup_n_u8(0));
+  uint8x8_t row7_nbits_gt0 = vcgt_u8(row7_nbits, vdup_n_u8(0));
+
+  /* { 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 } */
+  const uint8x8_t bitmap_mask =
+    vreinterpret_u8_u64(vmov_n_u64(0x0102040810204080));
+
+  row0_nbits_gt0 = vand_u8(row0_nbits_gt0, bitmap_mask);
+  row1_nbits_gt0 = vand_u8(row1_nbits_gt0, bitmap_mask);
+  row2_nbits_gt0 = vand_u8(row2_nbits_gt0, bitmap_mask);
+  row3_nbits_gt0 = vand_u8(row3_nbits_gt0, bitmap_mask);
+  row4_nbits_gt0 = vand_u8(row4_nbits_gt0, bitmap_mask);
+  row5_nbits_gt0 = vand_u8(row5_nbits_gt0, bitmap_mask);
+  row6_nbits_gt0 = vand_u8(row6_nbits_gt0, bitmap_mask);
+  row7_nbits_gt0 = vand_u8(row7_nbits_gt0, bitmap_mask);
+
+  uint8x8_t bitmap_rows_10 = vpadd_u8(row1_nbits_gt0, row0_nbits_gt0);
+  uint8x8_t bitmap_rows_32 = vpadd_u8(row3_nbits_gt0, row2_nbits_gt0);
+  uint8x8_t bitmap_rows_54 = vpadd_u8(row5_nbits_gt0, row4_nbits_gt0);
+  uint8x8_t bitmap_rows_76 = vpadd_u8(row7_nbits_gt0, row6_nbits_gt0);
+  uint8x8_t bitmap_rows_3210 = vpadd_u8(bitmap_rows_32, bitmap_rows_10);
+  uint8x8_t bitmap_rows_7654 = vpadd_u8(bitmap_rows_76, bitmap_rows_54);
+  uint8x8_t bitmap = vpadd_u8(bitmap_rows_7654, bitmap_rows_3210);
+
+  /* Shift left to remove DC bit. */
+  bitmap = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(bitmap), 1));
+  /* Move bitmap to 32-bit scalar registers. */
+  uint32_t bitmap_1_32 = vget_lane_u32(vreinterpret_u32_u8(bitmap), 1);
+  uint32_t bitmap_33_63 = vget_lane_u32(vreinterpret_u32_u8(bitmap), 0);
+
+  /* Set up state and bit buffer for output bitstream. */
+  working_state *state_ptr = (working_state *)state;
+  int free_bits = state_ptr->cur.free_bits;
+  size_t put_buffer = state_ptr->cur.put_buffer;
+
+  /* Encode DC coefficient. */
+
+  unsigned int nbits = block_nbits[0];
+  /* Emit Huffman-coded symbol and additional diff bits. */
+  unsigned int diff = block_diff[0];
+  PUT_CODE(dctbl->ehufco[nbits], dctbl->ehufsi[nbits], diff)
+
+  /* Encode AC coefficients. */
+
+  unsigned int r = 0;  /* r = run length of zeros */
+  unsigned int i = 1;  /* i = number of coefficients encoded */
+  /* Code and size information for a run length of 16 zero coefficients */
+  const unsigned int code_0xf0 = actbl->ehufco[0xf0];
+  const unsigned int size_0xf0 = actbl->ehufsi[0xf0];
+
+  while (bitmap_1_32 != 0) {
+    r = BUILTIN_CLZ(bitmap_1_32);
+    i += r;
+    bitmap_1_32 <<= r;
+    nbits = block_nbits[i];
+    diff = block_diff[i];
+    while (r > 15) {
+      /* If run length > 15, emit special run-length-16 codes. */
+      PUT_BITS(code_0xf0, size_0xf0)
+      r -= 16;
+    }
+    /* Emit Huffman symbol for run length / number of bits. (F.1.2.2.1) */
+    unsigned int rs = (r << 4) + nbits;
+    PUT_CODE(actbl->ehufco[rs], actbl->ehufsi[rs], diff)
+    i++;
+    bitmap_1_32 <<= 1;
+  }
+
+  r = 33 - i;
+  i = 33;
+
+  while (bitmap_33_63 != 0) {
+    unsigned int leading_zeros = BUILTIN_CLZ(bitmap_33_63);
+    r += leading_zeros;
+    i += leading_zeros;
+    bitmap_33_63 <<= leading_zeros;
+    nbits = block_nbits[i];
+    diff = block_diff[i];
+    while (r > 15) {
+      /* If run length > 15, emit special run-length-16 codes. */
+      PUT_BITS(code_0xf0, size_0xf0)
+      r -= 16;
+    }
+    /* Emit Huffman symbol for run length / number of bits. (F.1.2.2.1) */
+    unsigned int rs = (r << 4) + nbits;
+    PUT_CODE(actbl->ehufco[rs], actbl->ehufsi[rs], diff)
+    r = 0;
+    i++;
+    bitmap_33_63 <<= 1;
+  }
+
+  /* If the last coefficient(s) were zero, emit an end-of-block (EOB) code.
+   * The value of RS for the EOB code is 0.
+   */
+  if (i != 64) {
+    PUT_BITS(actbl->ehufco[0], actbl->ehufsi[0])
+  }
+
+  state_ptr->cur.put_buffer = put_buffer;
+  state_ptr->cur.free_bits = free_bits;
+
+  return buffer;
+}
diff --git a/media/libjpeg/simd/arm/aarch32/jsimd.c b/media/libjpeg/simd/arm/aarch32/jsimd.c
new file mode 100644
index 0000000000..04d64526fb
--- /dev/null
+++ b/media/libjpeg/simd/arm/aarch32/jsimd.c
@@ -0,0 +1,976 @@
+/*
+ * jsimd_arm.c
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2011, Nokia Corporation and/or its subsidiary(-ies).
+ * Copyright (C) 2009-2011, 2013-2014, 2016, 2018, 2022, D. R. Commander.
+ * Copyright (C) 2015-2016, 2018, 2022, Matthieu Darbois.
+ * Copyright (C) 2019, Google LLC.
+ * Copyright (C) 2020, Arm Limited.
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library,
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * For conditions of distribution and use, see copyright notice in jsimdext.inc
+ *
+ * This file contains the interface between the "normal" portions
+ * of the library and the SIMD implementations when running on a
+ * 32-bit Arm architecture.
+ */
+
+#define JPEG_INTERNALS
+#include "../../../jinclude.h"
+#include "../../../jpeglib.h"
+#include "../../../jsimd.h"
+#include "../../../jdct.h"
+#include "../../../jsimddct.h"
+#include "../../jsimd.h"
+
+#include <ctype.h>
+
+static THREAD_LOCAL unsigned int simd_support = ~0;
+static THREAD_LOCAL unsigned int simd_huffman = 1;
+
+#if !defined(__ARM_NEON__) && (defined(__linux__) || defined(ANDROID) || defined(__ANDROID__))
+
+#define SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT  (1024 * 1024)
+
+LOCAL(int)
+check_feature(char *buffer, char *feature)
+{
+  char *p;
+
+  if (*feature == 0)
+    return 0;
+  if (strncmp(buffer, "Features", 8) != 0)
+    return 0;
+  buffer += 8;
+  while (isspace(*buffer))
+    buffer++;
+
+  /* Check if 'feature' is present in the buffer as a separate word */
+  while ((p = strstr(buffer, feature))) {
+    if (p > buffer && !isspace(*(p - 1))) {
+      buffer++;
+      continue;
+    }
+    p += strlen(feature);
+    if (*p != 0 && !isspace(*p)) {
+      buffer++;
+      continue;
+    }
+    return 1;
+  }
+  return 0;
+}
+
+LOCAL(int)
+parse_proc_cpuinfo(int bufsize)
+{
+  char *buffer = (char *)malloc(bufsize);
+  FILE *fd;
+
+  simd_support = 0;
+
+  if (!buffer)
+    return 0;
+
+  fd = fopen("/proc/cpuinfo", "r");
+  if (fd) {
+    while (fgets(buffer, bufsize, fd)) {
+      if (!strchr(buffer, '\n') && !feof(fd)) {
+        /* "impossible" happened - insufficient size of the buffer! */
+        fclose(fd);
+        free(buffer);
+        return 0;
+      }
+      if (check_feature(buffer, "neon"))
+        simd_support |= JSIMD_NEON;
+    }
+    fclose(fd);
+  }
+  free(buffer);
+  return 1;
+}
+
+#endif
+
+/*
+ * Check what SIMD accelerations are supported.
+ */
+LOCAL(void)
+init_simd(void)
+{
+#ifndef NO_GETENV
+  char env[2] = { 0 };
+#endif
+#if !defined(__ARM_NEON__) && (defined(__linux__) || defined(ANDROID) || defined(__ANDROID__))
+  int bufsize = 1024; /* an initial guess for the line buffer size limit */
+#endif
+
+  if (simd_support != ~0U)
+    return;
+
+  simd_support = 0;
+
+#if defined(__ARM_NEON__)
+  simd_support |= JSIMD_NEON;
+#elif defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
+  /* We still have a chance to use Neon regardless of globally used
+   * -mcpu/-mfpu options passed to gcc by performing runtime detection via
+   * /proc/cpuinfo parsing on linux/android */
+  while (!parse_proc_cpuinfo(bufsize)) {
+    bufsize *= 2;
+    if (bufsize > SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT)
+      break;
+  }
+#endif
+
+#ifndef NO_GETENV
+  /* Force different settings through environment variables */
+  if (!GETENV_S(env, 2, "JSIMD_FORCENEON") && !strcmp(env, "1"))
+    simd_support = JSIMD_NEON;
+  if (!GETENV_S(env, 2, "JSIMD_FORCENONE") && !strcmp(env, "1"))
+    simd_support = 0;
+  if (!GETENV_S(env, 2, "JSIMD_NOHUFFENC") && !strcmp(env, "1"))
+    simd_huffman = 0;
+#endif
+}
+
+GLOBAL(int)
+jsimd_can_rgb_ycc(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_rgb_gray(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb565(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                      JSAMPIMAGE output_buf, JDIMENSION output_row,
+                      int num_rows)
+{
+  void (*neonfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+  switch (cinfo->in_color_space) {
+  case JCS_EXT_RGB:
+    neonfct = jsimd_extrgb_ycc_convert_neon;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    neonfct = jsimd_extrgbx_ycc_convert_neon;
+    break;
+  case JCS_EXT_BGR:
+    neonfct = jsimd_extbgr_ycc_convert_neon;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    neonfct = jsimd_extbgrx_ycc_convert_neon;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    neonfct = jsimd_extxbgr_ycc_convert_neon;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    neonfct = jsimd_extxrgb_ycc_convert_neon;
+    break;
+  default:
+    neonfct = jsimd_extrgb_ycc_convert_neon;
+    break;
+  }
+
+  neonfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
+
+GLOBAL(void)
+jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                       JSAMPIMAGE output_buf, JDIMENSION output_row,
+                       int num_rows)
+{
+  void (*neonfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+  switch (cinfo->in_color_space) {
+  case JCS_EXT_RGB:
+    neonfct = jsimd_extrgb_gray_convert_neon;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    neonfct = jsimd_extrgbx_gray_convert_neon;
+    break;
+  case JCS_EXT_BGR:
+    neonfct = jsimd_extbgr_gray_convert_neon;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    neonfct = jsimd_extbgrx_gray_convert_neon;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    neonfct = jsimd_extxbgr_gray_convert_neon;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    neonfct = jsimd_extxrgb_gray_convert_neon;
+    break;
+  default:
+    neonfct = jsimd_extrgb_gray_convert_neon;
+    break;
+  }
+
+  neonfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                      JDIMENSION input_row, JSAMPARRAY output_buf,
+                      int num_rows)
+{
+  void (*neonfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
+
+  switch (cinfo->out_color_space) {
+  case JCS_EXT_RGB:
+    neonfct = jsimd_ycc_extrgb_convert_neon;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    neonfct = jsimd_ycc_extrgbx_convert_neon;
+    break;
+  case JCS_EXT_BGR:
+    neonfct = jsimd_ycc_extbgr_convert_neon;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    neonfct = jsimd_ycc_extbgrx_convert_neon;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    neonfct = jsimd_ycc_extxbgr_convert_neon;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    neonfct = jsimd_ycc_extxrgb_convert_neon;
+    break;
+  default:
+    neonfct = jsimd_ycc_extrgb_convert_neon;
+    break;
+  }
+
+  neonfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                         JDIMENSION input_row, JSAMPARRAY output_buf,
+                         int num_rows)
+{
+  jsimd_ycc_rgb565_convert_neon(cinfo->output_width, input_buf, input_row,
+                                output_buf, num_rows);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_downsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_downsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+                      JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  jsimd_h2v2_downsample_neon(cinfo->image_width, cinfo->max_v_samp_factor,
+                             compptr->v_samp_factor, compptr->width_in_blocks,
+                             input_data, output_data);
+}
+
+GLOBAL(void)
+jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+                      JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  jsimd_h2v1_downsample_neon(cinfo->image_width, cinfo->max_v_samp_factor,
+                             compptr->v_samp_factor, compptr->width_in_blocks,
+                             input_data, output_data);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                    JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  jsimd_h2v2_upsample_neon(cinfo->max_v_samp_factor, cinfo->output_width,
+                           input_data, output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                    JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  jsimd_h2v1_upsample_neon(cinfo->max_v_samp_factor, cinfo->output_width,
+                           input_data, output_data_ptr);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_fancy_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_fancy_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h1v2_fancy_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  jsimd_h2v2_fancy_upsample_neon(cinfo->max_v_samp_factor,
+                                 compptr->downsampled_width, input_data,
+                                 output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  jsimd_h2v1_fancy_upsample_neon(cinfo->max_v_samp_factor,
+                                 compptr->downsampled_width, input_data,
+                                 output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h1v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  jsimd_h1v2_fancy_upsample_neon(cinfo->max_v_samp_factor,
+                                 compptr->downsampled_width, input_data,
+                                 output_data_ptr);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_merged_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_merged_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                           JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+  void (*neonfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+
+  switch (cinfo->out_color_space) {
+    case JCS_EXT_RGB:
+      neonfct = jsimd_h2v2_extrgb_merged_upsample_neon;
+      break;
+    case JCS_EXT_RGBX:
+    case JCS_EXT_RGBA:
+      neonfct = jsimd_h2v2_extrgbx_merged_upsample_neon;
+      break;
+    case JCS_EXT_BGR:
+      neonfct = jsimd_h2v2_extbgr_merged_upsample_neon;
+      break;
+    case JCS_EXT_BGRX:
+    case JCS_EXT_BGRA:
+      neonfct = jsimd_h2v2_extbgrx_merged_upsample_neon;
+      break;
+    case JCS_EXT_XBGR:
+    case JCS_EXT_ABGR:
+      neonfct = jsimd_h2v2_extxbgr_merged_upsample_neon;
+      break;
+    case JCS_EXT_XRGB:
+    case JCS_EXT_ARGB:
+      neonfct = jsimd_h2v2_extxrgb_merged_upsample_neon;
+      break;
+    default:
+      neonfct = jsimd_h2v2_extrgb_merged_upsample_neon;
+      break;
+  }
+
+  neonfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+}
+
+GLOBAL(void)
+jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                           JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+  void (*neonfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+
+  switch (cinfo->out_color_space) {
+    case JCS_EXT_RGB:
+      neonfct = jsimd_h2v1_extrgb_merged_upsample_neon;
+      break;
+    case JCS_EXT_RGBX:
+    case JCS_EXT_RGBA:
+      neonfct = jsimd_h2v1_extrgbx_merged_upsample_neon;
+      break;
+    case JCS_EXT_BGR:
+      neonfct = jsimd_h2v1_extbgr_merged_upsample_neon;
+      break;
+    case JCS_EXT_BGRX:
+    case JCS_EXT_BGRA:
+      neonfct = jsimd_h2v1_extbgrx_merged_upsample_neon;
+      break;
+    case JCS_EXT_XBGR:
+    case JCS_EXT_ABGR:
+      neonfct = jsimd_h2v1_extxbgr_merged_upsample_neon;
+      break;
+    case JCS_EXT_XRGB:
+    case JCS_EXT_ARGB:
+      neonfct = jsimd_h2v1_extxrgb_merged_upsample_neon;
+      break;
+    default:
+      neonfct = jsimd_h2v1_extrgb_merged_upsample_neon;
+      break;
+  }
+
+  neonfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+}
+
+GLOBAL(int)
+jsimd_can_convsamp(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_convsamp_float(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_convsamp(JSAMPARRAY sample_data, JDIMENSION start_col,
+               DCTELEM *workspace)
+{
+  jsimd_convsamp_neon(sample_data, start_col, workspace);
+}
+
+GLOBAL(void)
+jsimd_convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col,
+                     FAST_FLOAT *workspace)
+{
+}
+
+GLOBAL(int)
+jsimd_can_fdct_islow(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_ifast(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_float(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_fdct_islow(DCTELEM *data)
+{
+  jsimd_fdct_islow_neon(data);
+}
+
+GLOBAL(void)
+jsimd_fdct_ifast(DCTELEM *data)
+{
+  jsimd_fdct_ifast_neon(data);
+}
+
+GLOBAL(void)
+jsimd_fdct_float(FAST_FLOAT *data)
+{
+}
+
+GLOBAL(int)
+jsimd_can_quantize(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_quantize_float(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace)
+{
+  jsimd_quantize_neon(coef_block, divisors, workspace);
+}
+
+GLOBAL(void)
+jsimd_quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors,
+                     FAST_FLOAT *workspace)
+{
+}
+
+GLOBAL(int)
+jsimd_can_idct_2x2(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_4x4(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_idct_2x2(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+               JCOEFPTR coef_block, JSAMPARRAY output_buf,
+               JDIMENSION output_col)
+{
+  jsimd_idct_2x2_neon(compptr->dct_table, coef_block, output_buf, output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_4x4(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+               JCOEFPTR coef_block, JSAMPARRAY output_buf,
+               JDIMENSION output_col)
+{
+  jsimd_idct_4x4_neon(compptr->dct_table, coef_block, output_buf, output_col);
+}
+
+GLOBAL(int)
+jsimd_can_idct_islow(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_ifast(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(IFAST_MULT_TYPE) != 2)
+    return 0;
+  if (IFAST_SCALE_BITS != 2)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_float(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+  jsimd_idct_islow_neon(compptr->dct_table, coef_block, output_buf,
+                        output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+  jsimd_idct_ifast_neon(compptr->dct_table, coef_block, output_buf,
+                        output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_float(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+}
+
+GLOBAL(int)
+jsimd_can_huff_encode_one_block(void)
+{
+  init_simd();
+
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_NEON && simd_huffman)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(JOCTET *)
+jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
+                            int last_dc_val, c_derived_tbl *dctbl,
+                            c_derived_tbl *actbl)
+{
+  return jsimd_huff_encode_one_block_neon(state, buffer, block, last_dc_val,
+                                          dctbl, actbl);
+}
+
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_first_prepare(void)
+{
+  init_simd();
+
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_encode_mcu_AC_first_prepare(const JCOEF *block,
+                                  const int *jpeg_natural_order_start, int Sl,
+                                  int Al, UJCOEF *values, size_t *zerobits)
+{
+  jsimd_encode_mcu_AC_first_prepare_neon(block, jpeg_natural_order_start,
+                                         Sl, Al, values, zerobits);
+}
+
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_refine_prepare(void)
+{
+  init_simd();
+
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
+                                   const int *jpeg_natural_order_start, int Sl,
+                                   int Al, UJCOEF *absvalues, size_t *bits)
+{
+  return jsimd_encode_mcu_AC_refine_prepare_neon(block,
+                                                 jpeg_natural_order_start, Sl,
+                                                 Al, absvalues, bits);
+}
diff --git a/media/libjpeg/simd/arm/aarch32/jsimd_neon.S b/media/libjpeg/simd/arm/aarch32/jsimd_neon.S
new file mode 100644
index 0000000000..7e1e2b1451
--- /dev/null
+++ b/media/libjpeg/simd/arm/aarch32/jsimd_neon.S
@@ -0,0 +1,1200 @@
+/*
+ * Armv7 Neon optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2009-2011, Nokia Corporation and/or its subsidiary(-ies).
+ *                          All Rights Reserved.
+ * Author:  Siarhei Siamashka <siarhei.siamashka@nokia.com>
+ * Copyright (C) 2014, Siarhei Siamashka.  All Rights Reserved.
+ * Copyright (C) 2014, Linaro Limited.  All Rights Reserved.
+ * Copyright (C) 2015, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2015-2016, 2018, Matthieu Darbois.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack, "", %progbits  /* mark stack as non-executable */
+#endif
+
+.text
+.fpu neon
+.arch armv7a
+.object_arch armv4
+.arm
+.syntax unified
+
+
+/*****************************************************************************/
+
+/* Supplementary macro for setting function attributes */
+.macro asm_function fname
+#ifdef __APPLE__
+    .private_extern _\fname
+    .globl _\fname
+_\fname:
+#else
+    .global \fname
+#ifdef __ELF__
+    .hidden \fname
+    .type \fname, %function
+#endif
+\fname:
+#endif
+.endm
+
+
+#define CENTERJSAMPLE  128
+
+/*****************************************************************************/
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients.
+ *
+ * GLOBAL(void)
+ * jsimd_idct_islow_neon(void *dct_table, JCOEFPTR coef_block,
+ *                       JSAMPARRAY output_buf, JDIMENSION output_col)
+ */
+
+#define FIX_0_298631336  (2446)
+#define FIX_0_390180644  (3196)
+#define FIX_0_541196100  (4433)
+#define FIX_0_765366865  (6270)
+#define FIX_0_899976223  (7373)
+#define FIX_1_175875602  (9633)
+#define FIX_1_501321110  (12299)
+#define FIX_1_847759065  (15137)
+#define FIX_1_961570560  (16069)
+#define FIX_2_053119869  (16819)
+#define FIX_2_562915447  (20995)
+#define FIX_3_072711026  (25172)
+
+#define FIX_1_175875602_MINUS_1_961570560  (FIX_1_175875602 - FIX_1_961570560)
+#define FIX_1_175875602_MINUS_0_390180644  (FIX_1_175875602 - FIX_0_390180644)
+#define FIX_0_541196100_MINUS_1_847759065  (FIX_0_541196100 - FIX_1_847759065)
+#define FIX_3_072711026_MINUS_2_562915447  (FIX_3_072711026 - FIX_2_562915447)
+#define FIX_0_298631336_MINUS_0_899976223  (FIX_0_298631336 - FIX_0_899976223)
+#define FIX_1_501321110_MINUS_0_899976223  (FIX_1_501321110 - FIX_0_899976223)
+#define FIX_2_053119869_MINUS_2_562915447  (FIX_2_053119869 - FIX_2_562915447)
+#define FIX_0_541196100_PLUS_0_765366865   (FIX_0_541196100 + FIX_0_765366865)
+
+/*
+ * Reference SIMD-friendly 1-D ISLOW iDCT C implementation.
+ * Uses some ideas from the comments in 'simd/jiss2int-64.asm'
+ */
+#define REF_1D_IDCT(xrow0, xrow1, xrow2, xrow3, xrow4, xrow5, xrow6, xrow7) { \
+  DCTELEM row0, row1, row2, row3, row4, row5, row6, row7; \
+  JLONG   q1, q2, q3, q4, q5, q6, q7; \
+  JLONG   tmp11_plus_tmp2, tmp11_minus_tmp2; \
+  \
+  /* 1-D iDCT input data */ \
+  row0 = xrow0; \
+  row1 = xrow1; \
+  row2 = xrow2; \
+  row3 = xrow3; \
+  row4 = xrow4; \
+  row5 = xrow5; \
+  row6 = xrow6; \
+  row7 = xrow7; \
+  \
+  q5 = row7 + row3; \
+  q4 = row5 + row1; \
+  q6 = MULTIPLY(q5, FIX_1_175875602_MINUS_1_961570560) + \
+       MULTIPLY(q4, FIX_1_175875602); \
+  q7 = MULTIPLY(q5, FIX_1_175875602) + \
+       MULTIPLY(q4, FIX_1_175875602_MINUS_0_390180644); \
+  q2 = MULTIPLY(row2, FIX_0_541196100) + \
+       MULTIPLY(row6, FIX_0_541196100_MINUS_1_847759065); \
+  q4 = q6; \
+  q3 = ((JLONG)row0 - (JLONG)row4) << 13; \
+  q6 += MULTIPLY(row5, -FIX_2_562915447) + \
+        MULTIPLY(row3, FIX_3_072711026_MINUS_2_562915447); \
+  /* now we can use q1 (reloadable constants have been used up) */ \
+  q1 = q3 + q2; \
+  q4 += MULTIPLY(row7, FIX_0_298631336_MINUS_0_899976223) + \
+        MULTIPLY(row1, -FIX_0_899976223); \
+  q5 = q7; \
+  q1 = q1 + q6; \
+  q7 += MULTIPLY(row7, -FIX_0_899976223) + \
+        MULTIPLY(row1, FIX_1_501321110_MINUS_0_899976223); \
+  \
+  /* (tmp11 + tmp2) has been calculated (out_row1 before descale) */ \
+  tmp11_plus_tmp2 = q1; \
+  row1 = 0; \
+  \
+  q1 = q1 - q6; \
+  q5 += MULTIPLY(row5, FIX_2_053119869_MINUS_2_562915447) + \
+        MULTIPLY(row3, -FIX_2_562915447); \
+  q1 = q1 - q6; \
+  q6 = MULTIPLY(row2, FIX_0_541196100_PLUS_0_765366865) + \
+       MULTIPLY(row6, FIX_0_541196100); \
+  q3 = q3 - q2; \
+  \
+  /* (tmp11 - tmp2) has been calculated (out_row6 before descale) */ \
+  tmp11_minus_tmp2 = q1; \
+  \
+  q1 = ((JLONG)row0 + (JLONG)row4) << 13; \
+  q2 = q1 + q6; \
+  q1 = q1 - q6; \
+  \
+  /* pick up the results */ \
+  tmp0  = q4; \
+  tmp1  = q5; \
+  tmp2  = (tmp11_plus_tmp2 - tmp11_minus_tmp2) / 2; \
+  tmp3  = q7; \
+  tmp10 = q2; \
+  tmp11 = (tmp11_plus_tmp2 + tmp11_minus_tmp2) / 2; \
+  tmp12 = q3; \
+  tmp13 = q1; \
+}
+
+#define XFIX_0_899976223                    d0[0]
+#define XFIX_0_541196100                    d0[1]
+#define XFIX_2_562915447                    d0[2]
+#define XFIX_0_298631336_MINUS_0_899976223  d0[3]
+#define XFIX_1_501321110_MINUS_0_899976223  d1[0]
+#define XFIX_2_053119869_MINUS_2_562915447  d1[1]
+#define XFIX_0_541196100_PLUS_0_765366865   d1[2]
+#define XFIX_1_175875602                    d1[3]
+#define XFIX_1_175875602_MINUS_0_390180644  d2[0]
+#define XFIX_0_541196100_MINUS_1_847759065  d2[1]
+#define XFIX_3_072711026_MINUS_2_562915447  d2[2]
+#define XFIX_1_175875602_MINUS_1_961570560  d2[3]
+
+.balign 16
+jsimd_idct_islow_neon_consts:
+  .short FIX_0_899976223                    /* d0[0] */
+  .short FIX_0_541196100                    /* d0[1] */
+  .short FIX_2_562915447                    /* d0[2] */
+  .short FIX_0_298631336_MINUS_0_899976223  /* d0[3] */
+  .short FIX_1_501321110_MINUS_0_899976223  /* d1[0] */
+  .short FIX_2_053119869_MINUS_2_562915447  /* d1[1] */
+  .short FIX_0_541196100_PLUS_0_765366865   /* d1[2] */
+  .short FIX_1_175875602                    /* d1[3] */
+  /* reloadable constants */
+  .short FIX_1_175875602_MINUS_0_390180644  /* d2[0] */
+  .short FIX_0_541196100_MINUS_1_847759065  /* d2[1] */
+  .short FIX_3_072711026_MINUS_2_562915447  /* d2[2] */
+  .short FIX_1_175875602_MINUS_1_961570560  /* d2[3] */
+
+asm_function jsimd_idct_islow_neon
+
+    DCT_TABLE       .req r0
+    COEF_BLOCK      .req r1
+    OUTPUT_BUF      .req r2
+    OUTPUT_COL      .req r3
+    TMP1            .req r0
+    TMP2            .req r1
+    TMP3            .req r2
+    TMP4            .req ip
+
+    ROW0L           .req d16
+    ROW0R           .req d17
+    ROW1L           .req d18
+    ROW1R           .req d19
+    ROW2L           .req d20
+    ROW2R           .req d21
+    ROW3L           .req d22
+    ROW3R           .req d23
+    ROW4L           .req d24
+    ROW4R           .req d25
+    ROW5L           .req d26
+    ROW5R           .req d27
+    ROW6L           .req d28
+    ROW6R           .req d29
+    ROW7L           .req d30
+    ROW7R           .req d31
+
+    /* Load and dequantize coefficients into Neon registers
+     * with the following allocation:
+     *       0 1 2 3 | 4 5 6 7
+     *      ---------+--------
+     *   0 | d16     | d17     ( q8  )
+     *   1 | d18     | d19     ( q9  )
+     *   2 | d20     | d21     ( q10 )
+     *   3 | d22     | d23     ( q11 )
+     *   4 | d24     | d25     ( q12 )
+     *   5 | d26     | d27     ( q13 )
+     *   6 | d28     | d29     ( q14 )
+     *   7 | d30     | d31     ( q15 )
+     */
+    adr             ip, jsimd_idct_islow_neon_consts
+    vld1.16         {d16, d17, d18, d19}, [COEF_BLOCK, :128]!
+    vld1.16         {d0, d1, d2, d3}, [DCT_TABLE, :128]!
+    vld1.16         {d20, d21, d22, d23}, [COEF_BLOCK, :128]!
+    vmul.s16        q8, q8, q0
+    vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!
+    vmul.s16        q9, q9, q1
+    vld1.16         {d24, d25, d26, d27}, [COEF_BLOCK, :128]!
+    vmul.s16        q10, q10, q2
+    vld1.16         {d0, d1, d2, d3}, [DCT_TABLE, :128]!
+    vmul.s16        q11, q11, q3
+    vld1.16         {d28, d29, d30, d31}, [COEF_BLOCK, :128]
+    vmul.s16        q12, q12, q0
+    vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!
+    vmul.s16        q14, q14, q2
+    vmul.s16        q13, q13, q1
+    vld1.16         {d0, d1, d2, d3}, [ip, :128]  /* load constants */
+    add             ip, ip, #16
+    vmul.s16        q15, q15, q3
+    vpush           {d8 - d15}                    /* save Neon registers */
+    /* 1-D IDCT, pass 1, left 4x8 half */
+    vadd.s16        d4, ROW7L, ROW3L
+    vadd.s16        d5, ROW5L, ROW1L
+    vmull.s16       q6, d4, XFIX_1_175875602_MINUS_1_961570560
+    vmlal.s16       q6, d5, XFIX_1_175875602
+    vmull.s16       q7, d4, XFIX_1_175875602
+      /* Check for the zero coefficients in the right 4x8 half */
+      push            {r4, r5}
+    vmlal.s16       q7, d5, XFIX_1_175875602_MINUS_0_390180644
+    vsubl.s16       q3, ROW0L, ROW4L
+      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 1 * 8))]
+    vmull.s16       q2, ROW2L, XFIX_0_541196100
+    vmlal.s16       q2, ROW6L, XFIX_0_541196100_MINUS_1_847759065
+      orr             r0, r4, r5
+    vmov            q4, q6
+    vmlsl.s16       q6, ROW5L, XFIX_2_562915447
+      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 2 * 8))]
+    vmlal.s16       q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
+    vshl.s32        q3, q3, #13
+      orr             r0, r0, r4
+    vmlsl.s16       q4, ROW1L, XFIX_0_899976223
+      orr             r0, r0, r5
+    vadd.s32        q1, q3, q2
+      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 3 * 8))]
+    vmov            q5, q7
+    vadd.s32        q1, q1, q6
+      orr             r0, r0, r4
+    vmlsl.s16       q7, ROW7L, XFIX_0_899976223
+      orr             r0, r0, r5
+    vmlal.s16       q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
+    vrshrn.s32      ROW1L, q1, #11
+      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 4 * 8))]
+    vsub.s32        q1, q1, q6
+    vmlal.s16       q5, ROW5L, XFIX_2_053119869_MINUS_2_562915447
+      orr             r0, r0, r4
+    vmlsl.s16       q5, ROW3L, XFIX_2_562915447
+      orr             r0, r0, r5
+    vsub.s32        q1, q1, q6
+    vmull.s16       q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
+      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 5 * 8))]
+    vmlal.s16       q6, ROW6L, XFIX_0_541196100
+    vsub.s32        q3, q3, q2
+      orr             r0, r0, r4
+    vrshrn.s32      ROW6L, q1, #11
+      orr             r0, r0, r5
+    vadd.s32        q1, q3, q5
+      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 6 * 8))]
+    vsub.s32        q3, q3, q5
+    vaddl.s16       q5, ROW0L, ROW4L
+      orr             r0, r0, r4
+    vrshrn.s32      ROW2L, q1, #11
+      orr             r0, r0, r5
+    vrshrn.s32      ROW5L, q3, #11
+      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 7 * 8))]
+    vshl.s32        q5, q5, #13
+    vmlal.s16       q4, ROW7L, XFIX_0_298631336_MINUS_0_899976223
+      orr             r0, r0, r4
+    vadd.s32        q2, q5, q6
+      orrs            r0, r0, r5
+    vsub.s32        q1, q5, q6
+    vadd.s32        q6, q2, q7
+      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 0 * 8))]
+    vsub.s32        q2, q2, q7
+    vadd.s32        q5, q1, q4
+      orr             r0, r4, r5
+    vsub.s32        q3, q1, q4
+      pop             {r4, r5}
+    vrshrn.s32      ROW7L, q2, #11
+    vrshrn.s32      ROW3L, q5, #11
+    vrshrn.s32      ROW0L, q6, #11
+    vrshrn.s32      ROW4L, q3, #11
+
+      beq             3f  /* Go to do some special handling for the sparse
+                             right 4x8 half */
+
+    /* 1-D IDCT, pass 1, right 4x8 half */
+    vld1.s16        {d2}, [ip, :64]  /* reload constants */
+    vadd.s16        d10, ROW7R, ROW3R
+    vadd.s16        d8, ROW5R, ROW1R
+      /* Transpose left 4x8 half */
+      vtrn.16         ROW6L, ROW7L
+    vmull.s16       q6, d10, XFIX_1_175875602_MINUS_1_961570560
+    vmlal.s16       q6, d8, XFIX_1_175875602
+      vtrn.16         ROW2L, ROW3L
+    vmull.s16       q7, d10, XFIX_1_175875602
+    vmlal.s16       q7, d8, XFIX_1_175875602_MINUS_0_390180644
+      vtrn.16         ROW0L, ROW1L
+    vsubl.s16       q3, ROW0R, ROW4R
+    vmull.s16       q2, ROW2R, XFIX_0_541196100
+    vmlal.s16       q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065
+      vtrn.16         ROW4L, ROW5L
+    vmov            q4, q6
+    vmlsl.s16       q6, ROW5R, XFIX_2_562915447
+    vmlal.s16       q6, ROW3R, XFIX_3_072711026_MINUS_2_562915447
+      vtrn.32         ROW1L, ROW3L
+    vshl.s32        q3, q3, #13
+    vmlsl.s16       q4, ROW1R, XFIX_0_899976223
+      vtrn.32         ROW4L, ROW6L
+    vadd.s32        q1, q3, q2
+    vmov            q5, q7
+    vadd.s32        q1, q1, q6
+      vtrn.32         ROW0L, ROW2L
+    vmlsl.s16       q7, ROW7R, XFIX_0_899976223
+    vmlal.s16       q7, ROW1R, XFIX_1_501321110_MINUS_0_899976223
+    vrshrn.s32      ROW1R, q1, #11
+      vtrn.32         ROW5L, ROW7L
+    vsub.s32        q1, q1, q6
+    vmlal.s16       q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447
+    vmlsl.s16       q5, ROW3R, XFIX_2_562915447
+    vsub.s32        q1, q1, q6
+    vmull.s16       q6, ROW2R, XFIX_0_541196100_PLUS_0_765366865
+    vmlal.s16       q6, ROW6R, XFIX_0_541196100
+    vsub.s32        q3, q3, q2
+    vrshrn.s32      ROW6R, q1, #11
+    vadd.s32        q1, q3, q5
+    vsub.s32        q3, q3, q5
+    vaddl.s16       q5, ROW0R, ROW4R
+    vrshrn.s32      ROW2R, q1, #11
+    vrshrn.s32      ROW5R, q3, #11
+    vshl.s32        q5, q5, #13
+    vmlal.s16       q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223
+    vadd.s32        q2, q5, q6
+    vsub.s32        q1, q5, q6
+    vadd.s32        q6, q2, q7
+    vsub.s32        q2, q2, q7
+    vadd.s32        q5, q1, q4
+    vsub.s32        q3, q1, q4
+    vrshrn.s32      ROW7R, q2, #11
+    vrshrn.s32      ROW3R, q5, #11
+    vrshrn.s32      ROW0R, q6, #11
+    vrshrn.s32      ROW4R, q3, #11
+    /* Transpose right 4x8 half */
+    vtrn.16         ROW6R, ROW7R
+    vtrn.16         ROW2R, ROW3R
+    vtrn.16         ROW0R, ROW1R
+    vtrn.16         ROW4R, ROW5R
+    vtrn.32         ROW1R, ROW3R
+    vtrn.32         ROW4R, ROW6R
+    vtrn.32         ROW0R, ROW2R
+    vtrn.32         ROW5R, ROW7R
+
+1:  /* 1-D IDCT, pass 2 (normal variant), left 4x8 half */
+    vld1.s16        {d2}, [ip, :64]               /* reload constants */
+    vmull.s16       q6, ROW1R, XFIX_1_175875602   /* ROW5L <-> ROW1R */
+    vmlal.s16       q6, ROW1L, XFIX_1_175875602
+    vmlal.s16       q6, ROW3R, XFIX_1_175875602_MINUS_1_961570560  /* ROW7L <-> ROW3R */
+    vmlal.s16       q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560
+    vmull.s16       q7, ROW3R, XFIX_1_175875602   /* ROW7L <-> ROW3R */
+    vmlal.s16       q7, ROW3L, XFIX_1_175875602
+    vmlal.s16       q7, ROW1R, XFIX_1_175875602_MINUS_0_390180644  /* ROW5L <-> ROW1R */
+    vmlal.s16       q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644
+    vsubl.s16       q3, ROW0L, ROW0R              /* ROW4L <-> ROW0R */
+    vmull.s16       q2, ROW2L, XFIX_0_541196100
+    vmlal.s16       q2, ROW2R, XFIX_0_541196100_MINUS_1_847759065  /* ROW6L <-> ROW2R */
+    vmov            q4, q6
+    vmlsl.s16       q6, ROW1R, XFIX_2_562915447   /* ROW5L <-> ROW1R */
+    vmlal.s16       q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
+    vshl.s32        q3, q3, #13
+    vmlsl.s16       q4, ROW1L, XFIX_0_899976223
+    vadd.s32        q1, q3, q2
+    vmov            q5, q7
+    vadd.s32        q1, q1, q6
+    vmlsl.s16       q7, ROW3R, XFIX_0_899976223   /* ROW7L <-> ROW3R */
+    vmlal.s16       q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
+    vshrn.s32       ROW1L, q1, #16
+    vsub.s32        q1, q1, q6
+    vmlal.s16       q5, ROW1R, XFIX_2_053119869_MINUS_2_562915447  /* ROW5L <-> ROW1R */
+    vmlsl.s16       q5, ROW3L, XFIX_2_562915447
+    vsub.s32        q1, q1, q6
+    vmull.s16       q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
+    vmlal.s16       q6, ROW2R, XFIX_0_541196100   /* ROW6L <-> ROW2R */
+    vsub.s32        q3, q3, q2
+    vshrn.s32       ROW2R, q1, #16                /* ROW6L <-> ROW2R */
+    vadd.s32        q1, q3, q5
+    vsub.s32        q3, q3, q5
+    vaddl.s16       q5, ROW0L, ROW0R              /* ROW4L <-> ROW0R */
+    vshrn.s32       ROW2L, q1, #16
+    vshrn.s32       ROW1R, q3, #16                /* ROW5L <-> ROW1R */
+    vshl.s32        q5, q5, #13
+    vmlal.s16       q4, ROW3R, XFIX_0_298631336_MINUS_0_899976223  /* ROW7L <-> ROW3R */
+    vadd.s32        q2, q5, q6
+    vsub.s32        q1, q5, q6
+    vadd.s32        q6, q2, q7
+    vsub.s32        q2, q2, q7
+    vadd.s32        q5, q1, q4
+    vsub.s32        q3, q1, q4
+    vshrn.s32       ROW3R, q2, #16                /* ROW7L <-> ROW3R */
+    vshrn.s32       ROW3L, q5, #16
+    vshrn.s32       ROW0L, q6, #16
+    vshrn.s32       ROW0R, q3, #16                /* ROW4L <-> ROW0R */
+    /* 1-D IDCT, pass 2, right 4x8 half */
+    vld1.s16        {d2}, [ip, :64]               /* reload constants */
+    vmull.s16       q6, ROW5R, XFIX_1_175875602
+    vmlal.s16       q6, ROW5L, XFIX_1_175875602   /* ROW5L <-> ROW1R */
+    vmlal.s16       q6, ROW7R, XFIX_1_175875602_MINUS_1_961570560
+    vmlal.s16       q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560  /* ROW7L <-> ROW3R */
+    vmull.s16       q7, ROW7R, XFIX_1_175875602
+    vmlal.s16       q7, ROW7L, XFIX_1_175875602   /* ROW7L <-> ROW3R */
+    vmlal.s16       q7, ROW5R, XFIX_1_175875602_MINUS_0_390180644
+    vmlal.s16       q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644  /* ROW5L <-> ROW1R */
+    vsubl.s16       q3, ROW4L, ROW4R              /* ROW4L <-> ROW0R */
+    vmull.s16       q2, ROW6L, XFIX_0_541196100   /* ROW6L <-> ROW2R */
+    vmlal.s16       q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065
+    vmov            q4, q6
+    vmlsl.s16       q6, ROW5R, XFIX_2_562915447
+    vmlal.s16       q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447  /* ROW7L <-> ROW3R */
+    vshl.s32        q3, q3, #13
+    vmlsl.s16       q4, ROW5L, XFIX_0_899976223   /* ROW5L <-> ROW1R */
+    vadd.s32        q1, q3, q2
+    vmov            q5, q7
+    vadd.s32        q1, q1, q6
+    vmlsl.s16       q7, ROW7R, XFIX_0_899976223
+    vmlal.s16       q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223  /* ROW5L <-> ROW1R */
+    vshrn.s32       ROW5L, q1, #16                /* ROW5L <-> ROW1R */
+    vsub.s32        q1, q1, q6
+    vmlal.s16       q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447
+    vmlsl.s16       q5, ROW7L, XFIX_2_562915447   /* ROW7L <-> ROW3R */
+    vsub.s32        q1, q1, q6
+    vmull.s16       q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865  /* ROW6L <-> ROW2R */
+    vmlal.s16       q6, ROW6R, XFIX_0_541196100
+    vsub.s32        q3, q3, q2
+    vshrn.s32       ROW6R, q1, #16
+    vadd.s32        q1, q3, q5
+    vsub.s32        q3, q3, q5
+    vaddl.s16       q5, ROW4L, ROW4R              /* ROW4L <-> ROW0R */
+    vshrn.s32       ROW6L, q1, #16                /* ROW6L <-> ROW2R */
+    vshrn.s32       ROW5R, q3, #16
+    vshl.s32        q5, q5, #13
+    vmlal.s16       q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223
+    vadd.s32        q2, q5, q6
+    vsub.s32        q1, q5, q6
+    vadd.s32        q6, q2, q7
+    vsub.s32        q2, q2, q7
+    vadd.s32        q5, q1, q4
+    vsub.s32        q3, q1, q4
+    vshrn.s32       ROW7R, q2, #16
+    vshrn.s32       ROW7L, q5, #16                /* ROW7L <-> ROW3R */
+    vshrn.s32       ROW4L, q6, #16                /* ROW4L <-> ROW0R */
+    vshrn.s32       ROW4R, q3, #16
+
+2:  /* Descale to 8-bit and range limit */
+    vqrshrn.s16     d16, q8, #2
+    vqrshrn.s16     d17, q9, #2
+    vqrshrn.s16     d18, q10, #2
+    vqrshrn.s16     d19, q11, #2
+    vpop            {d8 - d15}                    /* restore Neon registers */
+    vqrshrn.s16     d20, q12, #2
+      /* Transpose the final 8-bit samples and do signed->unsigned conversion */
+      vtrn.16         q8, q9
+    vqrshrn.s16     d21, q13, #2
+    vqrshrn.s16     d22, q14, #2
+      vmov.u8         q0, #(CENTERJSAMPLE)
+    vqrshrn.s16     d23, q15, #2
+      vtrn.8          d16, d17
+      vtrn.8          d18, d19
+      vadd.u8         q8, q8, q0
+      vadd.u8         q9, q9, q0
+      vtrn.16         q10, q11
+        /* Store results to the output buffer */
+        ldmia           OUTPUT_BUF!, {TMP1, TMP2}
+        add             TMP1, TMP1, OUTPUT_COL
+        add             TMP2, TMP2, OUTPUT_COL
+        vst1.8          {d16}, [TMP1]
+      vtrn.8          d20, d21
+        vst1.8          {d17}, [TMP2]
+        ldmia           OUTPUT_BUF!, {TMP1, TMP2}
+        add             TMP1, TMP1, OUTPUT_COL
+        add             TMP2, TMP2, OUTPUT_COL
+        vst1.8          {d18}, [TMP1]
+      vadd.u8         q10, q10, q0
+        vst1.8          {d19}, [TMP2]
+        ldmia           OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
+        add             TMP1, TMP1, OUTPUT_COL
+        add             TMP2, TMP2, OUTPUT_COL
+        add             TMP3, TMP3, OUTPUT_COL
+        add             TMP4, TMP4, OUTPUT_COL
+      vtrn.8          d22, d23
+        vst1.8          {d20}, [TMP1]
+      vadd.u8         q11, q11, q0
+        vst1.8          {d21}, [TMP2]
+        vst1.8          {d22}, [TMP3]
+        vst1.8          {d23}, [TMP4]
+    bx              lr
+
+3:  /* Left 4x8 half is done, right 4x8 half contains mostly zeros */
+
+    /* Transpose left 4x8 half */
+    vtrn.16         ROW6L, ROW7L
+    vtrn.16         ROW2L, ROW3L
+    vtrn.16         ROW0L, ROW1L
+    vtrn.16         ROW4L, ROW5L
+    vshl.s16        ROW0R, ROW0R, #2  /* PASS1_BITS */
+    vtrn.32         ROW1L, ROW3L
+    vtrn.32         ROW4L, ROW6L
+    vtrn.32         ROW0L, ROW2L
+    vtrn.32         ROW5L, ROW7L
+
+    cmp             r0, #0
+    beq             4f  /* Right 4x8 half has all zeros, go to 'sparse' second
+                           pass */
+
+    /* Only row 0 is non-zero for the right 4x8 half  */
+    vdup.s16        ROW1R, ROW0R[1]
+    vdup.s16        ROW2R, ROW0R[2]
+    vdup.s16        ROW3R, ROW0R[3]
+    vdup.s16        ROW4R, ROW0R[0]
+    vdup.s16        ROW5R, ROW0R[1]
+    vdup.s16        ROW6R, ROW0R[2]
+    vdup.s16        ROW7R, ROW0R[3]
+    vdup.s16        ROW0R, ROW0R[0]
+    b               1b  /* Go to 'normal' second pass */
+
+4:  /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), left 4x8 half */
+    vld1.s16        {d2}, [ip, :64]               /* reload constants */
+    vmull.s16       q6, ROW1L, XFIX_1_175875602
+    vmlal.s16       q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560
+    vmull.s16       q7, ROW3L, XFIX_1_175875602
+    vmlal.s16       q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644
+    vmull.s16       q2, ROW2L, XFIX_0_541196100
+    vshll.s16       q3, ROW0L, #13
+    vmov            q4, q6
+    vmlal.s16       q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
+    vmlsl.s16       q4, ROW1L, XFIX_0_899976223
+    vadd.s32        q1, q3, q2
+    vmov            q5, q7
+    vmlal.s16       q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
+    vadd.s32        q1, q1, q6
+    vadd.s32        q6, q6, q6
+    vmlsl.s16       q5, ROW3L, XFIX_2_562915447
+    vshrn.s32       ROW1L, q1, #16
+    vsub.s32        q1, q1, q6
+    vmull.s16       q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
+    vsub.s32        q3, q3, q2
+    vshrn.s32       ROW2R, q1, #16                /* ROW6L <-> ROW2R */
+    vadd.s32        q1, q3, q5
+    vsub.s32        q3, q3, q5
+    vshll.s16       q5, ROW0L, #13
+    vshrn.s32       ROW2L, q1, #16
+    vshrn.s32       ROW1R, q3, #16                /* ROW5L <-> ROW1R */
+    vadd.s32        q2, q5, q6
+    vsub.s32        q1, q5, q6
+    vadd.s32        q6, q2, q7
+    vsub.s32        q2, q2, q7
+    vadd.s32        q5, q1, q4
+    vsub.s32        q3, q1, q4
+    vshrn.s32       ROW3R, q2, #16                /* ROW7L <-> ROW3R */
+    vshrn.s32       ROW3L, q5, #16
+    vshrn.s32       ROW0L, q6, #16
+    vshrn.s32       ROW0R, q3, #16                /* ROW4L <-> ROW0R */
+    /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), right 4x8 half */
+    vld1.s16        {d2}, [ip, :64]               /* reload constants */
+    vmull.s16       q6, ROW5L, XFIX_1_175875602
+    vmlal.s16       q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560
+    vmull.s16       q7, ROW7L, XFIX_1_175875602
+    vmlal.s16       q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644
+    vmull.s16       q2, ROW6L, XFIX_0_541196100
+    vshll.s16       q3, ROW4L, #13
+    vmov            q4, q6
+    vmlal.s16       q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447
+    vmlsl.s16       q4, ROW5L, XFIX_0_899976223
+    vadd.s32        q1, q3, q2
+    vmov            q5, q7
+    vmlal.s16       q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223
+    vadd.s32        q1, q1, q6
+    vadd.s32        q6, q6, q6
+    vmlsl.s16       q5, ROW7L, XFIX_2_562915447
+    vshrn.s32       ROW5L, q1, #16                /* ROW5L <-> ROW1R */
+    vsub.s32        q1, q1, q6
+    vmull.s16       q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865
+    vsub.s32        q3, q3, q2
+    vshrn.s32       ROW6R, q1, #16
+    vadd.s32        q1, q3, q5
+    vsub.s32        q3, q3, q5
+    vshll.s16       q5, ROW4L, #13
+    vshrn.s32       ROW6L, q1, #16                /* ROW6L <-> ROW2R */
+    vshrn.s32       ROW5R, q3, #16
+    vadd.s32        q2, q5, q6
+    vsub.s32        q1, q5, q6
+    vadd.s32        q6, q2, q7
+    vsub.s32        q2, q2, q7
+    vadd.s32        q5, q1, q4
+    vsub.s32        q3, q1, q4
+    vshrn.s32       ROW7R, q2, #16
+    vshrn.s32       ROW7L, q5, #16                /* ROW7L <-> ROW3R */
+    vshrn.s32       ROW4L, q6, #16                /* ROW4L <-> ROW0R */
+    vshrn.s32       ROW4R, q3, #16
+    b               2b                            /* Go to epilogue */
+
+    .unreq          DCT_TABLE
+    .unreq          COEF_BLOCK
+    .unreq          OUTPUT_BUF
+    .unreq          OUTPUT_COL
+    .unreq          TMP1
+    .unreq          TMP2
+    .unreq          TMP3
+    .unreq          TMP4
+
+    .unreq          ROW0L
+    .unreq          ROW0R
+    .unreq          ROW1L
+    .unreq          ROW1R
+    .unreq          ROW2L
+    .unreq          ROW2R
+    .unreq          ROW3L
+    .unreq          ROW3R
+    .unreq          ROW4L
+    .unreq          ROW4R
+    .unreq          ROW5L
+    .unreq          ROW5R
+    .unreq          ROW6L
+    .unreq          ROW6R
+    .unreq          ROW7L
+    .unreq          ROW7R
+
+
+/*****************************************************************************/
+
+/*
+ * jsimd_idct_ifast_neon
+ *
+ * This function contains a fast, not so accurate integer implementation of
+ * the inverse DCT (Discrete Cosine Transform). It uses the same calculations
+ * and produces exactly the same output as IJG's original 'jpeg_idct_ifast'
+ * function from jidctfst.c
+ *
+ * Normally 1-D AAN DCT needs 5 multiplications and 29 additions.
+ * But in Arm Neon case some extra additions are required because VQDMULH
+ * instruction can't handle the constants larger than 1. So the expressions
+ * like "x * 1.082392200" have to be converted to "x * 0.082392200 + x",
+ * which introduces an extra addition. Overall, there are 6 extra additions
+ * per 1-D IDCT pass, totalling to 5 VQDMULH and 35 VADD/VSUB instructions.
+ */
+
+#define XFIX_1_082392200  d0[0]
+#define XFIX_1_414213562  d0[1]
+#define XFIX_1_847759065  d0[2]
+#define XFIX_2_613125930  d0[3]
+
+.balign 16
+jsimd_idct_ifast_neon_consts:
+  .short (277 * 128 - 256 * 128)  /* XFIX_1_082392200 */
+  .short (362 * 128 - 256 * 128)  /* XFIX_1_414213562 */
+  .short (473 * 128 - 256 * 128)  /* XFIX_1_847759065 */
+  .short (669 * 128 - 512 * 128)  /* XFIX_2_613125930 */
+
+asm_function jsimd_idct_ifast_neon
+
+    DCT_TABLE       .req r0
+    COEF_BLOCK      .req r1
+    OUTPUT_BUF      .req r2
+    OUTPUT_COL      .req r3
+    TMP1            .req r0
+    TMP2            .req r1
+    TMP3            .req r2
+    TMP4            .req ip
+
+    /* Load and dequantize coefficients into Neon registers
+     * with the following allocation:
+     *       0 1 2 3 | 4 5 6 7
+     *      ---------+--------
+     *   0 | d16     | d17     ( q8  )
+     *   1 | d18     | d19     ( q9  )
+     *   2 | d20     | d21     ( q10 )
+     *   3 | d22     | d23     ( q11 )
+     *   4 | d24     | d25     ( q12 )
+     *   5 | d26     | d27     ( q13 )
+     *   6 | d28     | d29     ( q14 )
+     *   7 | d30     | d31     ( q15 )
+     */
+    adr             ip, jsimd_idct_ifast_neon_consts
+    vld1.16         {d16, d17, d18, d19}, [COEF_BLOCK, :128]!
+    vld1.16         {d0, d1, d2, d3}, [DCT_TABLE, :128]!
+    vld1.16         {d20, d21, d22, d23}, [COEF_BLOCK, :128]!
+    vmul.s16        q8, q8, q0
+    vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!
+    vmul.s16        q9, q9, q1
+    vld1.16         {d24, d25, d26, d27}, [COEF_BLOCK, :128]!
+    vmul.s16        q10, q10, q2
+    vld1.16         {d0, d1, d2, d3}, [DCT_TABLE, :128]!
+    vmul.s16        q11, q11, q3
+    vld1.16         {d28, d29, d30, d31}, [COEF_BLOCK, :128]
+    vmul.s16        q12, q12, q0
+    vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!
+    vmul.s16        q14, q14, q2
+    vmul.s16        q13, q13, q1
+    vld1.16         {d0}, [ip, :64]  /* load constants */
+    vmul.s16        q15, q15, q3
+    vpush           {d8 - d13}       /* save Neon registers */
+    /* 1-D IDCT, pass 1 */
+    vsub.s16        q2, q10, q14
+    vadd.s16        q14, q10, q14
+    vsub.s16        q1, q11, q13
+    vadd.s16        q13, q11, q13
+    vsub.s16        q5, q9, q15
+    vadd.s16        q15, q9, q15
+    vqdmulh.s16     q4, q2, XFIX_1_414213562
+    vqdmulh.s16     q6, q1, XFIX_2_613125930
+    vadd.s16        q3, q1, q1
+    vsub.s16        q1, q5, q1
+    vadd.s16        q10, q2, q4
+    vqdmulh.s16     q4, q1, XFIX_1_847759065
+    vsub.s16        q2, q15, q13
+    vadd.s16        q3, q3, q6
+    vqdmulh.s16     q6, q2, XFIX_1_414213562
+    vadd.s16        q1, q1, q4
+    vqdmulh.s16     q4, q5, XFIX_1_082392200
+    vsub.s16        q10, q10, q14
+    vadd.s16        q2, q2, q6
+    vsub.s16        q6, q8, q12
+    vadd.s16        q12, q8, q12
+    vadd.s16        q9, q5, q4
+    vadd.s16        q5, q6, q10
+    vsub.s16        q10, q6, q10
+    vadd.s16        q6, q15, q13
+    vadd.s16        q8, q12, q14
+    vsub.s16        q3, q6, q3
+    vsub.s16        q12, q12, q14
+    vsub.s16        q3, q3, q1
+    vsub.s16        q1, q9, q1
+    vadd.s16        q2, q3, q2
+    vsub.s16        q15, q8, q6
+    vadd.s16        q1, q1, q2
+    vadd.s16        q8, q8, q6
+    vadd.s16        q14, q5, q3
+    vsub.s16        q9, q5, q3
+    vsub.s16        q13, q10, q2
+    vadd.s16        q10, q10, q2
+      /* Transpose */
+      vtrn.16         q8, q9
+    vsub.s16        q11, q12, q1
+      vtrn.16         q14, q15
+    vadd.s16        q12, q12, q1
+      vtrn.16         q10, q11
+      vtrn.16         q12, q13
+      vtrn.32         q9, q11
+      vtrn.32         q12, q14
+      vtrn.32         q8, q10
+      vtrn.32         q13, q15
+      vswp            d28, d21
+      vswp            d26, d19
+    /* 1-D IDCT, pass 2 */
+    vsub.s16        q2, q10, q14
+      vswp            d30, d23
+    vadd.s16        q14, q10, q14
+      vswp            d24, d17
+    vsub.s16        q1, q11, q13
+    vadd.s16        q13, q11, q13
+    vsub.s16        q5, q9, q15
+    vadd.s16        q15, q9, q15
+    vqdmulh.s16     q4, q2, XFIX_1_414213562
+    vqdmulh.s16     q6, q1, XFIX_2_613125930
+    vadd.s16        q3, q1, q1
+    vsub.s16        q1, q5, q1
+    vadd.s16        q10, q2, q4
+    vqdmulh.s16     q4, q1, XFIX_1_847759065
+    vsub.s16        q2, q15, q13
+    vadd.s16        q3, q3, q6
+    vqdmulh.s16     q6, q2, XFIX_1_414213562
+    vadd.s16        q1, q1, q4
+    vqdmulh.s16     q4, q5, XFIX_1_082392200
+    vsub.s16        q10, q10, q14
+    vadd.s16        q2, q2, q6
+    vsub.s16        q6, q8, q12
+    vadd.s16        q12, q8, q12
+    vadd.s16        q9, q5, q4
+    vadd.s16        q5, q6, q10
+    vsub.s16        q10, q6, q10
+    vadd.s16        q6, q15, q13
+    vadd.s16        q8, q12, q14
+    vsub.s16        q3, q6, q3
+    vsub.s16        q12, q12, q14
+    vsub.s16        q3, q3, q1
+    vsub.s16        q1, q9, q1
+    vadd.s16        q2, q3, q2
+    vsub.s16        q15, q8, q6
+    vadd.s16        q1, q1, q2
+    vadd.s16        q8, q8, q6
+    vadd.s16        q14, q5, q3
+    vsub.s16        q9, q5, q3
+    vsub.s16        q13, q10, q2
+    vpop            {d8 - d13}    /* restore Neon registers */
+    vadd.s16        q10, q10, q2
+    vsub.s16        q11, q12, q1
+    vadd.s16        q12, q12, q1
+    /* Descale to 8-bit and range limit */
+    vmov.u8         q0, #0x80
+    vqshrn.s16      d16, q8, #5
+    vqshrn.s16      d17, q9, #5
+    vqshrn.s16      d18, q10, #5
+    vqshrn.s16      d19, q11, #5
+    vqshrn.s16      d20, q12, #5
+    vqshrn.s16      d21, q13, #5
+    vqshrn.s16      d22, q14, #5
+    vqshrn.s16      d23, q15, #5
+    vadd.u8         q8, q8, q0
+    vadd.u8         q9, q9, q0
+    vadd.u8         q10, q10, q0
+    vadd.u8         q11, q11, q0
+    /* Transpose the final 8-bit samples */
+    vtrn.16         q8, q9
+    vtrn.16         q10, q11
+    vtrn.32         q8, q10
+    vtrn.32         q9, q11
+    vtrn.8          d16, d17
+    vtrn.8          d18, d19
+      /* Store results to the output buffer */
+      ldmia           OUTPUT_BUF!, {TMP1, TMP2}
+      add             TMP1, TMP1, OUTPUT_COL
+      add             TMP2, TMP2, OUTPUT_COL
+      vst1.8          {d16}, [TMP1]
+      vst1.8          {d17}, [TMP2]
+      ldmia           OUTPUT_BUF!, {TMP1, TMP2}
+      add             TMP1, TMP1, OUTPUT_COL
+      add             TMP2, TMP2, OUTPUT_COL
+      vst1.8          {d18}, [TMP1]
+    vtrn.8          d20, d21
+      vst1.8          {d19}, [TMP2]
+      ldmia           OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
+      add             TMP1, TMP1, OUTPUT_COL
+      add             TMP2, TMP2, OUTPUT_COL
+      add             TMP3, TMP3, OUTPUT_COL
+      add             TMP4, TMP4, OUTPUT_COL
+      vst1.8          {d20}, [TMP1]
+    vtrn.8          d22, d23
+      vst1.8          {d21}, [TMP2]
+      vst1.8          {d22}, [TMP3]
+      vst1.8          {d23}, [TMP4]
+    bx              lr
+
+    .unreq          DCT_TABLE
+    .unreq          COEF_BLOCK
+    .unreq          OUTPUT_BUF
+    .unreq          OUTPUT_COL
+    .unreq          TMP1
+    .unreq          TMP2
+    .unreq          TMP3
+    .unreq          TMP4
+
+
+/*****************************************************************************/
+
+/*
+ * jsimd_extrgb_ycc_convert_neon
+ * jsimd_extbgr_ycc_convert_neon
+ * jsimd_extrgbx_ycc_convert_neon
+ * jsimd_extbgrx_ycc_convert_neon
+ * jsimd_extxbgr_ycc_convert_neon
+ * jsimd_extxrgb_ycc_convert_neon
+ *
+ * Colorspace conversion RGB -> YCbCr
+ */
+
+.macro do_store size
+  .if \size == 8
+    vst1.8          {d20}, [Y]!
+    vst1.8          {d21}, [U]!
+    vst1.8          {d22}, [V]!
+  .elseif \size == 4
+    vst1.8          {d20[0]}, [Y]!
+    vst1.8          {d20[1]}, [Y]!
+    vst1.8          {d20[2]}, [Y]!
+    vst1.8          {d20[3]}, [Y]!
+    vst1.8          {d21[0]}, [U]!
+    vst1.8          {d21[1]}, [U]!
+    vst1.8          {d21[2]}, [U]!
+    vst1.8          {d21[3]}, [U]!
+    vst1.8          {d22[0]}, [V]!
+    vst1.8          {d22[1]}, [V]!
+    vst1.8          {d22[2]}, [V]!
+    vst1.8          {d22[3]}, [V]!
+  .elseif \size == 2
+    vst1.8          {d20[4]}, [Y]!
+    vst1.8          {d20[5]}, [Y]!
+    vst1.8          {d21[4]}, [U]!
+    vst1.8          {d21[5]}, [U]!
+    vst1.8          {d22[4]}, [V]!
+    vst1.8          {d22[5]}, [V]!
+  .elseif \size == 1
+    vst1.8          {d20[6]}, [Y]!
+    vst1.8          {d21[6]}, [U]!
+    vst1.8          {d22[6]}, [V]!
+  .else
+    .error unsupported macroblock size
+  .endif
+.endm
+
+.macro do_load bpp, size
+  .if \bpp == 24
+    .if \size == 8
+      vld3.8        {d10, d11, d12}, [RGB]!
+      pld           [RGB, #128]
+    .elseif \size == 4
+      vld3.8        {d10[0], d11[0], d12[0]}, [RGB]!
+      vld3.8        {d10[1], d11[1], d12[1]}, [RGB]!
+      vld3.8        {d10[2], d11[2], d12[2]}, [RGB]!
+      vld3.8        {d10[3], d11[3], d12[3]}, [RGB]!
+    .elseif \size == 2
+      vld3.8        {d10[4], d11[4], d12[4]}, [RGB]!
+      vld3.8        {d10[5], d11[5], d12[5]}, [RGB]!
+    .elseif \size == 1
+      vld3.8        {d10[6], d11[6], d12[6]}, [RGB]!
+    .else
+      .error unsupported macroblock size
+    .endif
+  .elseif \bpp == 32
+    .if \size == 8
+      vld4.8        {d10, d11, d12, d13}, [RGB]!
+      pld           [RGB, #128]
+    .elseif \size == 4
+      vld4.8        {d10[0], d11[0], d12[0], d13[0]}, [RGB]!
+      vld4.8        {d10[1], d11[1], d12[1], d13[1]}, [RGB]!
+      vld4.8        {d10[2], d11[2], d12[2], d13[2]}, [RGB]!
+      vld4.8        {d10[3], d11[3], d12[3], d13[3]}, [RGB]!
+    .elseif \size == 2
+      vld4.8        {d10[4], d11[4], d12[4], d13[4]}, [RGB]!
+      vld4.8        {d10[5], d11[5], d12[5], d13[5]}, [RGB]!
+    .elseif \size == 1
+      vld4.8        {d10[6], d11[6], d12[6], d13[6]}, [RGB]!
+    .else
+      .error unsupported macroblock size
+    .endif
+  .else
+    .error unsupported bpp
+  .endif
+.endm
+
+.macro generate_jsimd_rgb_ycc_convert_neon colorid, bpp, r_offs, g_offs, b_offs
+
+/*
+ * 2-stage pipelined RGB->YCbCr conversion
+ */
+
+.macro do_rgb_to_yuv_stage1
+    vmovl.u8        q2, d1\r_offs  /* r = { d4, d5 } */
+    vmovl.u8        q3, d1\g_offs  /* g = { d6, d7 } */
+    vmovl.u8        q4, d1\b_offs  /* b = { d8, d9 } */
+    vmull.u16       q7, d4, d0[0]
+    vmlal.u16       q7, d6, d0[1]
+    vmlal.u16       q7, d8, d0[2]
+    vmull.u16       q8, d5, d0[0]
+    vmlal.u16       q8, d7, d0[1]
+    vmlal.u16       q8, d9, d0[2]
+    vrev64.32       q9, q1
+    vrev64.32       q13, q1
+    vmlsl.u16       q9, d4, d0[3]
+    vmlsl.u16       q9, d6, d1[0]
+    vmlal.u16       q9, d8, d1[1]
+    vmlsl.u16       q13, d5, d0[3]
+    vmlsl.u16       q13, d7, d1[0]
+    vmlal.u16       q13, d9, d1[1]
+    vrev64.32       q14, q1
+    vrev64.32       q15, q1
+    vmlal.u16       q14, d4, d1[1]
+    vmlsl.u16       q14, d6, d1[2]
+    vmlsl.u16       q14, d8, d1[3]
+    vmlal.u16       q15, d5, d1[1]
+    vmlsl.u16       q15, d7, d1[2]
+    vmlsl.u16       q15, d9, d1[3]
+.endm
+
+.macro do_rgb_to_yuv_stage2
+    vrshrn.u32      d20, q7, #16
+    vrshrn.u32      d21, q8, #16
+    vshrn.u32       d22, q9, #16
+    vshrn.u32       d23, q13, #16
+    vshrn.u32       d24, q14, #16
+    vshrn.u32       d25, q15, #16
+    vmovn.u16       d20, q10       /* d20 = y */
+    vmovn.u16       d21, q11       /* d21 = u */
+    vmovn.u16       d22, q12       /* d22 = v */
+.endm
+
+.macro do_rgb_to_yuv
+    do_rgb_to_yuv_stage1
+    do_rgb_to_yuv_stage2
+.endm
+
+.macro do_rgb_to_yuv_stage2_store_load_stage1
+      vrshrn.u32      d20, q7, #16
+      vrshrn.u32      d21, q8, #16
+      vshrn.u32       d22, q9, #16
+    vrev64.32       q9, q1
+      vshrn.u32       d23, q13, #16
+    vrev64.32       q13, q1
+      vshrn.u32       d24, q14, #16
+      vshrn.u32       d25, q15, #16
+    do_load         \bpp, 8
+      vmovn.u16       d20, q10     /* d20 = y */
+    vmovl.u8        q2, d1\r_offs  /* r = { d4, d5 } */
+      vmovn.u16       d21, q11     /* d21 = u */
+    vmovl.u8        q3, d1\g_offs  /* g = { d6, d7 } */
+      vmovn.u16       d22, q12     /* d22 = v */
+    vmovl.u8        q4, d1\b_offs  /* b = { d8, d9 } */
+    vmull.u16       q7, d4, d0[0]
+    vmlal.u16       q7, d6, d0[1]
+    vmlal.u16       q7, d8, d0[2]
+      vst1.8          {d20}, [Y]!
+    vmull.u16       q8, d5, d0[0]
+    vmlal.u16       q8, d7, d0[1]
+    vmlal.u16       q8, d9, d0[2]
+    vmlsl.u16       q9, d4, d0[3]
+    vmlsl.u16       q9, d6, d1[0]
+    vmlal.u16       q9, d8, d1[1]
+      vst1.8          {d21}, [U]!
+    vmlsl.u16       q13, d5, d0[3]
+    vmlsl.u16       q13, d7, d1[0]
+    vmlal.u16       q13, d9, d1[1]
+    vrev64.32       q14, q1
+    vrev64.32       q15, q1
+    vmlal.u16       q14, d4, d1[1]
+    vmlsl.u16       q14, d6, d1[2]
+    vmlsl.u16       q14, d8, d1[3]
+      vst1.8          {d22}, [V]!
+    vmlal.u16       q15, d5, d1[1]
+    vmlsl.u16       q15, d7, d1[2]
+    vmlsl.u16       q15, d9, d1[3]
+.endm
+
+.balign 16
+jsimd_\colorid\()_ycc_neon_consts:
+  .short 19595, 38470, 7471,  11059
+  .short 21709, 32768, 27439, 5329
+  .short 32767, 128,   32767, 128
+  .short 32767, 128,   32767, 128
+
+asm_function jsimd_\colorid\()_ycc_convert_neon
+    OUTPUT_WIDTH    .req r0
+    INPUT_BUF       .req r1
+    OUTPUT_BUF      .req r2
+    OUTPUT_ROW      .req r3
+    NUM_ROWS        .req r4
+
+    OUTPUT_BUF0     .req r5
+    OUTPUT_BUF1     .req r6
+    OUTPUT_BUF2     .req OUTPUT_BUF
+
+    RGB             .req r7
+    Y               .req r8
+    U               .req r9
+    V               .req r10
+    N               .req ip
+
+    /* Load constants to d0, d1, d2, d3 */
+    adr             ip, jsimd_\colorid\()_ycc_neon_consts
+    vld1.16         {d0, d1, d2, d3}, [ip, :128]
+
+    /* Save Arm registers and handle input arguments */
+    push            {r4, r5, r6, r7, r8, r9, r10, lr}
+    ldr             NUM_ROWS, [sp, #(4 * 8)]
+    ldr             OUTPUT_BUF0, [OUTPUT_BUF]
+    ldr             OUTPUT_BUF1, [OUTPUT_BUF, #4]
+    ldr             OUTPUT_BUF2, [OUTPUT_BUF, #8]
+    .unreq          OUTPUT_BUF
+
+    /* Save Neon registers */
+    vpush           {d8 - d15}
+
+    /* Outer loop over scanlines */
+    cmp             NUM_ROWS, #1
+    blt             9f
+0:
+    ldr             Y, [OUTPUT_BUF0, OUTPUT_ROW, lsl #2]
+    ldr             U, [OUTPUT_BUF1, OUTPUT_ROW, lsl #2]
+    mov             N, OUTPUT_WIDTH
+    ldr             V, [OUTPUT_BUF2, OUTPUT_ROW, lsl #2]
+    add             OUTPUT_ROW, OUTPUT_ROW, #1
+    ldr             RGB, [INPUT_BUF], #4
+
+    /* Inner loop over pixels */
+    subs            N, N, #8
+    blt             3f
+    do_load         \bpp, 8
+    do_rgb_to_yuv_stage1
+    subs            N, N, #8
+    blt             2f
+1:
+    do_rgb_to_yuv_stage2_store_load_stage1
+    subs            N, N, #8
+    bge             1b
+2:
+    do_rgb_to_yuv_stage2
+    do_store        8
+    tst             N, #7
+    beq             8f
+3:
+    tst             N, #4
+    beq             3f
+    do_load         \bpp, 4
+3:
+    tst             N, #2
+    beq             4f
+    do_load         \bpp, 2
+4:
+    tst             N, #1
+    beq             5f
+    do_load         \bpp, 1
+5:
+    do_rgb_to_yuv
+    tst             N, #4
+    beq             6f
+    do_store        4
+6:
+    tst             N, #2
+    beq             7f
+    do_store        2
+7:
+    tst             N, #1
+    beq             8f
+    do_store        1
+8:
+    subs            NUM_ROWS, NUM_ROWS, #1
+    bgt             0b
+9:
+    /* Restore all registers and return */
+    vpop            {d8 - d15}
+    pop             {r4, r5, r6, r7, r8, r9, r10, pc}
+
+    .unreq          OUTPUT_WIDTH
+    .unreq          OUTPUT_ROW
+    .unreq          INPUT_BUF
+    .unreq          NUM_ROWS
+    .unreq          OUTPUT_BUF0
+    .unreq          OUTPUT_BUF1
+    .unreq          OUTPUT_BUF2
+    .unreq          RGB
+    .unreq          Y
+    .unreq          U
+    .unreq          V
+    .unreq          N
+
+.purgem do_rgb_to_yuv
+.purgem do_rgb_to_yuv_stage1
+.purgem do_rgb_to_yuv_stage2
+.purgem do_rgb_to_yuv_stage2_store_load_stage1
+
+.endm
+
+/*--------------------------------- id ----- bpp R  G  B */
+generate_jsimd_rgb_ycc_convert_neon extrgb,  24, 0, 1, 2
+generate_jsimd_rgb_ycc_convert_neon extbgr,  24, 2, 1, 0
+generate_jsimd_rgb_ycc_convert_neon extrgbx, 32, 0, 1, 2
+generate_jsimd_rgb_ycc_convert_neon extbgrx, 32, 2, 1, 0
+generate_jsimd_rgb_ycc_convert_neon extxbgr, 32, 3, 2, 1
+generate_jsimd_rgb_ycc_convert_neon extxrgb, 32, 1, 2, 3
+
+.purgem do_load
+.purgem do_store
diff --git a/media/libjpeg/simd/arm/aarch64/jccolext-neon.c b/media/libjpeg/simd/arm/aarch64/jccolext-neon.c
new file mode 100644
index 0000000000..37130c225e
--- /dev/null
+++ b/media/libjpeg/simd/arm/aarch64/jccolext-neon.c
@@ -0,0 +1,316 @@
+/*
+ * jccolext-neon.c - colorspace conversion (64-bit Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jccolor-neon.c */
+
+
+/* RGB -> YCbCr conversion is defined by the following equations:
+ *    Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
+ *    Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B  + 128
+ *    Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B  + 128
+ *
+ * Avoid floating point arithmetic by using shifted integer constants:
+ *    0.29899597 = 19595 * 2^-16
+ *    0.58700561 = 38470 * 2^-16
+ *    0.11399841 =  7471 * 2^-16
+ *    0.16874695 = 11059 * 2^-16
+ *    0.33125305 = 21709 * 2^-16
+ *    0.50000000 = 32768 * 2^-16
+ *    0.41868592 = 27439 * 2^-16
+ *    0.08131409 =  5329 * 2^-16
+ * These constants are defined in jccolor-neon.c
+ *
+ * We add the fixed-point equivalent of 0.5 to Cb and Cr, which effectively
+ * rounds up or down the result via integer truncation.
+ */
+
+void jsimd_rgb_ycc_convert_neon(JDIMENSION image_width, JSAMPARRAY input_buf,
+                                JSAMPIMAGE output_buf, JDIMENSION output_row,
+                                int num_rows)
+{
+  /* Pointer to RGB(X/A) input data */
+  JSAMPROW inptr;
+  /* Pointers to Y, Cb, and Cr output data */
+  JSAMPROW outptr0, outptr1, outptr2;
+  /* Allocate temporary buffer for final (image_width % 16) pixels in row. */
+  ALIGN(16) uint8_t tmp_buf[16 * RGB_PIXELSIZE];
+
+  /* Set up conversion constants. */
+  const uint16x8_t consts = vld1q_u16(jsimd_rgb_ycc_neon_consts);
+  const uint32x4_t scaled_128_5 = vdupq_n_u32((128 << 16) + 32767);
+
+  while (--num_rows >= 0) {
+    inptr = *input_buf++;
+    outptr0 = output_buf[0][output_row];
+    outptr1 = output_buf[1][output_row];
+    outptr2 = output_buf[2][output_row];
+    output_row++;
+
+    int cols_remaining = image_width;
+    for (; cols_remaining >= 16; cols_remaining -= 16) {
+
+#if RGB_PIXELSIZE == 4
+      uint8x16x4_t input_pixels = vld4q_u8(inptr);
+#else
+      uint8x16x3_t input_pixels = vld3q_u8(inptr);
+#endif
+      uint16x8_t r_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_RED]));
+      uint16x8_t g_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_GREEN]));
+      uint16x8_t b_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_BLUE]));
+      uint16x8_t r_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_RED]));
+      uint16x8_t g_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_GREEN]));
+      uint16x8_t b_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_BLUE]));
+
+      /* Compute Y = 0.29900 * R + 0.58700 * G + 0.11400 * B */
+      uint32x4_t y_ll = vmull_laneq_u16(vget_low_u16(r_l), consts, 0);
+      y_ll = vmlal_laneq_u16(y_ll, vget_low_u16(g_l), consts, 1);
+      y_ll = vmlal_laneq_u16(y_ll, vget_low_u16(b_l), consts, 2);
+      uint32x4_t y_lh = vmull_laneq_u16(vget_high_u16(r_l), consts, 0);
+      y_lh = vmlal_laneq_u16(y_lh, vget_high_u16(g_l), consts, 1);
+      y_lh = vmlal_laneq_u16(y_lh, vget_high_u16(b_l), consts, 2);
+      uint32x4_t y_hl = vmull_laneq_u16(vget_low_u16(r_h), consts, 0);
+      y_hl = vmlal_laneq_u16(y_hl, vget_low_u16(g_h), consts, 1);
+      y_hl = vmlal_laneq_u16(y_hl, vget_low_u16(b_h), consts, 2);
+      uint32x4_t y_hh = vmull_laneq_u16(vget_high_u16(r_h), consts, 0);
+      y_hh = vmlal_laneq_u16(y_hh, vget_high_u16(g_h), consts, 1);
+      y_hh = vmlal_laneq_u16(y_hh, vget_high_u16(b_h), consts, 2);
+
+      /* Compute Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B  + 128 */
+      uint32x4_t cb_ll = scaled_128_5;
+      cb_ll = vmlsl_laneq_u16(cb_ll, vget_low_u16(r_l), consts, 3);
+      cb_ll = vmlsl_laneq_u16(cb_ll, vget_low_u16(g_l), consts, 4);
+      cb_ll = vmlal_laneq_u16(cb_ll, vget_low_u16(b_l), consts, 5);
+      uint32x4_t cb_lh = scaled_128_5;
+      cb_lh = vmlsl_laneq_u16(cb_lh, vget_high_u16(r_l), consts, 3);
+      cb_lh = vmlsl_laneq_u16(cb_lh, vget_high_u16(g_l), consts, 4);
+      cb_lh = vmlal_laneq_u16(cb_lh, vget_high_u16(b_l), consts, 5);
+      uint32x4_t cb_hl = scaled_128_5;
+      cb_hl = vmlsl_laneq_u16(cb_hl, vget_low_u16(r_h), consts, 3);
+      cb_hl = vmlsl_laneq_u16(cb_hl, vget_low_u16(g_h), consts, 4);
+      cb_hl = vmlal_laneq_u16(cb_hl, vget_low_u16(b_h), consts, 5);
+      uint32x4_t cb_hh = scaled_128_5;
+      cb_hh = vmlsl_laneq_u16(cb_hh, vget_high_u16(r_h), consts, 3);
+      cb_hh = vmlsl_laneq_u16(cb_hh, vget_high_u16(g_h), consts, 4);
+      cb_hh = vmlal_laneq_u16(cb_hh, vget_high_u16(b_h), consts, 5);
+
+      /* Compute Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B  + 128 */
+      uint32x4_t cr_ll = scaled_128_5;
+      cr_ll = vmlal_laneq_u16(cr_ll, vget_low_u16(r_l), consts, 5);
+      cr_ll = vmlsl_laneq_u16(cr_ll, vget_low_u16(g_l), consts, 6);
+      cr_ll = vmlsl_laneq_u16(cr_ll, vget_low_u16(b_l), consts, 7);
+      uint32x4_t cr_lh = scaled_128_5;
+      cr_lh = vmlal_laneq_u16(cr_lh, vget_high_u16(r_l), consts, 5);
+      cr_lh = vmlsl_laneq_u16(cr_lh, vget_high_u16(g_l), consts, 6);
+      cr_lh = vmlsl_laneq_u16(cr_lh, vget_high_u16(b_l), consts, 7);
+      uint32x4_t cr_hl = scaled_128_5;
+      cr_hl = vmlal_laneq_u16(cr_hl, vget_low_u16(r_h), consts, 5);
+      cr_hl = vmlsl_laneq_u16(cr_hl, vget_low_u16(g_h), consts, 6);
+      cr_hl = vmlsl_laneq_u16(cr_hl, vget_low_u16(b_h), consts, 7);
+      uint32x4_t cr_hh = scaled_128_5;
+      cr_hh = vmlal_laneq_u16(cr_hh, vget_high_u16(r_h), consts, 5);
+      cr_hh = vmlsl_laneq_u16(cr_hh, vget_high_u16(g_h), consts, 6);
+      cr_hh = vmlsl_laneq_u16(cr_hh, vget_high_u16(b_h), consts, 7);
+
+      /* Descale Y values (rounding right shift) and narrow to 16-bit. */
+      uint16x8_t y_l = vcombine_u16(vrshrn_n_u32(y_ll, 16),
+                                    vrshrn_n_u32(y_lh, 16));
+      uint16x8_t y_h = vcombine_u16(vrshrn_n_u32(y_hl, 16),
+                                    vrshrn_n_u32(y_hh, 16));
+      /* Descale Cb values (right shift) and narrow to 16-bit. */
+      uint16x8_t cb_l = vcombine_u16(vshrn_n_u32(cb_ll, 16),
+                                     vshrn_n_u32(cb_lh, 16));
+      uint16x8_t cb_h = vcombine_u16(vshrn_n_u32(cb_hl, 16),
+                                     vshrn_n_u32(cb_hh, 16));
+      /* Descale Cr values (right shift) and narrow to 16-bit. */
+      uint16x8_t cr_l = vcombine_u16(vshrn_n_u32(cr_ll, 16),
+                                     vshrn_n_u32(cr_lh, 16));
+      uint16x8_t cr_h = vcombine_u16(vshrn_n_u32(cr_hl, 16),
+                                     vshrn_n_u32(cr_hh, 16));
+      /* Narrow Y, Cb, and Cr values to 8-bit and store to memory.  Buffer
+       * overwrite is permitted up to the next multiple of ALIGN_SIZE bytes.
+       */
+      vst1q_u8(outptr0, vcombine_u8(vmovn_u16(y_l), vmovn_u16(y_h)));
+      vst1q_u8(outptr1, vcombine_u8(vmovn_u16(cb_l), vmovn_u16(cb_h)));
+      vst1q_u8(outptr2, vcombine_u8(vmovn_u16(cr_l), vmovn_u16(cr_h)));
+
+      /* Increment pointers. */
+      inptr += (16 * RGB_PIXELSIZE);
+      outptr0 += 16;
+      outptr1 += 16;
+      outptr2 += 16;
+    }
+
+    if (cols_remaining > 8) {
+      /* To prevent buffer overread by the vector load instructions, the last
+       * (image_width % 16) columns of data are first memcopied to a temporary
+       * buffer large enough to accommodate the vector load.
+       */
+      memcpy(tmp_buf, inptr, cols_remaining * RGB_PIXELSIZE);
+      inptr = tmp_buf;
+
+#if RGB_PIXELSIZE == 4
+      uint8x16x4_t input_pixels = vld4q_u8(inptr);
+#else
+      uint8x16x3_t input_pixels = vld3q_u8(inptr);
+#endif
+      uint16x8_t r_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_RED]));
+      uint16x8_t g_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_GREEN]));
+      uint16x8_t b_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_BLUE]));
+      uint16x8_t r_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_RED]));
+      uint16x8_t g_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_GREEN]));
+      uint16x8_t b_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_BLUE]));
+
+      /* Compute Y = 0.29900 * R + 0.58700 * G + 0.11400 * B */
+      uint32x4_t y_ll = vmull_laneq_u16(vget_low_u16(r_l), consts, 0);
+      y_ll = vmlal_laneq_u16(y_ll, vget_low_u16(g_l), consts, 1);
+      y_ll = vmlal_laneq_u16(y_ll, vget_low_u16(b_l), consts, 2);
+      uint32x4_t y_lh = vmull_laneq_u16(vget_high_u16(r_l), consts, 0);
+      y_lh = vmlal_laneq_u16(y_lh, vget_high_u16(g_l), consts, 1);
+      y_lh = vmlal_laneq_u16(y_lh, vget_high_u16(b_l), consts, 2);
+      uint32x4_t y_hl = vmull_laneq_u16(vget_low_u16(r_h), consts, 0);
+      y_hl = vmlal_laneq_u16(y_hl, vget_low_u16(g_h), consts, 1);
+      y_hl = vmlal_laneq_u16(y_hl, vget_low_u16(b_h), consts, 2);
+      uint32x4_t y_hh = vmull_laneq_u16(vget_high_u16(r_h), consts, 0);
+      y_hh = vmlal_laneq_u16(y_hh, vget_high_u16(g_h), consts, 1);
+      y_hh = vmlal_laneq_u16(y_hh, vget_high_u16(b_h), consts, 2);
+
+      /* Compute Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B  + 128 */
+      uint32x4_t cb_ll = scaled_128_5;
+      cb_ll = vmlsl_laneq_u16(cb_ll, vget_low_u16(r_l), consts, 3);
+      cb_ll = vmlsl_laneq_u16(cb_ll, vget_low_u16(g_l), consts, 4);
+      cb_ll = vmlal_laneq_u16(cb_ll, vget_low_u16(b_l), consts, 5);
+      uint32x4_t cb_lh = scaled_128_5;
+      cb_lh = vmlsl_laneq_u16(cb_lh, vget_high_u16(r_l), consts, 3);
+      cb_lh = vmlsl_laneq_u16(cb_lh, vget_high_u16(g_l), consts, 4);
+      cb_lh = vmlal_laneq_u16(cb_lh, vget_high_u16(b_l), consts, 5);
+      uint32x4_t cb_hl = scaled_128_5;
+      cb_hl = vmlsl_laneq_u16(cb_hl, vget_low_u16(r_h), consts, 3);
+      cb_hl = vmlsl_laneq_u16(cb_hl, vget_low_u16(g_h), consts, 4);
+      cb_hl = vmlal_laneq_u16(cb_hl, vget_low_u16(b_h), consts, 5);
+      uint32x4_t cb_hh = scaled_128_5;
+      cb_hh = vmlsl_laneq_u16(cb_hh, vget_high_u16(r_h), consts, 3);
+      cb_hh = vmlsl_laneq_u16(cb_hh, vget_high_u16(g_h), consts, 4);
+      cb_hh = vmlal_laneq_u16(cb_hh, vget_high_u16(b_h), consts, 5);
+
+      /* Compute Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B  + 128 */
+      uint32x4_t cr_ll = scaled_128_5;
+      cr_ll = vmlal_laneq_u16(cr_ll, vget_low_u16(r_l), consts, 5);
+      cr_ll = vmlsl_laneq_u16(cr_ll, vget_low_u16(g_l), consts, 6);
+      cr_ll = vmlsl_laneq_u16(cr_ll, vget_low_u16(b_l), consts, 7);
+      uint32x4_t cr_lh = scaled_128_5;
+      cr_lh = vmlal_laneq_u16(cr_lh, vget_high_u16(r_l), consts, 5);
+      cr_lh = vmlsl_laneq_u16(cr_lh, vget_high_u16(g_l), consts, 6);
+      cr_lh = vmlsl_laneq_u16(cr_lh, vget_high_u16(b_l), consts, 7);
+      uint32x4_t cr_hl = scaled_128_5;
+      cr_hl = vmlal_laneq_u16(cr_hl, vget_low_u16(r_h), consts, 5);
+      cr_hl = vmlsl_laneq_u16(cr_hl, vget_low_u16(g_h), consts, 6);
+      cr_hl = vmlsl_laneq_u16(cr_hl, vget_low_u16(b_h), consts, 7);
+      uint32x4_t cr_hh = scaled_128_5;
+      cr_hh = vmlal_laneq_u16(cr_hh, vget_high_u16(r_h), consts, 5);
+      cr_hh = vmlsl_laneq_u16(cr_hh, vget_high_u16(g_h), consts, 6);
+      cr_hh = vmlsl_laneq_u16(cr_hh, vget_high_u16(b_h), consts, 7);
+
+      /* Descale Y values (rounding right shift) and narrow to 16-bit. */
+      uint16x8_t y_l = vcombine_u16(vrshrn_n_u32(y_ll, 16),
+                                    vrshrn_n_u32(y_lh, 16));
+      uint16x8_t y_h = vcombine_u16(vrshrn_n_u32(y_hl, 16),
+                                    vrshrn_n_u32(y_hh, 16));
+      /* Descale Cb values (right shift) and narrow to 16-bit. */
+      uint16x8_t cb_l = vcombine_u16(vshrn_n_u32(cb_ll, 16),
+                                     vshrn_n_u32(cb_lh, 16));
+      uint16x8_t cb_h = vcombine_u16(vshrn_n_u32(cb_hl, 16),
+                                     vshrn_n_u32(cb_hh, 16));
+      /* Descale Cr values (right shift) and narrow to 16-bit. */
+      uint16x8_t cr_l = vcombine_u16(vshrn_n_u32(cr_ll, 16),
+                                     vshrn_n_u32(cr_lh, 16));
+      uint16x8_t cr_h = vcombine_u16(vshrn_n_u32(cr_hl, 16),
+                                     vshrn_n_u32(cr_hh, 16));
+      /* Narrow Y, Cb, and Cr values to 8-bit and store to memory.  Buffer
+       * overwrite is permitted up to the next multiple of ALIGN_SIZE bytes.
+       */
+      vst1q_u8(outptr0, vcombine_u8(vmovn_u16(y_l), vmovn_u16(y_h)));
+      vst1q_u8(outptr1, vcombine_u8(vmovn_u16(cb_l), vmovn_u16(cb_h)));
+      vst1q_u8(outptr2, vcombine_u8(vmovn_u16(cr_l), vmovn_u16(cr_h)));
+
+    } else if (cols_remaining > 0) {
+      /* To prevent buffer overread by the vector load instructions, the last
+       * (image_width % 8) columns of data are first memcopied to a temporary
+       * buffer large enough to accommodate the vector load.
+       */
+      memcpy(tmp_buf, inptr, cols_remaining * RGB_PIXELSIZE);
+      inptr = tmp_buf;
+
+#if RGB_PIXELSIZE == 4
+      uint8x8x4_t input_pixels = vld4_u8(inptr);
+#else
+      uint8x8x3_t input_pixels = vld3_u8(inptr);
+#endif
+      uint16x8_t r = vmovl_u8(input_pixels.val[RGB_RED]);
+      uint16x8_t g = vmovl_u8(input_pixels.val[RGB_GREEN]);
+      uint16x8_t b = vmovl_u8(input_pixels.val[RGB_BLUE]);
+
+      /* Compute Y = 0.29900 * R + 0.58700 * G + 0.11400 * B */
+      uint32x4_t y_l = vmull_laneq_u16(vget_low_u16(r), consts, 0);
+      y_l = vmlal_laneq_u16(y_l, vget_low_u16(g), consts, 1);
+      y_l = vmlal_laneq_u16(y_l, vget_low_u16(b), consts, 2);
+      uint32x4_t y_h = vmull_laneq_u16(vget_high_u16(r), consts, 0);
+      y_h = vmlal_laneq_u16(y_h, vget_high_u16(g), consts, 1);
+      y_h = vmlal_laneq_u16(y_h, vget_high_u16(b), consts, 2);
+
+      /* Compute Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B  + 128 */
+      uint32x4_t cb_l = scaled_128_5;
+      cb_l = vmlsl_laneq_u16(cb_l, vget_low_u16(r), consts, 3);
+      cb_l = vmlsl_laneq_u16(cb_l, vget_low_u16(g), consts, 4);
+      cb_l = vmlal_laneq_u16(cb_l, vget_low_u16(b), consts, 5);
+      uint32x4_t cb_h = scaled_128_5;
+      cb_h = vmlsl_laneq_u16(cb_h, vget_high_u16(r), consts, 3);
+      cb_h = vmlsl_laneq_u16(cb_h, vget_high_u16(g), consts, 4);
+      cb_h = vmlal_laneq_u16(cb_h, vget_high_u16(b), consts, 5);
+
+      /* Compute Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B  + 128 */
+      uint32x4_t cr_l = scaled_128_5;
+      cr_l = vmlal_laneq_u16(cr_l, vget_low_u16(r), consts, 5);
+      cr_l = vmlsl_laneq_u16(cr_l, vget_low_u16(g), consts, 6);
+      cr_l = vmlsl_laneq_u16(cr_l, vget_low_u16(b), consts, 7);
+      uint32x4_t cr_h = scaled_128_5;
+      cr_h = vmlal_laneq_u16(cr_h, vget_high_u16(r), consts, 5);
+      cr_h = vmlsl_laneq_u16(cr_h, vget_high_u16(g), consts, 6);
+      cr_h = vmlsl_laneq_u16(cr_h, vget_high_u16(b), consts, 7);
+
+      /* Descale Y values (rounding right shift) and narrow to 16-bit. */
+      uint16x8_t y_u16 = vcombine_u16(vrshrn_n_u32(y_l, 16),
+                                      vrshrn_n_u32(y_h, 16));
+      /* Descale Cb values (right shift) and narrow to 16-bit. */
+      uint16x8_t cb_u16 = vcombine_u16(vshrn_n_u32(cb_l, 16),
+                                       vshrn_n_u32(cb_h, 16));
+      /* Descale Cr values (right shift) and narrow to 16-bit. */
+      uint16x8_t cr_u16 = vcombine_u16(vshrn_n_u32(cr_l, 16),
+                                       vshrn_n_u32(cr_h, 16));
+      /* Narrow Y, Cb, and Cr values to 8-bit and store to memory.  Buffer
+       * overwrite is permitted up to the next multiple of ALIGN_SIZE bytes.
+       */
+      vst1_u8(outptr0, vmovn_u16(y_u16));
+      vst1_u8(outptr1, vmovn_u16(cb_u16));
+      vst1_u8(outptr2, vmovn_u16(cr_u16));
+    }
+  }
+}
diff --git a/media/libjpeg/simd/arm/aarch64/jchuff-neon.c b/media/libjpeg/simd/arm/aarch64/jchuff-neon.c
new file mode 100644
index 0000000000..607a116070
--- /dev/null
+++ b/media/libjpeg/simd/arm/aarch64/jchuff-neon.c
@@ -0,0 +1,411 @@
+/*
+ * jchuff-neon.c - Huffman entropy encoding (64-bit Arm Neon)
+ *
+ * Copyright (C) 2020-2021, Arm Limited.  All Rights Reserved.
+ * Copyright (C) 2020, 2022, D. R. Commander.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ *
+ * NOTE: All referenced figures are from
+ * Recommendation ITU-T T.81 (1992) | ISO/IEC 10918-1:1994.
+ */
+
+#define JPEG_INTERNALS
+#include "../../../jinclude.h"
+#include "../../../jpeglib.h"
+#include "../../../jsimd.h"
+#include "../../../jdct.h"
+#include "../../../jsimddct.h"
+#include "../../jsimd.h"
+#include "../align.h"
+#include "../jchuff.h"
+#include "neon-compat.h"
+
+#include <limits.h>
+
+#include <arm_neon.h>
+
+
+ALIGN(16) static const uint8_t jsimd_huff_encode_one_block_consts[] = {
+    0,   1,   2,   3,  16,  17,  32,  33,
+   18,  19,   4,   5,   6,   7,  20,  21,
+   34,  35,  48,  49, 255, 255,  50,  51,
+   36,  37,  22,  23,   8,   9,  10,  11,
+  255, 255,   6,   7,  20,  21,  34,  35,
+   48,  49, 255, 255,  50,  51,  36,  37,
+   54,  55,  40,  41,  26,  27,  12,  13,
+   14,  15,  28,  29,  42,  43,  56,  57,
+    6,   7,  20,  21,  34,  35,  48,  49,
+   50,  51,  36,  37,  22,  23,   8,   9,
+   26,  27,  12,  13, 255, 255,  14,  15,
+   28,  29,  42,  43,  56,  57, 255, 255,
+   52,  53,  54,  55,  40,  41,  26,  27,
+   12,  13, 255, 255,  14,  15,  28,  29,
+   26,  27,  40,  41,  42,  43,  28,  29,
+   14,  15,  30,  31,  44,  45,  46,  47
+};
+
+/* The AArch64 implementation of the FLUSH() macro triggers a UBSan misaligned
+ * address warning because the macro sometimes writes a 64-bit value to a
+ * non-64-bit-aligned address.  That behavior is technically undefined per
+ * the C specification, but it is supported by the AArch64 architecture and
+ * compilers.
+ */
+#if defined(__has_feature)
+#if __has_feature(undefined_behavior_sanitizer)
+__attribute__((no_sanitize("alignment")))
+#endif
+#endif
+JOCTET *jsimd_huff_encode_one_block_neon(void *state, JOCTET *buffer,
+                                         JCOEFPTR block, int last_dc_val,
+                                         c_derived_tbl *dctbl,
+                                         c_derived_tbl *actbl)
+{
+  uint16_t block_diff[DCTSIZE2];
+
+  /* Load lookup table indices for rows of zig-zag ordering. */
+#ifdef HAVE_VLD1Q_U8_X4
+  const uint8x16x4_t idx_rows_0123 =
+    vld1q_u8_x4(jsimd_huff_encode_one_block_consts + 0 * DCTSIZE);
+  const uint8x16x4_t idx_rows_4567 =
+    vld1q_u8_x4(jsimd_huff_encode_one_block_consts + 8 * DCTSIZE);
+#else
+  /* GCC does not currently support intrinsics vl1dq_<type>_x4(). */
+  const uint8x16x4_t idx_rows_0123 = { {
+    vld1q_u8(jsimd_huff_encode_one_block_consts + 0 * DCTSIZE),
+    vld1q_u8(jsimd_huff_encode_one_block_consts + 2 * DCTSIZE),
+    vld1q_u8(jsimd_huff_encode_one_block_consts + 4 * DCTSIZE),
+    vld1q_u8(jsimd_huff_encode_one_block_consts + 6 * DCTSIZE)
+  } };
+  const uint8x16x4_t idx_rows_4567 = { {
+    vld1q_u8(jsimd_huff_encode_one_block_consts + 8 * DCTSIZE),
+    vld1q_u8(jsimd_huff_encode_one_block_consts + 10 * DCTSIZE),
+    vld1q_u8(jsimd_huff_encode_one_block_consts + 12 * DCTSIZE),
+    vld1q_u8(jsimd_huff_encode_one_block_consts + 14 * DCTSIZE)
+  } };
+#endif
+
+  /* Load 8x8 block of DCT coefficients. */
+#ifdef HAVE_VLD1Q_U8_X4
+  const int8x16x4_t tbl_rows_0123 =
+    vld1q_s8_x4((int8_t *)(block + 0 * DCTSIZE));
+  const int8x16x4_t tbl_rows_4567 =
+    vld1q_s8_x4((int8_t *)(block + 4 * DCTSIZE));
+#else
+  const int8x16x4_t tbl_rows_0123 = { {
+    vld1q_s8((int8_t *)(block + 0 * DCTSIZE)),
+    vld1q_s8((int8_t *)(block + 1 * DCTSIZE)),
+    vld1q_s8((int8_t *)(block + 2 * DCTSIZE)),
+    vld1q_s8((int8_t *)(block + 3 * DCTSIZE))
+  } };
+  const int8x16x4_t tbl_rows_4567 = { {
+    vld1q_s8((int8_t *)(block + 4 * DCTSIZE)),
+    vld1q_s8((int8_t *)(block + 5 * DCTSIZE)),
+    vld1q_s8((int8_t *)(block + 6 * DCTSIZE)),
+    vld1q_s8((int8_t *)(block + 7 * DCTSIZE))
+  } };
+#endif
+
+  /* Initialise extra lookup tables. */
+  const int8x16x4_t tbl_rows_2345 = { {
+    tbl_rows_0123.val[2], tbl_rows_0123.val[3],
+    tbl_rows_4567.val[0], tbl_rows_4567.val[1]
+  } };
+  const int8x16x3_t tbl_rows_567 =
+    { { tbl_rows_4567.val[1], tbl_rows_4567.val[2], tbl_rows_4567.val[3] } };
+
+  /* Shuffle coefficients into zig-zag order. */
+  int16x8_t row0 =
+    vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_0123, idx_rows_0123.val[0]));
+  int16x8_t row1 =
+    vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_0123, idx_rows_0123.val[1]));
+  int16x8_t row2 =
+    vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_2345, idx_rows_0123.val[2]));
+  int16x8_t row3 =
+    vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_0123, idx_rows_0123.val[3]));
+  int16x8_t row4 =
+    vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_4567, idx_rows_4567.val[0]));
+  int16x8_t row5 =
+    vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_2345, idx_rows_4567.val[1]));
+  int16x8_t row6 =
+    vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_4567, idx_rows_4567.val[2]));
+  int16x8_t row7 =
+    vreinterpretq_s16_s8(vqtbl3q_s8(tbl_rows_567, idx_rows_4567.val[3]));
+
+  /* Compute DC coefficient difference value (F.1.1.5.1). */
+  row0 = vsetq_lane_s16(block[0] - last_dc_val, row0, 0);
+  /* Initialize AC coefficient lanes not reachable by lookup tables. */
+  row1 =
+    vsetq_lane_s16(vgetq_lane_s16(vreinterpretq_s16_s8(tbl_rows_4567.val[0]),
+                                  0), row1, 2);
+  row2 =
+    vsetq_lane_s16(vgetq_lane_s16(vreinterpretq_s16_s8(tbl_rows_0123.val[1]),
+                                  4), row2, 0);
+  row2 =
+    vsetq_lane_s16(vgetq_lane_s16(vreinterpretq_s16_s8(tbl_rows_4567.val[2]),
+                                  0), row2, 5);
+  row5 =
+    vsetq_lane_s16(vgetq_lane_s16(vreinterpretq_s16_s8(tbl_rows_0123.val[1]),
+                                  7), row5, 2);
+  row5 =
+    vsetq_lane_s16(vgetq_lane_s16(vreinterpretq_s16_s8(tbl_rows_4567.val[2]),
+                                  3), row5, 7);
+  row6 =
+    vsetq_lane_s16(vgetq_lane_s16(vreinterpretq_s16_s8(tbl_rows_0123.val[3]),
+                                  7), row6, 5);
+
+  /* DCT block is now in zig-zag order; start Huffman encoding process. */
+
+  /* Construct bitmap to accelerate encoding of AC coefficients.  A set bit
+   * means that the corresponding coefficient != 0.
+   */
+  uint16x8_t row0_ne_0 = vtstq_s16(row0, row0);
+  uint16x8_t row1_ne_0 = vtstq_s16(row1, row1);
+  uint16x8_t row2_ne_0 = vtstq_s16(row2, row2);
+  uint16x8_t row3_ne_0 = vtstq_s16(row3, row3);
+  uint16x8_t row4_ne_0 = vtstq_s16(row4, row4);
+  uint16x8_t row5_ne_0 = vtstq_s16(row5, row5);
+  uint16x8_t row6_ne_0 = vtstq_s16(row6, row6);
+  uint16x8_t row7_ne_0 = vtstq_s16(row7, row7);
+
+  uint8x16_t row10_ne_0 = vuzp1q_u8(vreinterpretq_u8_u16(row1_ne_0),
+                                    vreinterpretq_u8_u16(row0_ne_0));
+  uint8x16_t row32_ne_0 = vuzp1q_u8(vreinterpretq_u8_u16(row3_ne_0),
+                                    vreinterpretq_u8_u16(row2_ne_0));
+  uint8x16_t row54_ne_0 = vuzp1q_u8(vreinterpretq_u8_u16(row5_ne_0),
+                                    vreinterpretq_u8_u16(row4_ne_0));
+  uint8x16_t row76_ne_0 = vuzp1q_u8(vreinterpretq_u8_u16(row7_ne_0),
+                                    vreinterpretq_u8_u16(row6_ne_0));
+
+  /* { 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 } */
+  const uint8x16_t bitmap_mask =
+    vreinterpretq_u8_u64(vdupq_n_u64(0x0102040810204080));
+
+  uint8x16_t bitmap_rows_10 = vandq_u8(row10_ne_0, bitmap_mask);
+  uint8x16_t bitmap_rows_32 = vandq_u8(row32_ne_0, bitmap_mask);
+  uint8x16_t bitmap_rows_54 = vandq_u8(row54_ne_0, bitmap_mask);
+  uint8x16_t bitmap_rows_76 = vandq_u8(row76_ne_0, bitmap_mask);
+
+  uint8x16_t bitmap_rows_3210 = vpaddq_u8(bitmap_rows_32, bitmap_rows_10);
+  uint8x16_t bitmap_rows_7654 = vpaddq_u8(bitmap_rows_76, bitmap_rows_54);
+  uint8x16_t bitmap_rows_76543210 = vpaddq_u8(bitmap_rows_7654,
+                                              bitmap_rows_3210);
+  uint8x8_t bitmap_all = vpadd_u8(vget_low_u8(bitmap_rows_76543210),
+                                  vget_high_u8(bitmap_rows_76543210));
+
+  /* Shift left to remove DC bit. */
+  bitmap_all =
+    vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(bitmap_all), 1));
+  /* Count bits set (number of non-zero coefficients) in bitmap. */
+  unsigned int non_zero_coefficients = vaddv_u8(vcnt_u8(bitmap_all));
+  /* Move bitmap to 64-bit scalar register. */
+  uint64_t bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
+
+  /* Set up state and bit buffer for output bitstream. */
+  working_state *state_ptr = (working_state *)state;
+  int free_bits = state_ptr->cur.free_bits;
+  size_t put_buffer = state_ptr->cur.put_buffer;
+
+  /* Encode DC coefficient. */
+
+  /* For negative coeffs: diff = abs(coeff) -1 = ~abs(coeff) */
+  int16x8_t abs_row0 = vabsq_s16(row0);
+  int16x8_t row0_lz = vclzq_s16(abs_row0);
+  uint16x8_t row0_mask = vshlq_u16(vcltzq_s16(row0), vnegq_s16(row0_lz));
+  uint16x8_t row0_diff = veorq_u16(vreinterpretq_u16_s16(abs_row0), row0_mask);
+  /* Find nbits required to specify sign and amplitude of coefficient. */
+  unsigned int lz = vgetq_lane_u16(vreinterpretq_u16_s16(row0_lz), 0);
+  unsigned int nbits = 16 - lz;
+  /* Emit Huffman-coded symbol and additional diff bits. */
+  unsigned int diff = vgetq_lane_u16(row0_diff, 0);
+  PUT_CODE(dctbl->ehufco[nbits], dctbl->ehufsi[nbits], diff)
+
+  /* Encode AC coefficients. */
+
+  unsigned int r = 0;  /* r = run length of zeros */
+  unsigned int i = 1;  /* i = number of coefficients encoded */
+  /* Code and size information for a run length of 16 zero coefficients */
+  const unsigned int code_0xf0 = actbl->ehufco[0xf0];
+  const unsigned int size_0xf0 = actbl->ehufsi[0xf0];
+
+  /* The most efficient method of computing nbits and diff depends on the
+   * number of non-zero coefficients.  If the bitmap is not too sparse (> 8
+   * non-zero AC coefficients), it is beneficial to do all of the work using
+   * Neon; else we do some of the work using Neon and the rest on demand using
+   * scalar code.
+   */
+  if (non_zero_coefficients > 8) {
+    uint8_t block_nbits[DCTSIZE2];
+
+    int16x8_t abs_row1 = vabsq_s16(row1);
+    int16x8_t abs_row2 = vabsq_s16(row2);
+    int16x8_t abs_row3 = vabsq_s16(row3);
+    int16x8_t abs_row4 = vabsq_s16(row4);
+    int16x8_t abs_row5 = vabsq_s16(row5);
+    int16x8_t abs_row6 = vabsq_s16(row6);
+    int16x8_t abs_row7 = vabsq_s16(row7);
+    int16x8_t row1_lz = vclzq_s16(abs_row1);
+    int16x8_t row2_lz = vclzq_s16(abs_row2);
+    int16x8_t row3_lz = vclzq_s16(abs_row3);
+    int16x8_t row4_lz = vclzq_s16(abs_row4);
+    int16x8_t row5_lz = vclzq_s16(abs_row5);
+    int16x8_t row6_lz = vclzq_s16(abs_row6);
+    int16x8_t row7_lz = vclzq_s16(abs_row7);
+    /* Narrow leading zero count to 8 bits. */
+    uint8x16_t row01_lz = vuzp1q_u8(vreinterpretq_u8_s16(row0_lz),
+                                    vreinterpretq_u8_s16(row1_lz));
+    uint8x16_t row23_lz = vuzp1q_u8(vreinterpretq_u8_s16(row2_lz),
+                                    vreinterpretq_u8_s16(row3_lz));
+    uint8x16_t row45_lz = vuzp1q_u8(vreinterpretq_u8_s16(row4_lz),
+                                    vreinterpretq_u8_s16(row5_lz));
+    uint8x16_t row67_lz = vuzp1q_u8(vreinterpretq_u8_s16(row6_lz),
+                                    vreinterpretq_u8_s16(row7_lz));
+    /* Compute nbits needed to specify magnitude of each coefficient. */
+    uint8x16_t row01_nbits = vsubq_u8(vdupq_n_u8(16), row01_lz);
+    uint8x16_t row23_nbits = vsubq_u8(vdupq_n_u8(16), row23_lz);
+    uint8x16_t row45_nbits = vsubq_u8(vdupq_n_u8(16), row45_lz);
+    uint8x16_t row67_nbits = vsubq_u8(vdupq_n_u8(16), row67_lz);
+    /* Store nbits. */
+    vst1q_u8(block_nbits + 0 * DCTSIZE, row01_nbits);
+    vst1q_u8(block_nbits + 2 * DCTSIZE, row23_nbits);
+    vst1q_u8(block_nbits + 4 * DCTSIZE, row45_nbits);
+    vst1q_u8(block_nbits + 6 * DCTSIZE, row67_nbits);
+    /* Mask bits not required to specify sign and amplitude of diff. */
+    uint16x8_t row1_mask = vshlq_u16(vcltzq_s16(row1), vnegq_s16(row1_lz));
+    uint16x8_t row2_mask = vshlq_u16(vcltzq_s16(row2), vnegq_s16(row2_lz));
+    uint16x8_t row3_mask = vshlq_u16(vcltzq_s16(row3), vnegq_s16(row3_lz));
+    uint16x8_t row4_mask = vshlq_u16(vcltzq_s16(row4), vnegq_s16(row4_lz));
+    uint16x8_t row5_mask = vshlq_u16(vcltzq_s16(row5), vnegq_s16(row5_lz));
+    uint16x8_t row6_mask = vshlq_u16(vcltzq_s16(row6), vnegq_s16(row6_lz));
+    uint16x8_t row7_mask = vshlq_u16(vcltzq_s16(row7), vnegq_s16(row7_lz));
+    /* diff = abs(coeff) ^ sign(coeff) [no-op for positive coefficients] */
+    uint16x8_t row1_diff = veorq_u16(vreinterpretq_u16_s16(abs_row1),
+                                     row1_mask);
+    uint16x8_t row2_diff = veorq_u16(vreinterpretq_u16_s16(abs_row2),
+                                     row2_mask);
+    uint16x8_t row3_diff = veorq_u16(vreinterpretq_u16_s16(abs_row3),
+                                     row3_mask);
+    uint16x8_t row4_diff = veorq_u16(vreinterpretq_u16_s16(abs_row4),
+                                     row4_mask);
+    uint16x8_t row5_diff = veorq_u16(vreinterpretq_u16_s16(abs_row5),
+                                     row5_mask);
+    uint16x8_t row6_diff = veorq_u16(vreinterpretq_u16_s16(abs_row6),
+                                     row6_mask);
+    uint16x8_t row7_diff = veorq_u16(vreinterpretq_u16_s16(abs_row7),
+                                     row7_mask);
+    /* Store diff bits. */
+    vst1q_u16(block_diff + 0 * DCTSIZE, row0_diff);
+    vst1q_u16(block_diff + 1 * DCTSIZE, row1_diff);
+    vst1q_u16(block_diff + 2 * DCTSIZE, row2_diff);
+    vst1q_u16(block_diff + 3 * DCTSIZE, row3_diff);
+    vst1q_u16(block_diff + 4 * DCTSIZE, row4_diff);
+    vst1q_u16(block_diff + 5 * DCTSIZE, row5_diff);
+    vst1q_u16(block_diff + 6 * DCTSIZE, row6_diff);
+    vst1q_u16(block_diff + 7 * DCTSIZE, row7_diff);
+
+    while (bitmap != 0) {
+      r = BUILTIN_CLZLL(bitmap);
+      i += r;
+      bitmap <<= r;
+      nbits = block_nbits[i];
+      diff = block_diff[i];
+      while (r > 15) {
+        /* If run length > 15, emit special run-length-16 codes. */
+        PUT_BITS(code_0xf0, size_0xf0)
+        r -= 16;
+      }
+      /* Emit Huffman symbol for run length / number of bits. (F.1.2.2.1) */
+      unsigned int rs = (r << 4) + nbits;
+      PUT_CODE(actbl->ehufco[rs], actbl->ehufsi[rs], diff)
+      i++;
+      bitmap <<= 1;
+    }
+  } else if (bitmap != 0) {
+    uint16_t block_abs[DCTSIZE2];
+    /* Compute and store absolute value of coefficients. */
+    int16x8_t abs_row1 = vabsq_s16(row1);
+    int16x8_t abs_row2 = vabsq_s16(row2);
+    int16x8_t abs_row3 = vabsq_s16(row3);
+    int16x8_t abs_row4 = vabsq_s16(row4);
+    int16x8_t abs_row5 = vabsq_s16(row5);
+    int16x8_t abs_row6 = vabsq_s16(row6);
+    int16x8_t abs_row7 = vabsq_s16(row7);
+    vst1q_u16(block_abs + 0 * DCTSIZE, vreinterpretq_u16_s16(abs_row0));
+    vst1q_u16(block_abs + 1 * DCTSIZE, vreinterpretq_u16_s16(abs_row1));
+    vst1q_u16(block_abs + 2 * DCTSIZE, vreinterpretq_u16_s16(abs_row2));
+    vst1q_u16(block_abs + 3 * DCTSIZE, vreinterpretq_u16_s16(abs_row3));
+    vst1q_u16(block_abs + 4 * DCTSIZE, vreinterpretq_u16_s16(abs_row4));
+    vst1q_u16(block_abs + 5 * DCTSIZE, vreinterpretq_u16_s16(abs_row5));
+    vst1q_u16(block_abs + 6 * DCTSIZE, vreinterpretq_u16_s16(abs_row6));
+    vst1q_u16(block_abs + 7 * DCTSIZE, vreinterpretq_u16_s16(abs_row7));
+    /* Compute diff bits (without nbits mask) and store. */
+    uint16x8_t row1_diff = veorq_u16(vreinterpretq_u16_s16(abs_row1),
+                                     vcltzq_s16(row1));
+    uint16x8_t row2_diff = veorq_u16(vreinterpretq_u16_s16(abs_row2),
+                                     vcltzq_s16(row2));
+    uint16x8_t row3_diff = veorq_u16(vreinterpretq_u16_s16(abs_row3),
+                                     vcltzq_s16(row3));
+    uint16x8_t row4_diff = veorq_u16(vreinterpretq_u16_s16(abs_row4),
+                                     vcltzq_s16(row4));
+    uint16x8_t row5_diff = veorq_u16(vreinterpretq_u16_s16(abs_row5),
+                                     vcltzq_s16(row5));
+    uint16x8_t row6_diff = veorq_u16(vreinterpretq_u16_s16(abs_row6),
+                                     vcltzq_s16(row6));
+    uint16x8_t row7_diff = veorq_u16(vreinterpretq_u16_s16(abs_row7),
+                                     vcltzq_s16(row7));
+    vst1q_u16(block_diff + 0 * DCTSIZE, row0_diff);
+    vst1q_u16(block_diff + 1 * DCTSIZE, row1_diff);
+    vst1q_u16(block_diff + 2 * DCTSIZE, row2_diff);
+    vst1q_u16(block_diff + 3 * DCTSIZE, row3_diff);
+    vst1q_u16(block_diff + 4 * DCTSIZE, row4_diff);
+    vst1q_u16(block_diff + 5 * DCTSIZE, row5_diff);
+    vst1q_u16(block_diff + 6 * DCTSIZE, row6_diff);
+    vst1q_u16(block_diff + 7 * DCTSIZE, row7_diff);
+
+    /* Same as above but must mask diff bits and compute nbits on demand. */
+    while (bitmap != 0) {
+      r = BUILTIN_CLZLL(bitmap);
+      i += r;
+      bitmap <<= r;
+      lz = BUILTIN_CLZ(block_abs[i]);
+      nbits = 32 - lz;
+      diff = ((unsigned int)block_diff[i] << lz) >> lz;
+      while (r > 15) {
+        /* If run length > 15, emit special run-length-16 codes. */
+        PUT_BITS(code_0xf0, size_0xf0)
+        r -= 16;
+      }
+      /* Emit Huffman symbol for run length / number of bits. (F.1.2.2.1) */
+      unsigned int rs = (r << 4) + nbits;
+      PUT_CODE(actbl->ehufco[rs], actbl->ehufsi[rs], diff)
+      i++;
+      bitmap <<= 1;
+    }
+  }
+
+  /* If the last coefficient(s) were zero, emit an end-of-block (EOB) code.
+   * The value of RS for the EOB code is 0.
+   */
+  if (i != 64) {
+    PUT_BITS(actbl->ehufco[0], actbl->ehufsi[0])
+  }
+
+  state_ptr->cur.put_buffer = put_buffer;
+  state_ptr->cur.free_bits = free_bits;
+
+  return buffer;
+}
diff --git a/media/libjpeg/simd/arm/aarch64/jsimd.c b/media/libjpeg/simd/arm/aarch64/jsimd.c
new file mode 100644
index 0000000000..358e1597b1
--- /dev/null
+++ b/media/libjpeg/simd/arm/aarch64/jsimd.c
@@ -0,0 +1,1053 @@
+/*
+ * jsimd_arm64.c
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2011, Nokia Corporation and/or its subsidiary(-ies).
+ * Copyright (C) 2009-2011, 2013-2014, 2016, 2018, 2020, 2022, D. R. Commander.
+ * Copyright (C) 2015-2016, 2018, 2022, Matthieu Darbois.
+ * Copyright (C) 2020, Arm Limited.
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library,
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * For conditions of distribution and use, see copyright notice in jsimdext.inc
+ *
+ * This file contains the interface between the "normal" portions
+ * of the library and the SIMD implementations when running on a
+ * 64-bit Arm architecture.
+ */
+
+#define JPEG_INTERNALS
+#include "../../../jinclude.h"
+#include "../../../jpeglib.h"
+#include "../../../jsimd.h"
+#include "../../../jdct.h"
+#include "../../../jsimddct.h"
+#include "../../jsimd.h"
+
+#include <ctype.h>
+
+#define JSIMD_FASTLD3  1
+#define JSIMD_FASTST3  2
+#define JSIMD_FASTTBL  4
+
+static THREAD_LOCAL unsigned int simd_support = ~0;
+static THREAD_LOCAL unsigned int simd_huffman = 1;
+static THREAD_LOCAL unsigned int simd_features = JSIMD_FASTLD3 |
+                                                 JSIMD_FASTST3 | JSIMD_FASTTBL;
+
+#if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
+
+#define SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT  (1024 * 1024)
+
+LOCAL(int)
+check_cpuinfo(char *buffer, const char *field, char *value)
+{
+  char *p;
+
+  if (*value == 0)
+    return 0;
+  if (strncmp(buffer, field, strlen(field)) != 0)
+    return 0;
+  buffer += strlen(field);
+  while (isspace(*buffer))
+    buffer++;
+
+  /* Check if 'value' is present in the buffer as a separate word */
+  while ((p = strstr(buffer, value))) {
+    if (p > buffer && !isspace(*(p - 1))) {
+      buffer++;
+      continue;
+    }
+    p += strlen(value);
+    if (*p != 0 && !isspace(*p)) {
+      buffer++;
+      continue;
+    }
+    return 1;
+  }
+  return 0;
+}
+
+LOCAL(int)
+parse_proc_cpuinfo(int bufsize)
+{
+  char *buffer = (char *)malloc(bufsize);
+  FILE *fd;
+
+  if (!buffer)
+    return 0;
+
+  fd = fopen("/proc/cpuinfo", "r");
+  if (fd) {
+    while (fgets(buffer, bufsize, fd)) {
+      if (!strchr(buffer, '\n') && !feof(fd)) {
+        /* "impossible" happened - insufficient size of the buffer! */
+        fclose(fd);
+        free(buffer);
+        return 0;
+      }
+      if (check_cpuinfo(buffer, "CPU part", "0xd03") ||
+          check_cpuinfo(buffer, "CPU part", "0xd07"))
+        /* The Cortex-A53 has a slow tbl implementation.  We can gain a few
+           percent speedup by disabling the use of that instruction.  The
+           speedup on Cortex-A57 is more subtle but still measurable. */
+        simd_features &= ~JSIMD_FASTTBL;
+      else if (check_cpuinfo(buffer, "CPU part", "0x0a1"))
+        /* The SIMD version of Huffman encoding is slower than the C version on
+           Cavium ThunderX.  Also, ld3 and st3 are abyssmally slow on that
+           CPU. */
+        simd_huffman = simd_features = 0;
+    }
+    fclose(fd);
+  }
+  free(buffer);
+  return 1;
+}
+
+#endif
+
+/*
+ * Check what SIMD accelerations are supported.
+ */
+
+/*
+ * Armv8 architectures support Neon extensions by default.
+ * It is no longer optional as it was with Armv7.
+ */
+
+
+LOCAL(void)
+init_simd(void)
+{
+#ifndef NO_GETENV
+  char env[2] = { 0 };
+#endif
+#if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
+  int bufsize = 1024; /* an initial guess for the line buffer size limit */
+#endif
+
+  if (simd_support != ~0U)
+    return;
+
+  simd_support = 0;
+
+  simd_support |= JSIMD_NEON;
+#if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
+  while (!parse_proc_cpuinfo(bufsize)) {
+    bufsize *= 2;
+    if (bufsize > SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT)
+      break;
+  }
+#endif
+
+#ifndef NO_GETENV
+  /* Force different settings through environment variables */
+  if (!GETENV_S(env, 2, "JSIMD_FORCENEON") && !strcmp(env, "1"))
+    simd_support = JSIMD_NEON;
+  if (!GETENV_S(env, 2, "JSIMD_FORCENONE") && !strcmp(env, "1"))
+    simd_support = 0;
+  if (!GETENV_S(env, 2, "JSIMD_NOHUFFENC") && !strcmp(env, "1"))
+    simd_huffman = 0;
+  if (!GETENV_S(env, 2, "JSIMD_FASTLD3") && !strcmp(env, "1"))
+    simd_features |= JSIMD_FASTLD3;
+  if (!GETENV_S(env, 2, "JSIMD_FASTLD3") && !strcmp(env, "0"))
+    simd_features &= ~JSIMD_FASTLD3;
+  if (!GETENV_S(env, 2, "JSIMD_FASTST3") && !strcmp(env, "1"))
+    simd_features |= JSIMD_FASTST3;
+  if (!GETENV_S(env, 2, "JSIMD_FASTST3") && !strcmp(env, "0"))
+    simd_features &= ~JSIMD_FASTST3;
+#endif
+}
+
+GLOBAL(int)
+jsimd_can_rgb_ycc(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_rgb_gray(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb565(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                      JSAMPIMAGE output_buf, JDIMENSION output_row,
+                      int num_rows)
+{
+  void (*neonfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+  switch (cinfo->in_color_space) {
+  case JCS_EXT_RGB:
+#ifndef NEON_INTRINSICS
+    if (simd_features & JSIMD_FASTLD3)
+#endif
+      neonfct = jsimd_extrgb_ycc_convert_neon;
+#ifndef NEON_INTRINSICS
+    else
+      neonfct = jsimd_extrgb_ycc_convert_neon_slowld3;
+#endif
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    neonfct = jsimd_extrgbx_ycc_convert_neon;
+    break;
+  case JCS_EXT_BGR:
+#ifndef NEON_INTRINSICS
+    if (simd_features & JSIMD_FASTLD3)
+#endif
+      neonfct = jsimd_extbgr_ycc_convert_neon;
+#ifndef NEON_INTRINSICS
+    else
+      neonfct = jsimd_extbgr_ycc_convert_neon_slowld3;
+#endif
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    neonfct = jsimd_extbgrx_ycc_convert_neon;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    neonfct = jsimd_extxbgr_ycc_convert_neon;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    neonfct = jsimd_extxrgb_ycc_convert_neon;
+    break;
+  default:
+#ifndef NEON_INTRINSICS
+    if (simd_features & JSIMD_FASTLD3)
+#endif
+      neonfct = jsimd_extrgb_ycc_convert_neon;
+#ifndef NEON_INTRINSICS
+    else
+      neonfct = jsimd_extrgb_ycc_convert_neon_slowld3;
+#endif
+    break;
+  }
+
+  neonfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
+
+GLOBAL(void)
+jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                       JSAMPIMAGE output_buf, JDIMENSION output_row,
+                       int num_rows)
+{
+  void (*neonfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+  switch (cinfo->in_color_space) {
+  case JCS_EXT_RGB:
+    neonfct = jsimd_extrgb_gray_convert_neon;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    neonfct = jsimd_extrgbx_gray_convert_neon;
+    break;
+  case JCS_EXT_BGR:
+    neonfct = jsimd_extbgr_gray_convert_neon;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    neonfct = jsimd_extbgrx_gray_convert_neon;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    neonfct = jsimd_extxbgr_gray_convert_neon;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    neonfct = jsimd_extxrgb_gray_convert_neon;
+    break;
+  default:
+    neonfct = jsimd_extrgb_gray_convert_neon;
+    break;
+  }
+
+  neonfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                      JDIMENSION input_row, JSAMPARRAY output_buf,
+                      int num_rows)
+{
+  void (*neonfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
+
+  switch (cinfo->out_color_space) {
+  case JCS_EXT_RGB:
+#ifndef NEON_INTRINSICS
+    if (simd_features & JSIMD_FASTST3)
+#endif
+      neonfct = jsimd_ycc_extrgb_convert_neon;
+#ifndef NEON_INTRINSICS
+    else
+      neonfct = jsimd_ycc_extrgb_convert_neon_slowst3;
+#endif
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    neonfct = jsimd_ycc_extrgbx_convert_neon;
+    break;
+  case JCS_EXT_BGR:
+#ifndef NEON_INTRINSICS
+    if (simd_features & JSIMD_FASTST3)
+#endif
+      neonfct = jsimd_ycc_extbgr_convert_neon;
+#ifndef NEON_INTRINSICS
+    else
+      neonfct = jsimd_ycc_extbgr_convert_neon_slowst3;
+#endif
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    neonfct = jsimd_ycc_extbgrx_convert_neon;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    neonfct = jsimd_ycc_extxbgr_convert_neon;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    neonfct = jsimd_ycc_extxrgb_convert_neon;
+    break;
+  default:
+#ifndef NEON_INTRINSICS
+    if (simd_features & JSIMD_FASTST3)
+#endif
+      neonfct = jsimd_ycc_extrgb_convert_neon;
+#ifndef NEON_INTRINSICS
+    else
+      neonfct = jsimd_ycc_extrgb_convert_neon_slowst3;
+#endif
+    break;
+  }
+
+  neonfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                         JDIMENSION input_row, JSAMPARRAY output_buf,
+                         int num_rows)
+{
+  jsimd_ycc_rgb565_convert_neon(cinfo->output_width, input_buf, input_row,
+                                output_buf, num_rows);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_downsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_downsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+                      JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  jsimd_h2v2_downsample_neon(cinfo->image_width, cinfo->max_v_samp_factor,
+                             compptr->v_samp_factor, compptr->width_in_blocks,
+                             input_data, output_data);
+}
+
+GLOBAL(void)
+jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+                      JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  jsimd_h2v1_downsample_neon(cinfo->image_width, cinfo->max_v_samp_factor,
+                             compptr->v_samp_factor, compptr->width_in_blocks,
+                             input_data, output_data);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                    JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  jsimd_h2v2_upsample_neon(cinfo->max_v_samp_factor, cinfo->output_width,
+                           input_data, output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                    JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  jsimd_h2v1_upsample_neon(cinfo->max_v_samp_factor, cinfo->output_width,
+                           input_data, output_data_ptr);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_fancy_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_fancy_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h1v2_fancy_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  jsimd_h2v2_fancy_upsample_neon(cinfo->max_v_samp_factor,
+                                 compptr->downsampled_width, input_data,
+                                 output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  jsimd_h2v1_fancy_upsample_neon(cinfo->max_v_samp_factor,
+                                 compptr->downsampled_width, input_data,
+                                 output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h1v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  jsimd_h1v2_fancy_upsample_neon(cinfo->max_v_samp_factor,
+                                 compptr->downsampled_width, input_data,
+                                 output_data_ptr);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_merged_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_merged_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                           JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+  void (*neonfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+
+  switch (cinfo->out_color_space) {
+    case JCS_EXT_RGB:
+      neonfct = jsimd_h2v2_extrgb_merged_upsample_neon;
+      break;
+    case JCS_EXT_RGBX:
+    case JCS_EXT_RGBA:
+      neonfct = jsimd_h2v2_extrgbx_merged_upsample_neon;
+      break;
+    case JCS_EXT_BGR:
+      neonfct = jsimd_h2v2_extbgr_merged_upsample_neon;
+      break;
+    case JCS_EXT_BGRX:
+    case JCS_EXT_BGRA:
+      neonfct = jsimd_h2v2_extbgrx_merged_upsample_neon;
+      break;
+    case JCS_EXT_XBGR:
+    case JCS_EXT_ABGR:
+      neonfct = jsimd_h2v2_extxbgr_merged_upsample_neon;
+      break;
+    case JCS_EXT_XRGB:
+    case JCS_EXT_ARGB:
+      neonfct = jsimd_h2v2_extxrgb_merged_upsample_neon;
+      break;
+    default:
+      neonfct = jsimd_h2v2_extrgb_merged_upsample_neon;
+      break;
+  }
+
+  neonfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+}
+
+GLOBAL(void)
+jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                           JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+  void (*neonfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+
+  switch (cinfo->out_color_space) {
+    case JCS_EXT_RGB:
+      neonfct = jsimd_h2v1_extrgb_merged_upsample_neon;
+      break;
+    case JCS_EXT_RGBX:
+    case JCS_EXT_RGBA:
+      neonfct = jsimd_h2v1_extrgbx_merged_upsample_neon;
+      break;
+    case JCS_EXT_BGR:
+      neonfct = jsimd_h2v1_extbgr_merged_upsample_neon;
+      break;
+    case JCS_EXT_BGRX:
+    case JCS_EXT_BGRA:
+      neonfct = jsimd_h2v1_extbgrx_merged_upsample_neon;
+      break;
+    case JCS_EXT_XBGR:
+    case JCS_EXT_ABGR:
+      neonfct = jsimd_h2v1_extxbgr_merged_upsample_neon;
+      break;
+    case JCS_EXT_XRGB:
+    case JCS_EXT_ARGB:
+      neonfct = jsimd_h2v1_extxrgb_merged_upsample_neon;
+      break;
+    default:
+      neonfct = jsimd_h2v1_extrgb_merged_upsample_neon;
+      break;
+  }
+
+  neonfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+}
+
+GLOBAL(int)
+jsimd_can_convsamp(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_convsamp_float(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_convsamp(JSAMPARRAY sample_data, JDIMENSION start_col,
+               DCTELEM *workspace)
+{
+  jsimd_convsamp_neon(sample_data, start_col, workspace);
+}
+
+GLOBAL(void)
+jsimd_convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col,
+                     FAST_FLOAT *workspace)
+{
+}
+
+GLOBAL(int)
+jsimd_can_fdct_islow(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_ifast(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_float(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_fdct_islow(DCTELEM *data)
+{
+  jsimd_fdct_islow_neon(data);
+}
+
+GLOBAL(void)
+jsimd_fdct_ifast(DCTELEM *data)
+{
+  jsimd_fdct_ifast_neon(data);
+}
+
+GLOBAL(void)
+jsimd_fdct_float(FAST_FLOAT *data)
+{
+}
+
+GLOBAL(int)
+jsimd_can_quantize(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_quantize_float(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace)
+{
+  jsimd_quantize_neon(coef_block, divisors, workspace);
+}
+
+GLOBAL(void)
+jsimd_quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors,
+                     FAST_FLOAT *workspace)
+{
+}
+
+GLOBAL(int)
+jsimd_can_idct_2x2(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_4x4(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_idct_2x2(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+               JCOEFPTR coef_block, JSAMPARRAY output_buf,
+               JDIMENSION output_col)
+{
+  jsimd_idct_2x2_neon(compptr->dct_table, coef_block, output_buf, output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_4x4(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+               JCOEFPTR coef_block, JSAMPARRAY output_buf,
+               JDIMENSION output_col)
+{
+  jsimd_idct_4x4_neon(compptr->dct_table, coef_block, output_buf, output_col);
+}
+
+GLOBAL(int)
+jsimd_can_idct_islow(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_ifast(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(IFAST_MULT_TYPE) != 2)
+    return 0;
+  if (IFAST_SCALE_BITS != 2)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_float(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+  jsimd_idct_islow_neon(compptr->dct_table, coef_block, output_buf,
+                        output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+  jsimd_idct_ifast_neon(compptr->dct_table, coef_block, output_buf,
+                        output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_float(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+}
+
+GLOBAL(int)
+jsimd_can_huff_encode_one_block(void)
+{
+  init_simd();
+
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_NEON && simd_huffman)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(JOCTET *)
+jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
+                            int last_dc_val, c_derived_tbl *dctbl,
+                            c_derived_tbl *actbl)
+{
+#ifndef NEON_INTRINSICS
+  if (simd_features & JSIMD_FASTTBL)
+#endif
+    return jsimd_huff_encode_one_block_neon(state, buffer, block, last_dc_val,
+                                            dctbl, actbl);
+#ifndef NEON_INTRINSICS
+  else
+    return jsimd_huff_encode_one_block_neon_slowtbl(state, buffer, block,
+                                                    last_dc_val, dctbl, actbl);
+#endif
+}
+
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_first_prepare(void)
+{
+  init_simd();
+
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (SIZEOF_SIZE_T != 8)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_encode_mcu_AC_first_prepare(const JCOEF *block,
+                                  const int *jpeg_natural_order_start, int Sl,
+                                  int Al, UJCOEF *values, size_t *zerobits)
+{
+  jsimd_encode_mcu_AC_first_prepare_neon(block, jpeg_natural_order_start,
+                                         Sl, Al, values, zerobits);
+}
+
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_refine_prepare(void)
+{
+  init_simd();
+
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (SIZEOF_SIZE_T != 8)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
+                                   const int *jpeg_natural_order_start, int Sl,
+                                   int Al, UJCOEF *absvalues, size_t *bits)
+{
+  return jsimd_encode_mcu_AC_refine_prepare_neon(block,
+                                                 jpeg_natural_order_start,
+                                                 Sl, Al, absvalues, bits);
+}
diff --git a/media/libjpeg/simd/arm/aarch64/jsimd_neon.S b/media/libjpeg/simd/arm/aarch64/jsimd_neon.S
new file mode 100644
index 0000000000..738a4f0658
--- /dev/null
+++ b/media/libjpeg/simd/arm/aarch64/jsimd_neon.S
@@ -0,0 +1,2254 @@
+/*
+ * Armv8 Neon optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2009-2011, Nokia Corporation and/or its subsidiary(-ies).
+ *                          All Rights Reserved.
+ * Author:  Siarhei Siamashka <siarhei.siamashka@nokia.com>
+ * Copyright (C) 2013-2014, Linaro Limited.  All Rights Reserved.
+ * Author:  Ragesh Radhakrishnan <ragesh.r@linaro.org>
+ * Copyright (C) 2014-2016, 2020, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2015-2016, 2018, Matthieu Darbois.  All Rights Reserved.
+ * Copyright (C) 2016, Siarhei Siamashka.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack, "", %progbits  /* mark stack as non-executable */
+#endif
+
+#if defined(__APPLE__)
+.section __DATA, __const
+#elif defined(_WIN32)
+.section .rdata
+#else
+.section .rodata, "a", %progbits
+#endif
+
+/* Constants for jsimd_idct_islow_neon() */
+
+#define F_0_298   2446  /* FIX(0.298631336) */
+#define F_0_390   3196  /* FIX(0.390180644) */
+#define F_0_541   4433  /* FIX(0.541196100) */
+#define F_0_765   6270  /* FIX(0.765366865) */
+#define F_0_899   7373  /* FIX(0.899976223) */
+#define F_1_175   9633  /* FIX(1.175875602) */
+#define F_1_501  12299  /* FIX(1.501321110) */
+#define F_1_847  15137  /* FIX(1.847759065) */
+#define F_1_961  16069  /* FIX(1.961570560) */
+#define F_2_053  16819  /* FIX(2.053119869) */
+#define F_2_562  20995  /* FIX(2.562915447) */
+#define F_3_072  25172  /* FIX(3.072711026) */
+
+.balign 16
+Ljsimd_idct_islow_neon_consts:
+  .short F_0_298
+  .short -F_0_390
+  .short F_0_541
+  .short F_0_765
+  .short - F_0_899
+  .short F_1_175
+  .short F_1_501
+  .short - F_1_847
+  .short - F_1_961
+  .short F_2_053
+  .short - F_2_562
+  .short F_3_072
+  .short 0          /* padding */
+  .short 0
+  .short 0
+  .short 0
+
+#undef F_0_298
+#undef F_0_390
+#undef F_0_541
+#undef F_0_765
+#undef F_0_899
+#undef F_1_175
+#undef F_1_501
+#undef F_1_847
+#undef F_1_961
+#undef F_2_053
+#undef F_2_562
+#undef F_3_072
+
+/* Constants for jsimd_ycc_*_neon() */
+
+.balign 16
+Ljsimd_ycc_rgb_neon_consts:
+  .short 0,      0,     0,      0
+  .short 22971, -11277, -23401, 29033
+  .short -128,  -128,   -128,   -128
+  .short -128,  -128,   -128,   -128
+
+/* Constants for jsimd_*_ycc_neon() */
+
+.balign 16
+Ljsimd_rgb_ycc_neon_consts:
+  .short 19595, 38470, 7471, 11059
+  .short 21709, 32768, 27439, 5329
+  .short 32767, 128, 32767, 128
+  .short 32767, 128, 32767, 128
+
+/* Constants for jsimd_fdct_islow_neon() */
+
+#define F_0_298   2446  /* FIX(0.298631336) */
+#define F_0_390   3196  /* FIX(0.390180644) */
+#define F_0_541   4433  /* FIX(0.541196100) */
+#define F_0_765   6270  /* FIX(0.765366865) */
+#define F_0_899   7373  /* FIX(0.899976223) */
+#define F_1_175   9633  /* FIX(1.175875602) */
+#define F_1_501  12299  /* FIX(1.501321110) */
+#define F_1_847  15137  /* FIX(1.847759065) */
+#define F_1_961  16069  /* FIX(1.961570560) */
+#define F_2_053  16819  /* FIX(2.053119869) */
+#define F_2_562  20995  /* FIX(2.562915447) */
+#define F_3_072  25172  /* FIX(3.072711026) */
+
+.balign 16
+Ljsimd_fdct_islow_neon_consts:
+  .short F_0_298
+  .short -F_0_390
+  .short F_0_541
+  .short F_0_765
+  .short - F_0_899
+  .short F_1_175
+  .short F_1_501
+  .short - F_1_847
+  .short - F_1_961
+  .short F_2_053
+  .short - F_2_562
+  .short F_3_072
+  .short 0          /* padding */
+  .short 0
+  .short 0
+  .short 0
+
+#undef F_0_298
+#undef F_0_390
+#undef F_0_541
+#undef F_0_765
+#undef F_0_899
+#undef F_1_175
+#undef F_1_501
+#undef F_1_847
+#undef F_1_961
+#undef F_2_053
+#undef F_2_562
+#undef F_3_072
+
+/* Constants for jsimd_huff_encode_one_block_neon() */
+
+.balign 16
+Ljsimd_huff_encode_one_block_neon_consts:
+    .byte 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, \
+          0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80
+    .byte    0,   1,   2,   3,  16,  17,  32,  33, \
+            18,  19,   4,   5,   6,   7,  20,  21  /* L0 => L3 : 4 lines OK */
+    .byte   34,  35,  48,  49, 255, 255,  50,  51, \
+            36,  37,  22,  23,   8,   9,  10,  11  /* L0 => L3 : 4 lines OK */
+    .byte    8,   9,  22,  23,  36,  37,  50,  51, \
+           255, 255, 255, 255, 255, 255,  52,  53  /* L1 => L4 : 4 lines OK */
+    .byte   54,  55,  40,  41,  26,  27,  12,  13, \
+            14,  15,  28,  29,  42,  43,  56,  57  /* L0 => L3 : 4 lines OK */
+    .byte    6,   7,  20,  21,  34,  35,  48,  49, \
+            50,  51,  36,  37,  22,  23,   8,   9  /* L4 => L7 : 4 lines OK */
+    .byte   42,  43,  28,  29,  14,  15,  30,  31, \
+            44,  45,  58,  59, 255, 255, 255, 255  /* L1 => L4 : 4 lines OK */
+    .byte  255, 255, 255, 255,  56,  57,  42,  43, \
+            28,  29,  14,  15,  30,  31,  44,  45  /* L3 => L6 : 4 lines OK */
+    .byte   26,  27,  40,  41,  42,  43,  28,  29, \
+            14,  15,  30,  31,  44,  45,  46,  47  /* L5 => L7 : 3 lines OK */
+    .byte  255, 255, 255, 255,   0,   1, 255, 255, \
+           255, 255, 255, 255, 255, 255, 255, 255  /* L4 : 1 lines OK */
+    .byte  255, 255, 255, 255, 255, 255, 255, 255, \
+             0,   1,  16,  17,   2,   3, 255, 255  /* L5 => L6 : 2 lines OK */
+    .byte  255, 255, 255, 255, 255, 255, 255, 255, \
+           255, 255, 255, 255,   8,   9,  22,  23  /* L5 => L6 : 2 lines OK */
+    .byte    4,   5,   6,   7, 255, 255, 255, 255, \
+           255, 255, 255, 255, 255, 255, 255, 255  /* L7 : 1 line OK */
+
+.text
+
+
+/*****************************************************************************/
+
+/* Supplementary macro for setting function attributes */
+.macro asm_function fname
+#ifdef __APPLE__
+    .private_extern _\fname
+    .globl _\fname
+_\fname:
+#else
+    .global \fname
+#ifdef __ELF__
+    .hidden \fname
+    .type \fname, %function
+#endif
+\fname:
+#endif
+.endm
+
+/* Get symbol location */
+.macro get_symbol_loc reg, symbol
+#ifdef __APPLE__
+    adrp            \reg, \symbol@PAGE
+    add             \reg, \reg, \symbol@PAGEOFF
+#else
+    adrp            \reg, \symbol
+    add             \reg, \reg, :lo12:\symbol
+#endif
+.endm
+
+.macro transpose_8x8 l0, l1, l2, l3, l4, l5, l6, l7, t0, t1, t2, t3
+    trn1            \t0\().8h, \l0\().8h, \l1\().8h
+    trn1            \t1\().8h, \l2\().8h, \l3\().8h
+    trn1            \t2\().8h, \l4\().8h, \l5\().8h
+    trn1            \t3\().8h, \l6\().8h, \l7\().8h
+    trn2            \l1\().8h, \l0\().8h, \l1\().8h
+    trn2            \l3\().8h, \l2\().8h, \l3\().8h
+    trn2            \l5\().8h, \l4\().8h, \l5\().8h
+    trn2            \l7\().8h, \l6\().8h, \l7\().8h
+
+    trn1            \l4\().4s, \t2\().4s, \t3\().4s
+    trn2            \t3\().4s, \t2\().4s, \t3\().4s
+    trn1            \t2\().4s, \t0\().4s, \t1\().4s
+    trn2            \l2\().4s, \t0\().4s, \t1\().4s
+    trn1            \t0\().4s, \l1\().4s, \l3\().4s
+    trn2            \l3\().4s, \l1\().4s, \l3\().4s
+    trn2            \t1\().4s, \l5\().4s, \l7\().4s
+    trn1            \l5\().4s, \l5\().4s, \l7\().4s
+
+    trn2            \l6\().2d, \l2\().2d, \t3\().2d
+    trn1            \l0\().2d, \t2\().2d, \l4\().2d
+    trn1            \l1\().2d, \t0\().2d, \l5\().2d
+    trn2            \l7\().2d, \l3\().2d, \t1\().2d
+    trn1            \l2\().2d, \l2\().2d, \t3\().2d
+    trn2            \l4\().2d, \t2\().2d, \l4\().2d
+    trn1            \l3\().2d, \l3\().2d, \t1\().2d
+    trn2            \l5\().2d, \t0\().2d, \l5\().2d
+.endm
+
+
+#define CENTERJSAMPLE  128
+
+/*****************************************************************************/
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients.
+ *
+ * GLOBAL(void)
+ * jsimd_idct_islow_neon(void *dct_table, JCOEFPTR coef_block,
+ *                       JSAMPARRAY output_buf, JDIMENSION output_col)
+ */
+
+#define CONST_BITS  13
+#define PASS1_BITS  2
+
+#define XFIX_P_0_298  v0.h[0]
+#define XFIX_N_0_390  v0.h[1]
+#define XFIX_P_0_541  v0.h[2]
+#define XFIX_P_0_765  v0.h[3]
+#define XFIX_N_0_899  v0.h[4]
+#define XFIX_P_1_175  v0.h[5]
+#define XFIX_P_1_501  v0.h[6]
+#define XFIX_N_1_847  v0.h[7]
+#define XFIX_N_1_961  v1.h[0]
+#define XFIX_P_2_053  v1.h[1]
+#define XFIX_N_2_562  v1.h[2]
+#define XFIX_P_3_072  v1.h[3]
+
+asm_function jsimd_idct_islow_neon
+    DCT_TABLE       .req x0
+    COEF_BLOCK      .req x1
+    OUTPUT_BUF      .req x2
+    OUTPUT_COL      .req x3
+    TMP1            .req x0
+    TMP2            .req x1
+    TMP3            .req x9
+    TMP4            .req x10
+    TMP5            .req x11
+    TMP6            .req x12
+    TMP7            .req x13
+    TMP8            .req x14
+
+    /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
+       guarantee that the upper (unused) 32 bits of x3 are valid.  This
+       instruction ensures that those bits are set to zero. */
+    uxtw x3, w3
+
+    sub             sp, sp, #64
+    get_symbol_loc  x15, Ljsimd_idct_islow_neon_consts
+    mov             x10, sp
+    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x10], #32
+    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x10], #32
+    ld1             {v0.8h, v1.8h}, [x15]
+    ld1             {v2.8h, v3.8h, v4.8h, v5.8h}, [COEF_BLOCK], #64
+    ld1             {v18.8h, v19.8h, v20.8h, v21.8h}, [DCT_TABLE], #64
+    ld1             {v6.8h, v7.8h, v8.8h, v9.8h}, [COEF_BLOCK], #64
+    ld1             {v22.8h, v23.8h, v24.8h, v25.8h}, [DCT_TABLE], #64
+
+    cmeq            v16.8h, v3.8h, #0
+    cmeq            v26.8h, v4.8h, #0
+    cmeq            v27.8h, v5.8h, #0
+    cmeq            v28.8h, v6.8h, #0
+    cmeq            v29.8h, v7.8h, #0
+    cmeq            v30.8h, v8.8h, #0
+    cmeq            v31.8h, v9.8h, #0
+
+    and             v10.16b, v16.16b, v26.16b
+    and             v11.16b, v27.16b, v28.16b
+    and             v12.16b, v29.16b, v30.16b
+    and             v13.16b, v31.16b, v10.16b
+    and             v14.16b, v11.16b, v12.16b
+    mul             v2.8h, v2.8h, v18.8h
+    and             v15.16b, v13.16b, v14.16b
+    shl             v10.8h, v2.8h, #(PASS1_BITS)
+    sqxtn           v16.8b, v15.8h
+    mov             TMP1, v16.d[0]
+    mvn             TMP2, TMP1
+
+    cbnz            TMP2, 2f
+    /* case all AC coeffs are zeros */
+    dup             v2.2d, v10.d[0]
+    dup             v6.2d, v10.d[1]
+    mov             v3.16b, v2.16b
+    mov             v7.16b, v6.16b
+    mov             v4.16b, v2.16b
+    mov             v8.16b, v6.16b
+    mov             v5.16b, v2.16b
+    mov             v9.16b, v6.16b
+1:
+    /* for this transpose, we should organise data like this:
+     * 00, 01, 02, 03, 40, 41, 42, 43
+     * 10, 11, 12, 13, 50, 51, 52, 53
+     * 20, 21, 22, 23, 60, 61, 62, 63
+     * 30, 31, 32, 33, 70, 71, 72, 73
+     * 04, 05, 06, 07, 44, 45, 46, 47
+     * 14, 15, 16, 17, 54, 55, 56, 57
+     * 24, 25, 26, 27, 64, 65, 66, 67
+     * 34, 35, 36, 37, 74, 75, 76, 77
+     */
+    trn1            v28.8h, v2.8h, v3.8h
+    trn1            v29.8h, v4.8h, v5.8h
+    trn1            v30.8h, v6.8h, v7.8h
+    trn1            v31.8h, v8.8h, v9.8h
+    trn2            v16.8h, v2.8h, v3.8h
+    trn2            v17.8h, v4.8h, v5.8h
+    trn2            v18.8h, v6.8h, v7.8h
+    trn2            v19.8h, v8.8h, v9.8h
+    trn1            v2.4s, v28.4s, v29.4s
+    trn1            v6.4s, v30.4s, v31.4s
+    trn1            v3.4s, v16.4s, v17.4s
+    trn1            v7.4s, v18.4s, v19.4s
+    trn2            v4.4s, v28.4s, v29.4s
+    trn2            v8.4s, v30.4s, v31.4s
+    trn2            v5.4s, v16.4s, v17.4s
+    trn2            v9.4s, v18.4s, v19.4s
+    /* Even part: reverse the even part of the forward DCT. */
+    add             v18.8h, v4.8h, v8.8h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
+    add             v22.8h, v2.8h, v6.8h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
+    smull2          v19.4s, v18.8h, XFIX_P_0_541   /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
+    sub             v26.8h, v2.8h, v6.8h           /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
+    smull           v18.4s, v18.4h, XFIX_P_0_541   /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
+    sshll2          v23.4s, v22.8h, #(CONST_BITS)  /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
+    mov             v21.16b, v19.16b               /* tmp3 = z1 */
+    mov             v20.16b, v18.16b               /* tmp3 = z1 */
+    smlal2          v19.4s, v8.8h, XFIX_N_1_847    /* tmp2h tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
+    smlal           v18.4s, v8.4h, XFIX_N_1_847    /* tmp2l tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
+    sshll2          v27.4s, v26.8h, #(CONST_BITS)  /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
+    smlal2          v21.4s, v4.8h, XFIX_P_0_765    /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
+    smlal           v20.4s, v4.4h, XFIX_P_0_765    /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
+    sshll           v22.4s, v22.4h, #(CONST_BITS)  /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
+    sshll           v26.4s, v26.4h, #(CONST_BITS)  /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
+    add             v2.4s, v22.4s, v20.4s          /* tmp10l tmp10 = tmp0 + tmp3; */
+    sub             v6.4s, v22.4s, v20.4s          /* tmp13l tmp13 = tmp0 - tmp3; */
+    add             v8.4s, v26.4s, v18.4s          /* tmp11l tmp11 = tmp1 + tmp2; */
+    sub             v4.4s, v26.4s, v18.4s          /* tmp12l tmp12 = tmp1 - tmp2; */
+    add             v28.4s, v23.4s, v21.4s         /* tmp10h tmp10 = tmp0 + tmp3; */
+    sub             v31.4s, v23.4s, v21.4s         /* tmp13h tmp13 = tmp0 - tmp3; */
+    add             v29.4s, v27.4s, v19.4s         /* tmp11h tmp11 = tmp1 + tmp2; */
+    sub             v30.4s, v27.4s, v19.4s         /* tmp12h tmp12 = tmp1 - tmp2; */
+
+    /* Odd part per figure 8; the matrix is unitary and hence its
+     * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
+     */
+
+    add             v22.8h, v9.8h, v5.8h    /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
+    add             v24.8h, v7.8h, v3.8h    /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
+    add             v18.8h, v9.8h, v3.8h    /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
+    add             v20.8h, v7.8h, v5.8h    /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
+    add             v26.8h, v22.8h, v24.8h  /* z5 = z3 + z4 */
+
+    smull2          v11.4s, v9.8h, XFIX_P_0_298   /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
+    smull2          v13.4s, v7.8h, XFIX_P_2_053   /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
+    smull2          v15.4s, v5.8h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
+    smull2          v17.4s, v3.8h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
+    smull2          v27.4s, v26.8h, XFIX_P_1_175  /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
+    smull2          v23.4s, v22.8h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
+    smull2          v25.4s, v24.8h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
+    smull2          v19.4s, v18.8h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
+    smull2          v21.4s, v20.8h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, -FIX_2_562915447) */
+
+    smull           v10.4s, v9.4h, XFIX_P_0_298   /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
+    smull           v12.4s, v7.4h, XFIX_P_2_053   /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
+    smull           v14.4s, v5.4h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
+    smull           v16.4s, v3.4h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
+    smull           v26.4s, v26.4h, XFIX_P_1_175  /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
+    smull           v22.4s, v22.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
+    smull           v24.4s, v24.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
+    smull           v18.4s, v18.4h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
+    smull           v20.4s, v20.4h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, -FIX_2_562915447) */
+
+    add             v23.4s, v23.4s, v27.4s  /* z3 += z5 */
+    add             v22.4s, v22.4s, v26.4s  /* z3 += z5 */
+    add             v25.4s, v25.4s, v27.4s  /* z4 += z5 */
+    add             v24.4s, v24.4s, v26.4s  /* z4 += z5 */
+
+    add             v11.4s, v11.4s, v19.4s  /* tmp0 += z1 */
+    add             v10.4s, v10.4s, v18.4s  /* tmp0 += z1 */
+    add             v13.4s, v13.4s, v21.4s  /* tmp1 += z2 */
+    add             v12.4s, v12.4s, v20.4s  /* tmp1 += z2 */
+    add             v15.4s, v15.4s, v21.4s  /* tmp2 += z2 */
+    add             v14.4s, v14.4s, v20.4s  /* tmp2 += z2 */
+    add             v17.4s, v17.4s, v19.4s  /* tmp3 += z1 */
+    add             v16.4s, v16.4s, v18.4s  /* tmp3 += z1 */
+
+    add             v11.4s, v11.4s, v23.4s  /* tmp0 += z3 */
+    add             v10.4s, v10.4s, v22.4s  /* tmp0 += z3 */
+    add             v13.4s, v13.4s, v25.4s  /* tmp1 += z4 */
+    add             v12.4s, v12.4s, v24.4s  /* tmp1 += z4 */
+    add             v17.4s, v17.4s, v25.4s  /* tmp3 += z4 */
+    add             v16.4s, v16.4s, v24.4s  /* tmp3 += z4 */
+    add             v15.4s, v15.4s, v23.4s  /* tmp2 += z3 */
+    add             v14.4s, v14.4s, v22.4s  /* tmp2 += z3 */
+
+    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
+
+    add             v18.4s, v2.4s, v16.4s   /* tmp10 + tmp3 */
+    add             v19.4s, v28.4s, v17.4s  /* tmp10 + tmp3 */
+    sub             v20.4s, v2.4s, v16.4s   /* tmp10 - tmp3 */
+    sub             v21.4s, v28.4s, v17.4s  /* tmp10 - tmp3 */
+    add             v22.4s, v8.4s, v14.4s   /* tmp11 + tmp2 */
+    add             v23.4s, v29.4s, v15.4s  /* tmp11 + tmp2 */
+    sub             v24.4s, v8.4s, v14.4s   /* tmp11 - tmp2 */
+    sub             v25.4s, v29.4s, v15.4s  /* tmp11 - tmp2 */
+    add             v26.4s, v4.4s, v12.4s   /* tmp12 + tmp1 */
+    add             v27.4s, v30.4s, v13.4s  /* tmp12 + tmp1 */
+    sub             v28.4s, v4.4s, v12.4s   /* tmp12 - tmp1 */
+    sub             v29.4s, v30.4s, v13.4s  /* tmp12 - tmp1 */
+    add             v14.4s, v6.4s, v10.4s   /* tmp13 + tmp0 */
+    add             v15.4s, v31.4s, v11.4s  /* tmp13 + tmp0 */
+    sub             v16.4s, v6.4s, v10.4s   /* tmp13 - tmp0 */
+    sub             v17.4s, v31.4s, v11.4s  /* tmp13 - tmp0 */
+
+    shrn            v2.4h, v18.4s, #16  /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3) */
+    shrn            v9.4h, v20.4s, #16  /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3) */
+    shrn            v3.4h, v22.4s, #16  /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) */
+    shrn            v8.4h, v24.4s, #16  /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) */
+    shrn            v4.4h, v26.4s, #16  /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3) */
+    shrn            v7.4h, v28.4s, #16  /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) */
+    shrn            v5.4h, v14.4s, #16  /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3) */
+    shrn            v6.4h, v16.4s, #16  /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3) */
+    shrn2           v2.8h, v19.4s, #16  /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3) */
+    shrn2           v9.8h, v21.4s, #16  /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3) */
+    shrn2           v3.8h, v23.4s, #16  /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) */
+    shrn2           v8.8h, v25.4s, #16  /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) */
+    shrn2           v4.8h, v27.4s, #16  /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3) */
+    shrn2           v7.8h, v29.4s, #16  /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) */
+    shrn2           v5.8h, v15.4s, #16  /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3) */
+    shrn2           v6.8h, v17.4s, #16  /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3) */
+    movi            v0.16b, #(CENTERJSAMPLE)
+    /* Prepare pointers (dual-issue with Neon instructions) */
+      ldp             TMP1, TMP2, [OUTPUT_BUF], 16
+    sqrshrn         v28.8b, v2.8h, #(CONST_BITS + PASS1_BITS + 3 - 16)
+      ldp             TMP3, TMP4, [OUTPUT_BUF], 16
+    sqrshrn         v29.8b, v3.8h, #(CONST_BITS + PASS1_BITS + 3 - 16)
+      add             TMP1, TMP1, OUTPUT_COL
+    sqrshrn         v30.8b, v4.8h, #(CONST_BITS + PASS1_BITS + 3 - 16)
+      add             TMP2, TMP2, OUTPUT_COL
+    sqrshrn         v31.8b, v5.8h, #(CONST_BITS + PASS1_BITS + 3 - 16)
+      add             TMP3, TMP3, OUTPUT_COL
+    sqrshrn2        v28.16b, v6.8h, #(CONST_BITS + PASS1_BITS + 3 - 16)
+      add             TMP4, TMP4, OUTPUT_COL
+    sqrshrn2        v29.16b, v7.8h, #(CONST_BITS + PASS1_BITS + 3 - 16)
+      ldp             TMP5, TMP6, [OUTPUT_BUF], 16
+    sqrshrn2        v30.16b, v8.8h, #(CONST_BITS + PASS1_BITS + 3 - 16)
+      ldp             TMP7, TMP8, [OUTPUT_BUF], 16
+    sqrshrn2        v31.16b, v9.8h, #(CONST_BITS + PASS1_BITS + 3 - 16)
+      add             TMP5, TMP5, OUTPUT_COL
+    add             v16.16b, v28.16b, v0.16b
+      add             TMP6, TMP6, OUTPUT_COL
+    add             v18.16b, v29.16b, v0.16b
+      add             TMP7, TMP7, OUTPUT_COL
+    add             v20.16b, v30.16b, v0.16b
+      add             TMP8, TMP8, OUTPUT_COL
+    add             v22.16b, v31.16b, v0.16b
+
+    /* Transpose the final 8-bit samples */
+    trn1            v28.16b, v16.16b, v18.16b
+    trn1            v30.16b, v20.16b, v22.16b
+    trn2            v29.16b, v16.16b, v18.16b
+    trn2            v31.16b, v20.16b, v22.16b
+
+    trn1            v16.8h, v28.8h, v30.8h
+    trn2            v18.8h, v28.8h, v30.8h
+    trn1            v20.8h, v29.8h, v31.8h
+    trn2            v22.8h, v29.8h, v31.8h
+
+    uzp1            v28.4s, v16.4s, v18.4s
+    uzp2            v30.4s, v16.4s, v18.4s
+    uzp1            v29.4s, v20.4s, v22.4s
+    uzp2            v31.4s, v20.4s, v22.4s
+
+    /* Store results to the output buffer */
+    st1             {v28.d}[0], [TMP1]
+    st1             {v29.d}[0], [TMP2]
+    st1             {v28.d}[1], [TMP3]
+    st1             {v29.d}[1], [TMP4]
+    st1             {v30.d}[0], [TMP5]
+    st1             {v31.d}[0], [TMP6]
+    st1             {v30.d}[1], [TMP7]
+    st1             {v31.d}[1], [TMP8]
+    ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], #32
+    ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], #32
+    blr             x30
+
+.balign 16
+2:
+    mul             v3.8h, v3.8h, v19.8h
+    mul             v4.8h, v4.8h, v20.8h
+    mul             v5.8h, v5.8h, v21.8h
+    add             TMP4, xzr, TMP2, LSL #32
+    mul             v6.8h, v6.8h, v22.8h
+    mul             v7.8h, v7.8h, v23.8h
+    adds            TMP3, xzr, TMP2, LSR #32
+    mul             v8.8h, v8.8h, v24.8h
+    mul             v9.8h, v9.8h, v25.8h
+    b.ne            3f
+    /* Right AC coef is zero */
+    dup             v15.2d, v10.d[1]
+    /* Even part: reverse the even part of the forward DCT. */
+    add             v18.4h, v4.4h, v8.4h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
+    add             v22.4h, v2.4h, v6.4h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
+    sub             v26.4h, v2.4h, v6.4h           /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
+    smull           v18.4s, v18.4h, XFIX_P_0_541   /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
+    sshll           v22.4s, v22.4h, #(CONST_BITS)  /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
+    mov             v20.16b, v18.16b               /* tmp3 = z1 */
+    sshll           v26.4s, v26.4h, #(CONST_BITS)  /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
+    smlal           v18.4s, v8.4h, XFIX_N_1_847    /* tmp2l tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
+    smlal           v20.4s, v4.4h, XFIX_P_0_765    /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
+    add             v2.4s, v22.4s, v20.4s          /* tmp10l tmp10 = tmp0 + tmp3; */
+    sub             v6.4s, v22.4s, v20.4s          /* tmp13l tmp13 = tmp0 - tmp3; */
+    add             v8.4s, v26.4s, v18.4s          /* tmp11l tmp11 = tmp1 + tmp2; */
+    sub             v4.4s, v26.4s, v18.4s          /* tmp12l tmp12 = tmp1 - tmp2; */
+
+    /* Odd part per figure 8; the matrix is unitary and hence its
+     * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
+     */
+
+    add             v22.4h, v9.4h, v5.4h    /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
+    add             v24.4h, v7.4h, v3.4h    /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
+    add             v18.4h, v9.4h, v3.4h    /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
+    add             v20.4h, v7.4h, v5.4h    /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
+    add             v26.4h, v22.4h, v24.4h  /* z5 = z3 + z4 */
+
+    smull           v10.4s, v9.4h, XFIX_P_0_298   /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
+    smull           v12.4s, v7.4h, XFIX_P_2_053   /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
+    smull           v14.4s, v5.4h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
+    smull           v16.4s, v3.4h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
+    smull           v26.4s, v26.4h, XFIX_P_1_175  /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
+    smull           v22.4s, v22.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
+    smull           v24.4s, v24.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
+    smull           v18.4s, v18.4h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
+    smull           v20.4s, v20.4h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, -FIX_2_562915447) */
+
+    add             v22.4s, v22.4s, v26.4s  /* z3 += z5 */
+    add             v24.4s, v24.4s, v26.4s  /* z4 += z5 */
+
+    add             v10.4s, v10.4s, v18.4s  /* tmp0 += z1 */
+    add             v12.4s, v12.4s, v20.4s  /* tmp1 += z2 */
+    add             v14.4s, v14.4s, v20.4s  /* tmp2 += z2 */
+    add             v16.4s, v16.4s, v18.4s  /* tmp3 += z1 */
+
+    add             v10.4s, v10.4s, v22.4s  /* tmp0 += z3 */
+    add             v12.4s, v12.4s, v24.4s  /* tmp1 += z4 */
+    add             v16.4s, v16.4s, v24.4s  /* tmp3 += z4 */
+    add             v14.4s, v14.4s, v22.4s  /* tmp2 += z3 */
+
+    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
+
+    add             v18.4s, v2.4s, v16.4s  /* tmp10 + tmp3 */
+    sub             v20.4s, v2.4s, v16.4s  /* tmp10 - tmp3 */
+    add             v22.4s, v8.4s, v14.4s  /* tmp11 + tmp2 */
+    sub             v24.4s, v8.4s, v14.4s  /* tmp11 - tmp2 */
+    add             v26.4s, v4.4s, v12.4s  /* tmp12 + tmp1 */
+    sub             v28.4s, v4.4s, v12.4s  /* tmp12 - tmp1 */
+    add             v14.4s, v6.4s, v10.4s  /* tmp13 + tmp0 */
+    sub             v16.4s, v6.4s, v10.4s  /* tmp13 - tmp0 */
+
+    rshrn           v2.4h, v18.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
+    rshrn           v3.4h, v22.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
+    rshrn           v4.4h, v26.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
+    rshrn           v5.4h, v14.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
+    rshrn2          v2.8h, v16.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
+    rshrn2          v3.8h, v28.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
+    rshrn2          v4.8h, v24.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
+    rshrn2          v5.8h, v20.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
+    mov             v6.16b, v15.16b
+    mov             v7.16b, v15.16b
+    mov             v8.16b, v15.16b
+    mov             v9.16b, v15.16b
+    b               1b
+
+.balign 16
+3:
+    cbnz            TMP4, 4f
+    /* Left AC coef is zero */
+    dup             v14.2d, v10.d[0]
+    /* Even part: reverse the even part of the forward DCT. */
+    add             v18.8h, v4.8h, v8.8h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
+    add             v22.8h, v2.8h, v6.8h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
+    smull2          v19.4s, v18.8h, XFIX_P_0_541   /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
+    sub             v26.8h, v2.8h, v6.8h           /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
+    sshll2          v23.4s, v22.8h, #(CONST_BITS)  /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
+    mov             v21.16b, v19.16b               /* tmp3 = z1 */
+    smlal2          v19.4s, v8.8h, XFIX_N_1_847    /* tmp2h tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
+    sshll2          v27.4s, v26.8h, #(CONST_BITS)  /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
+    smlal2          v21.4s, v4.8h, XFIX_P_0_765    /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
+    add             v28.4s, v23.4s, v21.4s         /* tmp10h tmp10 = tmp0 + tmp3; */
+    sub             v31.4s, v23.4s, v21.4s         /* tmp13h tmp13 = tmp0 - tmp3; */
+    add             v29.4s, v27.4s, v19.4s         /* tmp11h tmp11 = tmp1 + tmp2; */
+    sub             v30.4s, v27.4s, v19.4s         /* tmp12h tmp12 = tmp1 - tmp2; */
+
+    /* Odd part per figure 8; the matrix is unitary and hence its
+     * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
+     */
+
+    add             v22.8h, v9.8h, v5.8h    /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
+    add             v24.8h, v7.8h, v3.8h    /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
+    add             v18.8h, v9.8h, v3.8h    /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
+    add             v20.8h, v7.8h, v5.8h    /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
+    add             v26.8h, v22.8h, v24.8h  /* z5 = z3 + z4 */
+
+    smull2          v11.4s, v9.8h, XFIX_P_0_298   /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
+    smull2          v13.4s, v7.8h, XFIX_P_2_053   /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
+    smull2          v15.4s, v5.8h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
+    smull2          v17.4s, v3.8h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
+    smull2          v27.4s, v26.8h, XFIX_P_1_175  /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
+    smull2          v23.4s, v22.8h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
+    smull2          v25.4s, v24.8h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
+    smull2          v19.4s, v18.8h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
+    smull2          v21.4s, v20.8h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, -FIX_2_562915447) */
+
+    add             v23.4s, v23.4s, v27.4s  /* z3 += z5 */
+    add             v22.4s, v22.4s, v26.4s  /* z3 += z5 */
+    add             v25.4s, v25.4s, v27.4s  /* z4 += z5 */
+    add             v24.4s, v24.4s, v26.4s  /* z4 += z5 */
+
+    add             v11.4s, v11.4s, v19.4s  /* tmp0 += z1 */
+    add             v13.4s, v13.4s, v21.4s  /* tmp1 += z2 */
+    add             v15.4s, v15.4s, v21.4s  /* tmp2 += z2 */
+    add             v17.4s, v17.4s, v19.4s  /* tmp3 += z1 */
+
+    add             v11.4s, v11.4s, v23.4s  /* tmp0 += z3 */
+    add             v13.4s, v13.4s, v25.4s  /* tmp1 += z4 */
+    add             v17.4s, v17.4s, v25.4s  /* tmp3 += z4 */
+    add             v15.4s, v15.4s, v23.4s  /* tmp2 += z3 */
+
+    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
+
+    add             v19.4s, v28.4s, v17.4s  /* tmp10 + tmp3 */
+    sub             v21.4s, v28.4s, v17.4s  /* tmp10 - tmp3 */
+    add             v23.4s, v29.4s, v15.4s  /* tmp11 + tmp2 */
+    sub             v25.4s, v29.4s, v15.4s  /* tmp11 - tmp2 */
+    add             v27.4s, v30.4s, v13.4s  /* tmp12 + tmp1 */
+    sub             v29.4s, v30.4s, v13.4s  /* tmp12 - tmp1 */
+    add             v15.4s, v31.4s, v11.4s  /* tmp13 + tmp0 */
+    sub             v17.4s, v31.4s, v11.4s  /* tmp13 - tmp0 */
+
+    mov             v2.16b, v14.16b
+    mov             v3.16b, v14.16b
+    mov             v4.16b, v14.16b
+    mov             v5.16b, v14.16b
+    rshrn           v6.4h, v19.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
+    rshrn           v7.4h, v23.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
+    rshrn           v8.4h, v27.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
+    rshrn           v9.4h, v15.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
+    rshrn2          v6.8h, v17.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
+    rshrn2          v7.8h, v29.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
+    rshrn2          v8.8h, v25.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
+    rshrn2          v9.8h, v21.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
+    b               1b
+
+.balign 16
+4:
+    /* "No" AC coef is zero */
+    /* Even part: reverse the even part of the forward DCT. */
+    add             v18.8h, v4.8h, v8.8h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
+    add             v22.8h, v2.8h, v6.8h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
+    smull2          v19.4s, v18.8h, XFIX_P_0_541   /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
+    sub             v26.8h, v2.8h, v6.8h           /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
+    smull           v18.4s, v18.4h, XFIX_P_0_541   /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
+    sshll2          v23.4s, v22.8h, #(CONST_BITS)  /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
+    mov             v21.16b, v19.16b               /* tmp3 = z1 */
+    mov             v20.16b, v18.16b               /* tmp3 = z1 */
+    smlal2          v19.4s, v8.8h, XFIX_N_1_847    /* tmp2h tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
+    smlal           v18.4s, v8.4h, XFIX_N_1_847    /* tmp2l tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
+    sshll2          v27.4s, v26.8h, #(CONST_BITS)  /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
+    smlal2          v21.4s, v4.8h, XFIX_P_0_765    /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
+    smlal           v20.4s, v4.4h, XFIX_P_0_765    /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
+    sshll           v22.4s, v22.4h, #(CONST_BITS)  /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
+    sshll           v26.4s, v26.4h, #(CONST_BITS)  /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
+    add             v2.4s, v22.4s, v20.4s          /* tmp10l tmp10 = tmp0 + tmp3; */
+    sub             v6.4s, v22.4s, v20.4s          /* tmp13l tmp13 = tmp0 - tmp3; */
+    add             v8.4s, v26.4s, v18.4s          /* tmp11l tmp11 = tmp1 + tmp2; */
+    sub             v4.4s, v26.4s, v18.4s          /* tmp12l tmp12 = tmp1 - tmp2; */
+    add             v28.4s, v23.4s, v21.4s         /* tmp10h tmp10 = tmp0 + tmp3; */
+    sub             v31.4s, v23.4s, v21.4s         /* tmp13h tmp13 = tmp0 - tmp3; */
+    add             v29.4s, v27.4s, v19.4s         /* tmp11h tmp11 = tmp1 + tmp2; */
+    sub             v30.4s, v27.4s, v19.4s         /* tmp12h tmp12 = tmp1 - tmp2; */
+
+    /* Odd part per figure 8; the matrix is unitary and hence its
+     * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
+     */
+
+    add             v22.8h, v9.8h, v5.8h    /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
+    add             v24.8h, v7.8h, v3.8h    /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
+    add             v18.8h, v9.8h, v3.8h    /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
+    add             v20.8h, v7.8h, v5.8h    /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
+    add             v26.8h, v22.8h, v24.8h  /* z5 = z3 + z4 */
+
+    smull2          v11.4s, v9.8h, XFIX_P_0_298   /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
+    smull2          v13.4s, v7.8h, XFIX_P_2_053   /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
+    smull2          v15.4s, v5.8h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
+    smull2          v17.4s, v3.8h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
+    smull2          v27.4s, v26.8h, XFIX_P_1_175  /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
+    smull2          v23.4s, v22.8h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
+    smull2          v25.4s, v24.8h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
+    smull2          v19.4s, v18.8h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
+    smull2          v21.4s, v20.8h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, -FIX_2_562915447) */
+
+    smull           v10.4s, v9.4h, XFIX_P_0_298   /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
+    smull           v12.4s, v7.4h, XFIX_P_2_053   /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
+    smull           v14.4s, v5.4h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
+    smull           v16.4s, v3.4h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
+    smull           v26.4s, v26.4h, XFIX_P_1_175  /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
+    smull           v22.4s, v22.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
+    smull           v24.4s, v24.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
+    smull           v18.4s, v18.4h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
+    smull           v20.4s, v20.4h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, -FIX_2_562915447) */
+
+    add             v23.4s, v23.4s, v27.4s  /* z3 += z5 */
+    add             v22.4s, v22.4s, v26.4s  /* z3 += z5 */
+    add             v25.4s, v25.4s, v27.4s  /* z4 += z5 */
+    add             v24.4s, v24.4s, v26.4s  /* z4 += z5 */
+
+    add             v11.4s, v11.4s, v19.4s  /* tmp0 += z1 */
+    add             v10.4s, v10.4s, v18.4s  /* tmp0 += z1 */
+    add             v13.4s, v13.4s, v21.4s  /* tmp1 += z2 */
+    add             v12.4s, v12.4s, v20.4s  /* tmp1 += z2 */
+    add             v15.4s, v15.4s, v21.4s  /* tmp2 += z2 */
+    add             v14.4s, v14.4s, v20.4s  /* tmp2 += z2 */
+    add             v17.4s, v17.4s, v19.4s  /* tmp3 += z1 */
+    add             v16.4s, v16.4s, v18.4s  /* tmp3 += z1 */
+
+    add             v11.4s, v11.4s, v23.4s  /* tmp0 += z3 */
+    add             v10.4s, v10.4s, v22.4s  /* tmp0 += z3 */
+    add             v13.4s, v13.4s, v25.4s  /* tmp1 += z4 */
+    add             v12.4s, v12.4s, v24.4s  /* tmp1 += z4 */
+    add             v17.4s, v17.4s, v25.4s  /* tmp3 += z4 */
+    add             v16.4s, v16.4s, v24.4s  /* tmp3 += z4 */
+    add             v15.4s, v15.4s, v23.4s  /* tmp2 += z3 */
+    add             v14.4s, v14.4s, v22.4s  /* tmp2 += z3 */
+
+    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
+
+    add             v18.4s, v2.4s, v16.4s   /* tmp10 + tmp3 */
+    add             v19.4s, v28.4s, v17.4s  /* tmp10 + tmp3 */
+    sub             v20.4s, v2.4s, v16.4s   /* tmp10 - tmp3 */
+    sub             v21.4s, v28.4s, v17.4s  /* tmp10 - tmp3 */
+    add             v22.4s, v8.4s, v14.4s   /* tmp11 + tmp2 */
+    add             v23.4s, v29.4s, v15.4s  /* tmp11 + tmp2 */
+    sub             v24.4s, v8.4s, v14.4s   /* tmp11 - tmp2 */
+    sub             v25.4s, v29.4s, v15.4s  /* tmp11 - tmp2 */
+    add             v26.4s, v4.4s, v12.4s   /* tmp12 + tmp1 */
+    add             v27.4s, v30.4s, v13.4s  /* tmp12 + tmp1 */
+    sub             v28.4s, v4.4s, v12.4s   /* tmp12 - tmp1 */
+    sub             v29.4s, v30.4s, v13.4s  /* tmp12 - tmp1 */
+    add             v14.4s, v6.4s, v10.4s   /* tmp13 + tmp0 */
+    add             v15.4s, v31.4s, v11.4s  /* tmp13 + tmp0 */
+    sub             v16.4s, v6.4s, v10.4s   /* tmp13 - tmp0 */
+    sub             v17.4s, v31.4s, v11.4s  /* tmp13 - tmp0 */
+
+    rshrn           v2.4h, v18.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
+    rshrn           v3.4h, v22.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
+    rshrn           v4.4h, v26.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
+    rshrn           v5.4h, v14.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
+    rshrn           v6.4h, v19.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
+    rshrn           v7.4h, v23.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
+    rshrn           v8.4h, v27.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
+    rshrn           v9.4h, v15.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
+    rshrn2          v2.8h, v16.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
+    rshrn2          v3.8h, v28.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
+    rshrn2          v4.8h, v24.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
+    rshrn2          v5.8h, v20.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
+    rshrn2          v6.8h, v17.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
+    rshrn2          v7.8h, v29.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
+    rshrn2          v8.8h, v25.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
+    rshrn2          v9.8h, v21.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
+    b               1b
+
+    .unreq          DCT_TABLE
+    .unreq          COEF_BLOCK
+    .unreq          OUTPUT_BUF
+    .unreq          OUTPUT_COL
+    .unreq          TMP1
+    .unreq          TMP2
+    .unreq          TMP3
+    .unreq          TMP4
+    .unreq          TMP5
+    .unreq          TMP6
+    .unreq          TMP7
+    .unreq          TMP8
+
+#undef CENTERJSAMPLE
+#undef CONST_BITS
+#undef PASS1_BITS
+#undef XFIX_P_0_298
+#undef XFIX_N_0_390
+#undef XFIX_P_0_541
+#undef XFIX_P_0_765
+#undef XFIX_N_0_899
+#undef XFIX_P_1_175
+#undef XFIX_P_1_501
+#undef XFIX_N_1_847
+#undef XFIX_N_1_961
+#undef XFIX_P_2_053
+#undef XFIX_N_2_562
+#undef XFIX_P_3_072
+
+
+/*****************************************************************************/
+
+/*
+ * jsimd_ycc_extrgb_convert_neon
+ * jsimd_ycc_extbgr_convert_neon
+ * jsimd_ycc_extrgbx_convert_neon
+ * jsimd_ycc_extbgrx_convert_neon
+ * jsimd_ycc_extxbgr_convert_neon
+ * jsimd_ycc_extxrgb_convert_neon
+ *
+ * Colorspace conversion YCbCr -> RGB
+ */
+
+.macro do_load size
+  .if \size == 8
+    ld1             {v4.8b}, [U], 8
+    ld1             {v5.8b}, [V], 8
+    ld1             {v0.8b}, [Y], 8
+    prfm            pldl1keep, [U, #64]
+    prfm            pldl1keep, [V, #64]
+    prfm            pldl1keep, [Y, #64]
+  .elseif \size == 4
+    ld1             {v4.b}[0], [U], 1
+    ld1             {v4.b}[1], [U], 1
+    ld1             {v4.b}[2], [U], 1
+    ld1             {v4.b}[3], [U], 1
+    ld1             {v5.b}[0], [V], 1
+    ld1             {v5.b}[1], [V], 1
+    ld1             {v5.b}[2], [V], 1
+    ld1             {v5.b}[3], [V], 1
+    ld1             {v0.b}[0], [Y], 1
+    ld1             {v0.b}[1], [Y], 1
+    ld1             {v0.b}[2], [Y], 1
+    ld1             {v0.b}[3], [Y], 1
+  .elseif \size == 2
+    ld1             {v4.b}[4], [U], 1
+    ld1             {v4.b}[5], [U], 1
+    ld1             {v5.b}[4], [V], 1
+    ld1             {v5.b}[5], [V], 1
+    ld1             {v0.b}[4], [Y], 1
+    ld1             {v0.b}[5], [Y], 1
+  .elseif \size == 1
+    ld1             {v4.b}[6], [U], 1
+    ld1             {v5.b}[6], [V], 1
+    ld1             {v0.b}[6], [Y], 1
+  .else
+    .error unsupported macroblock size
+  .endif
+.endm
+
+.macro do_store bpp, size, fast_st3
+  .if \bpp == 24
+    .if \size == 8
+      .if \fast_st3 == 1
+        st3         {v10.8b, v11.8b, v12.8b}, [RGB], 24
+      .else
+        st1         {v10.b}[0], [RGB], #1
+        st1         {v11.b}[0], [RGB], #1
+        st1         {v12.b}[0], [RGB], #1
+
+        st1         {v10.b}[1], [RGB], #1
+        st1         {v11.b}[1], [RGB], #1
+        st1         {v12.b}[1], [RGB], #1
+
+        st1         {v10.b}[2], [RGB], #1
+        st1         {v11.b}[2], [RGB], #1
+        st1         {v12.b}[2], [RGB], #1
+
+        st1         {v10.b}[3], [RGB], #1
+        st1         {v11.b}[3], [RGB], #1
+        st1         {v12.b}[3], [RGB], #1
+
+        st1         {v10.b}[4], [RGB], #1
+        st1         {v11.b}[4], [RGB], #1
+        st1         {v12.b}[4], [RGB], #1
+
+        st1         {v10.b}[5], [RGB], #1
+        st1         {v11.b}[5], [RGB], #1
+        st1         {v12.b}[5], [RGB], #1
+
+        st1         {v10.b}[6], [RGB], #1
+        st1         {v11.b}[6], [RGB], #1
+        st1         {v12.b}[6], [RGB], #1
+
+        st1         {v10.b}[7], [RGB], #1
+        st1         {v11.b}[7], [RGB], #1
+        st1         {v12.b}[7], [RGB], #1
+      .endif
+    .elseif \size == 4
+      st3           {v10.b, v11.b, v12.b}[0], [RGB], 3
+      st3           {v10.b, v11.b, v12.b}[1], [RGB], 3
+      st3           {v10.b, v11.b, v12.b}[2], [RGB], 3
+      st3           {v10.b, v11.b, v12.b}[3], [RGB], 3
+    .elseif \size == 2
+      st3           {v10.b, v11.b, v12.b}[4], [RGB], 3
+      st3           {v10.b, v11.b, v12.b}[5], [RGB], 3
+    .elseif \size == 1
+      st3           {v10.b, v11.b, v12.b}[6], [RGB], 3
+    .else
+     .error unsupported macroblock size
+    .endif
+  .elseif \bpp == 32
+    .if \size == 8
+      st4           {v10.8b, v11.8b, v12.8b, v13.8b}, [RGB], 32
+    .elseif \size == 4
+      st4           {v10.b, v11.b, v12.b, v13.b}[0], [RGB], 4
+      st4           {v10.b, v11.b, v12.b, v13.b}[1], [RGB], 4
+      st4           {v10.b, v11.b, v12.b, v13.b}[2], [RGB], 4
+      st4           {v10.b, v11.b, v12.b, v13.b}[3], [RGB], 4
+    .elseif \size == 2
+      st4           {v10.b, v11.b, v12.b, v13.b}[4], [RGB], 4
+      st4           {v10.b, v11.b, v12.b, v13.b}[5], [RGB], 4
+    .elseif \size == 1
+      st4           {v10.b, v11.b, v12.b, v13.b}[6], [RGB], 4
+    .else
+      .error unsupported macroblock size
+    .endif
+  .elseif \bpp == 16
+    .if \size == 8
+      st1           {v25.8h}, [RGB], 16
+    .elseif \size == 4
+      st1           {v25.4h}, [RGB], 8
+    .elseif \size == 2
+      st1           {v25.h}[4], [RGB], 2
+      st1           {v25.h}[5], [RGB], 2
+    .elseif \size == 1
+      st1           {v25.h}[6], [RGB], 2
+    .else
+      .error unsupported macroblock size
+    .endif
+  .else
+    .error unsupported bpp
+  .endif
+.endm
+
+.macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, rsize, \
+                                           g_offs, gsize, b_offs, bsize, \
+                                           defsize, fast_st3
+
+/*
+ * 2-stage pipelined YCbCr->RGB conversion
+ */
+
+.macro do_yuv_to_rgb_stage1
+    uaddw           v6.8h, v2.8h, v4.8b     /* q3 = u - 128 */
+    uaddw           v8.8h, v2.8h, v5.8b     /* q2 = v - 128 */
+    smull           v20.4s, v6.4h, v1.h[1]  /* multiply by -11277 */
+    smlal           v20.4s, v8.4h, v1.h[2]  /* multiply by -23401 */
+    smull2          v22.4s, v6.8h, v1.h[1]  /* multiply by -11277 */
+    smlal2          v22.4s, v8.8h, v1.h[2]  /* multiply by -23401 */
+    smull           v24.4s, v8.4h, v1.h[0]  /* multiply by 22971 */
+    smull2          v26.4s, v8.8h, v1.h[0]  /* multiply by 22971 */
+    smull           v28.4s, v6.4h, v1.h[3]  /* multiply by 29033 */
+    smull2          v30.4s, v6.8h, v1.h[3]  /* multiply by 29033 */
+.endm
+
+.macro do_yuv_to_rgb_stage2
+    rshrn           v20.4h, v20.4s, #15
+    rshrn2          v20.8h, v22.4s, #15
+    rshrn           v24.4h, v24.4s, #14
+    rshrn2          v24.8h, v26.4s, #14
+    rshrn           v28.4h, v28.4s, #14
+    rshrn2          v28.8h, v30.4s, #14
+    uaddw           v20.8h, v20.8h, v0.8b
+    uaddw           v24.8h, v24.8h, v0.8b
+    uaddw           v28.8h, v28.8h, v0.8b
+  .if \bpp != 16
+    sqxtun          v1\g_offs\defsize, v20.8h
+    sqxtun          v1\r_offs\defsize, v24.8h
+    sqxtun          v1\b_offs\defsize, v28.8h
+  .else
+    sqshlu          v21.8h, v20.8h, #8
+    sqshlu          v25.8h, v24.8h, #8
+    sqshlu          v29.8h, v28.8h, #8
+    sri             v25.8h, v21.8h, #5
+    sri             v25.8h, v29.8h, #11
+  .endif
+.endm
+
+.macro do_yuv_to_rgb_stage2_store_load_stage1 fast_st3
+    rshrn           v20.4h, v20.4s, #15
+    rshrn           v24.4h, v24.4s, #14
+    rshrn           v28.4h, v28.4s, #14
+    ld1             {v4.8b}, [U], 8
+    rshrn2          v20.8h, v22.4s, #15
+    rshrn2          v24.8h, v26.4s, #14
+    rshrn2          v28.8h, v30.4s, #14
+    ld1             {v5.8b}, [V], 8
+    uaddw           v20.8h, v20.8h, v0.8b
+    uaddw           v24.8h, v24.8h, v0.8b
+    uaddw           v28.8h, v28.8h, v0.8b
+  .if \bpp != 16  /**************** rgb24/rgb32 ******************************/
+    sqxtun          v1\g_offs\defsize, v20.8h
+    ld1             {v0.8b}, [Y], 8
+    sqxtun          v1\r_offs\defsize, v24.8h
+    prfm            pldl1keep, [U, #64]
+    prfm            pldl1keep, [V, #64]
+    prfm            pldl1keep, [Y, #64]
+    sqxtun          v1\b_offs\defsize, v28.8h
+    uaddw           v6.8h, v2.8h, v4.8b     /* v6.16b = u - 128 */
+    uaddw           v8.8h, v2.8h, v5.8b     /* q2 = v - 128 */
+    smull           v20.4s, v6.4h, v1.h[1]  /* multiply by -11277 */
+    smlal           v20.4s, v8.4h, v1.h[2]  /* multiply by -23401 */
+    smull2          v22.4s, v6.8h, v1.h[1]  /* multiply by -11277 */
+    smlal2          v22.4s, v8.8h, v1.h[2]  /* multiply by -23401 */
+    smull           v24.4s, v8.4h, v1.h[0]  /* multiply by 22971 */
+    smull2          v26.4s, v8.8h, v1.h[0]  /* multiply by 22971 */
+  .else  /**************************** rgb565 ********************************/
+    sqshlu          v21.8h, v20.8h, #8
+    sqshlu          v25.8h, v24.8h, #8
+    sqshlu          v29.8h, v28.8h, #8
+    uaddw           v6.8h, v2.8h, v4.8b     /* v6.16b = u - 128 */
+    uaddw           v8.8h, v2.8h, v5.8b     /* q2 = v - 128 */
+    ld1             {v0.8b}, [Y], 8
+    smull           v20.4s, v6.4h, v1.h[1]  /* multiply by -11277 */
+    smlal           v20.4s, v8.4h, v1.h[2]  /* multiply by -23401 */
+    smull2          v22.4s, v6.8h, v1.h[1]  /* multiply by -11277 */
+    smlal2          v22.4s, v8.8h, v1.h[2]  /* multiply by -23401 */
+    sri             v25.8h, v21.8h, #5
+    smull           v24.4s, v8.4h, v1.h[0]  /* multiply by 22971 */
+    smull2          v26.4s, v8.8h, v1.h[0]  /* multiply by 22971 */
+    prfm            pldl1keep, [U, #64]
+    prfm            pldl1keep, [V, #64]
+    prfm            pldl1keep, [Y, #64]
+    sri             v25.8h, v29.8h, #11
+  .endif
+    do_store        \bpp, 8, \fast_st3
+    smull           v28.4s, v6.4h, v1.h[3]  /* multiply by 29033 */
+    smull2          v30.4s, v6.8h, v1.h[3]  /* multiply by 29033 */
+.endm
+
+.macro do_yuv_to_rgb
+    do_yuv_to_rgb_stage1
+    do_yuv_to_rgb_stage2
+.endm
+
+.if \fast_st3 == 1
+asm_function jsimd_ycc_\colorid\()_convert_neon
+.else
+asm_function jsimd_ycc_\colorid\()_convert_neon_slowst3
+.endif
+    OUTPUT_WIDTH    .req w0
+    INPUT_BUF       .req x1
+    INPUT_ROW       .req w2
+    OUTPUT_BUF      .req x3
+    NUM_ROWS        .req w4
+
+    INPUT_BUF0      .req x5
+    INPUT_BUF1      .req x6
+    INPUT_BUF2      .req x1
+
+    RGB             .req x7
+    Y               .req x9
+    U               .req x10
+    V               .req x11
+    N               .req w15
+
+    sub             sp, sp, 64
+    mov             x9, sp
+
+    /* Load constants to d1, d2, d3 (v0.4h is just used for padding) */
+    get_symbol_loc  x15, Ljsimd_ycc_rgb_neon_consts
+
+    /* Save Neon registers */
+    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
+    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
+    ld1             {v0.4h, v1.4h}, [x15], 16
+    ld1             {v2.8h}, [x15]
+
+    ldr             INPUT_BUF0, [INPUT_BUF]
+    ldr             INPUT_BUF1, [INPUT_BUF, #8]
+    ldr             INPUT_BUF2, [INPUT_BUF, #16]
+    .unreq          INPUT_BUF
+
+    /* Initially set v10, v11.4h, v12.8b, d13 to 0xFF */
+    movi            v10.16b, #255
+    movi            v13.16b, #255
+
+    /* Outer loop over scanlines */
+    cmp             NUM_ROWS, #1
+    b.lt            9f
+0:
+    ldr             Y, [INPUT_BUF0, INPUT_ROW, uxtw #3]
+    ldr             U, [INPUT_BUF1, INPUT_ROW, uxtw #3]
+    mov             N, OUTPUT_WIDTH
+    ldr             V, [INPUT_BUF2, INPUT_ROW, uxtw #3]
+    add             INPUT_ROW, INPUT_ROW, #1
+    ldr             RGB, [OUTPUT_BUF], #8
+
+    /* Inner loop over pixels */
+    subs            N, N, #8
+    b.lt            3f
+    do_load         8
+    do_yuv_to_rgb_stage1
+    subs            N, N, #8
+    b.lt            2f
+1:
+    do_yuv_to_rgb_stage2_store_load_stage1 \fast_st3
+    subs            N, N, #8
+    b.ge            1b
+2:
+    do_yuv_to_rgb_stage2
+    do_store        \bpp, 8, \fast_st3
+    tst             N, #7
+    b.eq            8f
+3:
+    tst             N, #4
+    b.eq            3f
+    do_load         4
+3:
+    tst             N, #2
+    b.eq            4f
+    do_load         2
+4:
+    tst             N, #1
+    b.eq            5f
+    do_load         1
+5:
+    do_yuv_to_rgb
+    tst             N, #4
+    b.eq            6f
+    do_store        \bpp, 4, \fast_st3
+6:
+    tst             N, #2
+    b.eq            7f
+    do_store        \bpp, 2, \fast_st3
+7:
+    tst             N, #1
+    b.eq            8f
+    do_store        \bpp, 1, \fast_st3
+8:
+    subs            NUM_ROWS, NUM_ROWS, #1
+    b.gt            0b
+9:
+    /* Restore all registers and return */
+    ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
+    ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
+    br              x30
+    .unreq          OUTPUT_WIDTH
+    .unreq          INPUT_ROW
+    .unreq          OUTPUT_BUF
+    .unreq          NUM_ROWS
+    .unreq          INPUT_BUF0
+    .unreq          INPUT_BUF1
+    .unreq          INPUT_BUF2
+    .unreq          RGB
+    .unreq          Y
+    .unreq          U
+    .unreq          V
+    .unreq          N
+
+.purgem do_yuv_to_rgb
+.purgem do_yuv_to_rgb_stage1
+.purgem do_yuv_to_rgb_stage2
+.purgem do_yuv_to_rgb_stage2_store_load_stage1
+
+.endm
+
+/*--------------------------------- id ----- bpp R  rsize G  gsize B  bsize defsize fast_st3*/
+generate_jsimd_ycc_rgb_convert_neon extrgb,  24, 0, .4h,  1, .4h,  2, .4h,  .8b,    1
+generate_jsimd_ycc_rgb_convert_neon extbgr,  24, 2, .4h,  1, .4h,  0, .4h,  .8b,    1
+generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, .4h,  1, .4h,  2, .4h,  .8b,    1
+generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, .4h,  1, .4h,  0, .4h,  .8b,    1
+generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, .4h,  2, .4h,  1, .4h,  .8b,    1
+generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, .4h,  2, .4h,  3, .4h,  .8b,    1
+generate_jsimd_ycc_rgb_convert_neon rgb565,  16, 0, .4h,  0, .4h,  0, .4h,  .8b,    1
+
+generate_jsimd_ycc_rgb_convert_neon extrgb,  24, 0, .4h,  1, .4h,  2, .4h,  .8b,    0
+generate_jsimd_ycc_rgb_convert_neon extbgr,  24, 2, .4h,  1, .4h,  0, .4h,  .8b,    0
+
+.purgem do_load
+.purgem do_store
+
+
+/*****************************************************************************/
+
+/*
+ * jsimd_extrgb_ycc_convert_neon
+ * jsimd_extbgr_ycc_convert_neon
+ * jsimd_extrgbx_ycc_convert_neon
+ * jsimd_extbgrx_ycc_convert_neon
+ * jsimd_extxbgr_ycc_convert_neon
+ * jsimd_extxrgb_ycc_convert_neon
+ *
+ * Colorspace conversion RGB -> YCbCr
+ */
+
+.macro do_store size
+  .if \size == 8
+    st1             {v20.8b}, [Y], #8
+    st1             {v21.8b}, [U], #8
+    st1             {v22.8b}, [V], #8
+  .elseif \size == 4
+    st1             {v20.b}[0], [Y], #1
+    st1             {v20.b}[1], [Y], #1
+    st1             {v20.b}[2], [Y], #1
+    st1             {v20.b}[3], [Y], #1
+    st1             {v21.b}[0], [U], #1
+    st1             {v21.b}[1], [U], #1
+    st1             {v21.b}[2], [U], #1
+    st1             {v21.b}[3], [U], #1
+    st1             {v22.b}[0], [V], #1
+    st1             {v22.b}[1], [V], #1
+    st1             {v22.b}[2], [V], #1
+    st1             {v22.b}[3], [V], #1
+  .elseif \size == 2
+    st1             {v20.b}[4], [Y], #1
+    st1             {v20.b}[5], [Y], #1
+    st1             {v21.b}[4], [U], #1
+    st1             {v21.b}[5], [U], #1
+    st1             {v22.b}[4], [V], #1
+    st1             {v22.b}[5], [V], #1
+  .elseif \size == 1
+    st1             {v20.b}[6], [Y], #1
+    st1             {v21.b}[6], [U], #1
+    st1             {v22.b}[6], [V], #1
+  .else
+    .error unsupported macroblock size
+  .endif
+.endm
+
+.macro do_load bpp, size, fast_ld3
+  .if \bpp == 24
+    .if \size == 8
+      .if \fast_ld3 == 1
+        ld3         {v10.8b, v11.8b, v12.8b}, [RGB], #24
+      .else
+        ld1         {v10.b}[0], [RGB], #1
+        ld1         {v11.b}[0], [RGB], #1
+        ld1         {v12.b}[0], [RGB], #1
+
+        ld1         {v10.b}[1], [RGB], #1
+        ld1         {v11.b}[1], [RGB], #1
+        ld1         {v12.b}[1], [RGB], #1
+
+        ld1         {v10.b}[2], [RGB], #1
+        ld1         {v11.b}[2], [RGB], #1
+        ld1         {v12.b}[2], [RGB], #1
+
+        ld1         {v10.b}[3], [RGB], #1
+        ld1         {v11.b}[3], [RGB], #1
+        ld1         {v12.b}[3], [RGB], #1
+
+        ld1         {v10.b}[4], [RGB], #1
+        ld1         {v11.b}[4], [RGB], #1
+        ld1         {v12.b}[4], [RGB], #1
+
+        ld1         {v10.b}[5], [RGB], #1
+        ld1         {v11.b}[5], [RGB], #1
+        ld1         {v12.b}[5], [RGB], #1
+
+        ld1         {v10.b}[6], [RGB], #1
+        ld1         {v11.b}[6], [RGB], #1
+        ld1         {v12.b}[6], [RGB], #1
+
+        ld1         {v10.b}[7], [RGB], #1
+        ld1         {v11.b}[7], [RGB], #1
+        ld1         {v12.b}[7], [RGB], #1
+      .endif
+      prfm          pldl1keep, [RGB, #128]
+    .elseif \size == 4
+      ld3           {v10.b, v11.b, v12.b}[0], [RGB], #3
+      ld3           {v10.b, v11.b, v12.b}[1], [RGB], #3
+      ld3           {v10.b, v11.b, v12.b}[2], [RGB], #3
+      ld3           {v10.b, v11.b, v12.b}[3], [RGB], #3
+    .elseif \size == 2
+      ld3           {v10.b, v11.b, v12.b}[4], [RGB], #3
+      ld3           {v10.b, v11.b, v12.b}[5], [RGB], #3
+    .elseif \size == 1
+      ld3           {v10.b, v11.b, v12.b}[6], [RGB], #3
+    .else
+      .error unsupported macroblock size
+    .endif
+  .elseif \bpp == 32
+    .if \size == 8
+      ld4           {v10.8b, v11.8b, v12.8b, v13.8b}, [RGB], #32
+      prfm          pldl1keep, [RGB, #128]
+    .elseif \size == 4
+      ld4           {v10.b, v11.b, v12.b, v13.b}[0], [RGB], #4
+      ld4           {v10.b, v11.b, v12.b, v13.b}[1], [RGB], #4
+      ld4           {v10.b, v11.b, v12.b, v13.b}[2], [RGB], #4
+      ld4           {v10.b, v11.b, v12.b, v13.b}[3], [RGB], #4
+    .elseif \size == 2
+      ld4           {v10.b, v11.b, v12.b, v13.b}[4], [RGB], #4
+      ld4           {v10.b, v11.b, v12.b, v13.b}[5], [RGB], #4
+    .elseif \size == 1
+      ld4           {v10.b, v11.b, v12.b, v13.b}[6], [RGB], #4
+    .else
+      .error unsupported macroblock size
+    .endif
+  .else
+    .error unsupported bpp
+  .endif
+.endm
+
+.macro generate_jsimd_rgb_ycc_convert_neon colorid, bpp, r_offs, g_offs, \
+                                           b_offs, fast_ld3
+
+/*
+ * 2-stage pipelined RGB->YCbCr conversion
+ */
+
+.macro do_rgb_to_yuv_stage1
+    ushll           v4.8h, v1\r_offs\().8b, #0  /* r = v4 */
+    ushll           v6.8h, v1\g_offs\().8b, #0  /* g = v6 */
+    ushll           v8.8h, v1\b_offs\().8b, #0  /* b = v8 */
+    rev64           v18.4s, v1.4s
+    rev64           v26.4s, v1.4s
+    rev64           v28.4s, v1.4s
+    rev64           v30.4s, v1.4s
+    umull           v14.4s, v4.4h, v0.h[0]
+    umull2          v16.4s, v4.8h, v0.h[0]
+    umlsl           v18.4s, v4.4h, v0.h[3]
+    umlsl2          v26.4s, v4.8h, v0.h[3]
+    umlal           v28.4s, v4.4h, v0.h[5]
+    umlal2          v30.4s, v4.8h, v0.h[5]
+    umlal           v14.4s, v6.4h, v0.h[1]
+    umlal2          v16.4s, v6.8h, v0.h[1]
+    umlsl           v18.4s, v6.4h, v0.h[4]
+    umlsl2          v26.4s, v6.8h, v0.h[4]
+    umlsl           v28.4s, v6.4h, v0.h[6]
+    umlsl2          v30.4s, v6.8h, v0.h[6]
+    umlal           v14.4s, v8.4h, v0.h[2]
+    umlal2          v16.4s, v8.8h, v0.h[2]
+    umlal           v18.4s, v8.4h, v0.h[5]
+    umlal2          v26.4s, v8.8h, v0.h[5]
+    umlsl           v28.4s, v8.4h, v0.h[7]
+    umlsl2          v30.4s, v8.8h, v0.h[7]
+.endm
+
+.macro do_rgb_to_yuv_stage2
+    rshrn           v20.4h, v14.4s, #16
+    shrn            v22.4h, v18.4s, #16
+    shrn            v24.4h, v28.4s, #16
+    rshrn2          v20.8h, v16.4s, #16
+    shrn2           v22.8h, v26.4s, #16
+    shrn2           v24.8h, v30.4s, #16
+    xtn             v20.8b, v20.8h       /* v20 = y */
+    xtn             v21.8b, v22.8h       /* v21 = u */
+    xtn             v22.8b, v24.8h       /* v22 = v */
+.endm
+
+.macro do_rgb_to_yuv
+    do_rgb_to_yuv_stage1
+    do_rgb_to_yuv_stage2
+.endm
+
+/* TODO: expand macros and interleave instructions if some in-order
+ *       AArch64 processor actually can dual-issue LOAD/STORE with ALU */
+.macro do_rgb_to_yuv_stage2_store_load_stage1 fast_ld3
+    do_rgb_to_yuv_stage2
+    do_load         \bpp, 8, \fast_ld3
+    st1             {v20.8b}, [Y], #8
+    st1             {v21.8b}, [U], #8
+    st1             {v22.8b}, [V], #8
+    do_rgb_to_yuv_stage1
+.endm
+
+.if \fast_ld3 == 1
+asm_function jsimd_\colorid\()_ycc_convert_neon
+.else
+asm_function jsimd_\colorid\()_ycc_convert_neon_slowld3
+.endif
+    OUTPUT_WIDTH    .req w0
+    INPUT_BUF       .req x1
+    OUTPUT_BUF      .req x2
+    OUTPUT_ROW      .req w3
+    NUM_ROWS        .req w4
+
+    OUTPUT_BUF0     .req x5
+    OUTPUT_BUF1     .req x6
+    OUTPUT_BUF2     .req x2  /* OUTPUT_BUF */
+
+    RGB             .req x7
+    Y               .req x9
+    U               .req x10
+    V               .req x11
+    N               .req w12
+
+    /* Load constants to d0, d1, d2, d3 */
+    get_symbol_loc  x13, Ljsimd_rgb_ycc_neon_consts
+    ld1             {v0.8h, v1.8h}, [x13]
+
+    ldr             OUTPUT_BUF0, [OUTPUT_BUF]
+    ldr             OUTPUT_BUF1, [OUTPUT_BUF, #8]
+    ldr             OUTPUT_BUF2, [OUTPUT_BUF, #16]
+    .unreq          OUTPUT_BUF
+
+    /* Save Neon registers */
+    sub             sp, sp, #64
+    mov             x9, sp
+    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
+    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
+
+    /* Outer loop over scanlines */
+    cmp             NUM_ROWS, #1
+    b.lt            9f
+0:
+    ldr             Y, [OUTPUT_BUF0, OUTPUT_ROW, uxtw #3]
+    ldr             U, [OUTPUT_BUF1, OUTPUT_ROW, uxtw #3]
+    mov             N, OUTPUT_WIDTH
+    ldr             V, [OUTPUT_BUF2, OUTPUT_ROW, uxtw #3]
+    add             OUTPUT_ROW, OUTPUT_ROW, #1
+    ldr             RGB, [INPUT_BUF], #8
+
+    /* Inner loop over pixels */
+    subs            N, N, #8
+    b.lt            3f
+    do_load         \bpp, 8, \fast_ld3
+    do_rgb_to_yuv_stage1
+    subs            N, N, #8
+    b.lt            2f
+1:
+    do_rgb_to_yuv_stage2_store_load_stage1 \fast_ld3
+    subs            N, N, #8
+    b.ge            1b
+2:
+    do_rgb_to_yuv_stage2
+    do_store        8
+    tst             N, #7
+    b.eq            8f
+3:
+    tbz             N, #2, 3f
+    do_load         \bpp, 4, \fast_ld3
+3:
+    tbz             N, #1, 4f
+    do_load         \bpp, 2, \fast_ld3
+4:
+    tbz             N, #0, 5f
+    do_load         \bpp, 1, \fast_ld3
+5:
+    do_rgb_to_yuv
+    tbz             N, #2, 6f
+    do_store        4
+6:
+    tbz             N, #1, 7f
+    do_store        2
+7:
+    tbz             N, #0, 8f
+    do_store        1
+8:
+    subs            NUM_ROWS, NUM_ROWS, #1
+    b.gt            0b
+9:
+    /* Restore all registers and return */
+    ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
+    ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
+    br              x30
+
+    .unreq          OUTPUT_WIDTH
+    .unreq          OUTPUT_ROW
+    .unreq          INPUT_BUF
+    .unreq          NUM_ROWS
+    .unreq          OUTPUT_BUF0
+    .unreq          OUTPUT_BUF1
+    .unreq          OUTPUT_BUF2
+    .unreq          RGB
+    .unreq          Y
+    .unreq          U
+    .unreq          V
+    .unreq          N
+
+.purgem do_rgb_to_yuv
+.purgem do_rgb_to_yuv_stage1
+.purgem do_rgb_to_yuv_stage2
+.purgem do_rgb_to_yuv_stage2_store_load_stage1
+
+.endm
+
+/*--------------------------------- id ----- bpp R  G  B  Fast LD3 */
+generate_jsimd_rgb_ycc_convert_neon extrgb,  24, 0, 1, 2, 1
+generate_jsimd_rgb_ycc_convert_neon extbgr,  24, 2, 1, 0, 1
+generate_jsimd_rgb_ycc_convert_neon extrgbx, 32, 0, 1, 2, 1
+generate_jsimd_rgb_ycc_convert_neon extbgrx, 32, 2, 1, 0, 1
+generate_jsimd_rgb_ycc_convert_neon extxbgr, 32, 3, 2, 1, 1
+generate_jsimd_rgb_ycc_convert_neon extxrgb, 32, 1, 2, 3, 1
+
+generate_jsimd_rgb_ycc_convert_neon extrgb,  24, 0, 1, 2, 0
+generate_jsimd_rgb_ycc_convert_neon extbgr,  24, 2, 1, 0, 0
+
+.purgem do_load
+.purgem do_store
+
+
+/*****************************************************************************/
+
+/*
+ * jsimd_fdct_islow_neon
+ *
+ * This file contains a slower but more accurate integer implementation of the
+ * forward DCT (Discrete Cosine Transform). The following code is based
+ * directly on the IJG''s original jfdctint.c; see the jfdctint.c for
+ * more details.
+ *
+ * TODO: can be combined with 'jsimd_convsamp_neon' to get
+ *       rid of a bunch of VLD1.16 instructions
+ */
+
+#define CONST_BITS  13
+#define PASS1_BITS  2
+
+#define DESCALE_P1  (CONST_BITS - PASS1_BITS)
+#define DESCALE_P2  (CONST_BITS + PASS1_BITS)
+
+#define XFIX_P_0_298  v0.h[0]
+#define XFIX_N_0_390  v0.h[1]
+#define XFIX_P_0_541  v0.h[2]
+#define XFIX_P_0_765  v0.h[3]
+#define XFIX_N_0_899  v0.h[4]
+#define XFIX_P_1_175  v0.h[5]
+#define XFIX_P_1_501  v0.h[6]
+#define XFIX_N_1_847  v0.h[7]
+#define XFIX_N_1_961  v1.h[0]
+#define XFIX_P_2_053  v1.h[1]
+#define XFIX_N_2_562  v1.h[2]
+#define XFIX_P_3_072  v1.h[3]
+
+asm_function jsimd_fdct_islow_neon
+
+    DATA            .req x0
+    TMP             .req x9
+
+    /* Load constants */
+    get_symbol_loc  TMP, Ljsimd_fdct_islow_neon_consts
+    ld1             {v0.8h, v1.8h}, [TMP]
+
+    /* Save Neon registers */
+    sub             sp, sp, #64
+    mov             x10, sp
+    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x10], 32
+    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x10], 32
+
+    /* Load all DATA into Neon registers with the following allocation:
+     *       0 1 2 3 | 4 5 6 7
+     *      ---------+--------
+     *   0 | d16     | d17    | v16.8h
+     *   1 | d18     | d19    | v17.8h
+     *   2 | d20     | d21    | v18.8h
+     *   3 | d22     | d23    | v19.8h
+     *   4 | d24     | d25    | v20.8h
+     *   5 | d26     | d27    | v21.8h
+     *   6 | d28     | d29    | v22.8h
+     *   7 | d30     | d31    | v23.8h
+     */
+
+    ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
+    ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
+    sub             DATA, DATA, #64
+
+    /* Transpose */
+    transpose_8x8   v16, v17, v18, v19, v20, v21, v22, v23, v31, v2, v3, v4
+    /* 1-D FDCT */
+    add             v24.8h, v16.8h, v23.8h  /* tmp0 = dataptr[0] + dataptr[7]; */
+    sub             v31.8h, v16.8h, v23.8h  /* tmp7 = dataptr[0] - dataptr[7]; */
+    add             v25.8h, v17.8h, v22.8h  /* tmp1 = dataptr[1] + dataptr[6]; */
+    sub             v30.8h, v17.8h, v22.8h  /* tmp6 = dataptr[1] - dataptr[6]; */
+    add             v26.8h, v18.8h, v21.8h  /* tmp2 = dataptr[2] + dataptr[5]; */
+    sub             v29.8h, v18.8h, v21.8h  /* tmp5 = dataptr[2] - dataptr[5]; */
+    add             v27.8h, v19.8h, v20.8h  /* tmp3 = dataptr[3] + dataptr[4]; */
+    sub             v28.8h, v19.8h, v20.8h  /* tmp4 = dataptr[3] - dataptr[4]; */
+
+    /* even part */
+
+    add             v8.8h, v24.8h, v27.8h   /* tmp10 = tmp0 + tmp3; */
+    sub             v9.8h, v24.8h, v27.8h   /* tmp13 = tmp0 - tmp3; */
+    add             v10.8h, v25.8h, v26.8h  /* tmp11 = tmp1 + tmp2; */
+    sub             v11.8h, v25.8h, v26.8h  /* tmp12 = tmp1 - tmp2; */
+
+    add             v16.8h, v8.8h, v10.8h  /* tmp10 + tmp11 */
+    sub             v20.8h, v8.8h, v10.8h  /* tmp10 - tmp11 */
+
+    add             v18.8h, v11.8h, v9.8h  /* tmp12 + tmp13 */
+
+    shl             v16.8h, v16.8h, #PASS1_BITS  /* dataptr[0] = (DCTELEM)LEFT_SHIFT(tmp10 + tmp11, PASS1_BITS); */
+    shl             v20.8h, v20.8h, #PASS1_BITS  /* dataptr[4] = (DCTELEM)LEFT_SHIFT(tmp10 - tmp11, PASS1_BITS); */
+
+    smull2          v24.4s, v18.8h, XFIX_P_0_541  /* z1 hi = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
+    smull           v18.4s, v18.4h, XFIX_P_0_541  /* z1 lo = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
+    mov             v22.16b, v18.16b
+    mov             v25.16b, v24.16b
+
+    smlal           v18.4s, v9.4h, XFIX_P_0_765   /* lo z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
+    smlal2          v24.4s, v9.8h, XFIX_P_0_765   /* hi z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
+    smlal           v22.4s, v11.4h, XFIX_N_1_847  /* lo z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
+    smlal2          v25.4s, v11.8h, XFIX_N_1_847  /* hi z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
+
+    rshrn           v18.4h, v18.4s, #DESCALE_P1
+    rshrn           v22.4h, v22.4s, #DESCALE_P1
+    rshrn2          v18.8h, v24.4s, #DESCALE_P1  /* dataptr[2] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp13, XFIX_P_0_765), CONST_BITS-PASS1_BITS); */
+    rshrn2          v22.8h, v25.4s, #DESCALE_P1  /* dataptr[6] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp12, XFIX_N_1_847), CONST_BITS-PASS1_BITS); */
+
+    /* Odd part */
+
+    add             v8.8h, v28.8h, v31.8h        /* z1 = tmp4 + tmp7; */
+    add             v9.8h, v29.8h, v30.8h        /* z2 = tmp5 + tmp6; */
+    add             v10.8h, v28.8h, v30.8h       /* z3 = tmp4 + tmp6; */
+    add             v11.8h, v29.8h, v31.8h       /* z4 = tmp5 + tmp7; */
+    smull           v4.4s, v10.4h, XFIX_P_1_175  /* z5 lo = z3 lo * XFIX_P_1_175 */
+    smull2          v5.4s, v10.8h, XFIX_P_1_175
+    smlal           v4.4s, v11.4h, XFIX_P_1_175  /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602); */
+    smlal2          v5.4s, v11.8h, XFIX_P_1_175
+
+    smull2          v24.4s, v28.8h, XFIX_P_0_298
+    smull2          v25.4s, v29.8h, XFIX_P_2_053
+    smull2          v26.4s, v30.8h, XFIX_P_3_072
+    smull2          v27.4s, v31.8h, XFIX_P_1_501
+    smull           v28.4s, v28.4h, XFIX_P_0_298  /* tmp4 = MULTIPLY(tmp4, FIX_0_298631336); */
+    smull           v29.4s, v29.4h, XFIX_P_2_053  /* tmp5 = MULTIPLY(tmp5, FIX_2_053119869); */
+    smull           v30.4s, v30.4h, XFIX_P_3_072  /* tmp6 = MULTIPLY(tmp6, FIX_3_072711026); */
+    smull           v31.4s, v31.4h, XFIX_P_1_501  /* tmp7 = MULTIPLY(tmp7, FIX_1_501321110); */
+
+    smull2          v12.4s, v8.8h, XFIX_N_0_899
+    smull2          v13.4s, v9.8h, XFIX_N_2_562
+    smull2          v14.4s, v10.8h, XFIX_N_1_961
+    smull2          v15.4s, v11.8h, XFIX_N_0_390
+    smull           v8.4s, v8.4h, XFIX_N_0_899    /* z1 = MULTIPLY(z1, -FIX_0_899976223); */
+    smull           v9.4s, v9.4h, XFIX_N_2_562    /* z2 = MULTIPLY(z2, -FIX_2_562915447); */
+    smull           v10.4s, v10.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560); */
+    smull           v11.4s, v11.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644); */
+
+    add             v10.4s, v10.4s, v4.4s  /* z3 += z5 */
+    add             v14.4s, v14.4s, v5.4s
+    add             v11.4s, v11.4s, v4.4s  /* z4 += z5 */
+    add             v15.4s, v15.4s, v5.4s
+
+    add             v28.4s, v28.4s, v8.4s   /* tmp4 += z1 */
+    add             v24.4s, v24.4s, v12.4s
+    add             v29.4s, v29.4s, v9.4s   /* tmp5 += z2 */
+    add             v25.4s, v25.4s, v13.4s
+    add             v30.4s, v30.4s, v10.4s  /* tmp6 += z3 */
+    add             v26.4s, v26.4s, v14.4s
+    add             v31.4s, v31.4s, v11.4s  /* tmp7 += z4 */
+    add             v27.4s, v27.4s, v15.4s
+
+    add             v28.4s, v28.4s, v10.4s  /* tmp4 += z3 */
+    add             v24.4s, v24.4s, v14.4s
+    add             v29.4s, v29.4s, v11.4s  /* tmp5 += z4 */
+    add             v25.4s, v25.4s, v15.4s
+    add             v30.4s, v30.4s, v9.4s   /* tmp6 += z2 */
+    add             v26.4s, v26.4s, v13.4s
+    add             v31.4s, v31.4s, v8.4s   /* tmp7 += z1 */
+    add             v27.4s, v27.4s, v12.4s
+
+    rshrn           v23.4h, v28.4s, #DESCALE_P1
+    rshrn           v21.4h, v29.4s, #DESCALE_P1
+    rshrn           v19.4h, v30.4s, #DESCALE_P1
+    rshrn           v17.4h, v31.4s, #DESCALE_P1
+    rshrn2          v23.8h, v24.4s, #DESCALE_P1  /* dataptr[7] = (DCTELEM)DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS); */
+    rshrn2          v21.8h, v25.4s, #DESCALE_P1  /* dataptr[5] = (DCTELEM)DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS); */
+    rshrn2          v19.8h, v26.4s, #DESCALE_P1  /* dataptr[3] = (DCTELEM)DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS); */
+    rshrn2          v17.8h, v27.4s, #DESCALE_P1  /* dataptr[1] = (DCTELEM)DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS); */
+
+    /* Transpose */
+    transpose_8x8   v16, v17, v18, v19, v20, v21, v22, v23, v31, v2, v3, v4
+
+    /* 1-D FDCT */
+    add             v24.8h, v16.8h, v23.8h  /* tmp0 = dataptr[0] + dataptr[7]; */
+    sub             v31.8h, v16.8h, v23.8h  /* tmp7 = dataptr[0] - dataptr[7]; */
+    add             v25.8h, v17.8h, v22.8h  /* tmp1 = dataptr[1] + dataptr[6]; */
+    sub             v30.8h, v17.8h, v22.8h  /* tmp6 = dataptr[1] - dataptr[6]; */
+    add             v26.8h, v18.8h, v21.8h  /* tmp2 = dataptr[2] + dataptr[5]; */
+    sub             v29.8h, v18.8h, v21.8h  /* tmp5 = dataptr[2] - dataptr[5]; */
+    add             v27.8h, v19.8h, v20.8h  /* tmp3 = dataptr[3] + dataptr[4]; */
+    sub             v28.8h, v19.8h, v20.8h  /* tmp4 = dataptr[3] - dataptr[4]; */
+
+    /* even part */
+    add             v8.8h, v24.8h, v27.8h   /* tmp10 = tmp0 + tmp3; */
+    sub             v9.8h, v24.8h, v27.8h   /* tmp13 = tmp0 - tmp3; */
+    add             v10.8h, v25.8h, v26.8h  /* tmp11 = tmp1 + tmp2; */
+    sub             v11.8h, v25.8h, v26.8h  /* tmp12 = tmp1 - tmp2; */
+
+    add             v16.8h, v8.8h, v10.8h  /* tmp10 + tmp11 */
+    sub             v20.8h, v8.8h, v10.8h  /* tmp10 - tmp11 */
+
+    add             v18.8h, v11.8h, v9.8h  /* tmp12 + tmp13 */
+
+    srshr           v16.8h, v16.8h, #PASS1_BITS  /* dataptr[0] = (DCTELEM)DESCALE(tmp10 + tmp11, PASS1_BITS); */
+    srshr           v20.8h, v20.8h, #PASS1_BITS  /* dataptr[4] = (DCTELEM)DESCALE(tmp10 - tmp11, PASS1_BITS); */
+
+    smull2          v24.4s, v18.8h, XFIX_P_0_541  /* z1 hi = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
+    smull           v18.4s, v18.4h, XFIX_P_0_541  /* z1 lo = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
+    mov             v22.16b, v18.16b
+    mov             v25.16b, v24.16b
+
+    smlal           v18.4s, v9.4h, XFIX_P_0_765   /* lo z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
+    smlal2          v24.4s, v9.8h, XFIX_P_0_765   /* hi z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
+    smlal           v22.4s, v11.4h, XFIX_N_1_847  /* lo z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
+    smlal2          v25.4s, v11.8h, XFIX_N_1_847  /* hi z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
+
+    rshrn           v18.4h, v18.4s, #DESCALE_P2
+    rshrn           v22.4h, v22.4s, #DESCALE_P2
+    rshrn2          v18.8h, v24.4s, #DESCALE_P2  /* dataptr[2] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp13, XFIX_P_0_765), CONST_BITS-PASS1_BITS); */
+    rshrn2          v22.8h, v25.4s, #DESCALE_P2  /* dataptr[6] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp12, XFIX_N_1_847), CONST_BITS-PASS1_BITS); */
+
+    /* Odd part */
+    add             v8.8h, v28.8h, v31.8h   /* z1 = tmp4 + tmp7; */
+    add             v9.8h, v29.8h, v30.8h   /* z2 = tmp5 + tmp6; */
+    add             v10.8h, v28.8h, v30.8h  /* z3 = tmp4 + tmp6; */
+    add             v11.8h, v29.8h, v31.8h  /* z4 = tmp5 + tmp7; */
+
+    smull           v4.4s, v10.4h, XFIX_P_1_175  /* z5 lo = z3 lo * XFIX_P_1_175 */
+    smull2          v5.4s, v10.8h, XFIX_P_1_175
+    smlal           v4.4s, v11.4h, XFIX_P_1_175  /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602); */
+    smlal2          v5.4s, v11.8h, XFIX_P_1_175
+
+    smull2          v24.4s, v28.8h, XFIX_P_0_298
+    smull2          v25.4s, v29.8h, XFIX_P_2_053
+    smull2          v26.4s, v30.8h, XFIX_P_3_072
+    smull2          v27.4s, v31.8h, XFIX_P_1_501
+    smull           v28.4s, v28.4h, XFIX_P_0_298  /* tmp4 = MULTIPLY(tmp4, FIX_0_298631336); */
+    smull           v29.4s, v29.4h, XFIX_P_2_053  /* tmp5 = MULTIPLY(tmp5, FIX_2_053119869); */
+    smull           v30.4s, v30.4h, XFIX_P_3_072  /* tmp6 = MULTIPLY(tmp6, FIX_3_072711026); */
+    smull           v31.4s, v31.4h, XFIX_P_1_501  /* tmp7 = MULTIPLY(tmp7, FIX_1_501321110); */
+
+    smull2          v12.4s, v8.8h, XFIX_N_0_899
+    smull2          v13.4s, v9.8h, XFIX_N_2_562
+    smull2          v14.4s, v10.8h, XFIX_N_1_961
+    smull2          v15.4s, v11.8h, XFIX_N_0_390
+    smull           v8.4s, v8.4h, XFIX_N_0_899    /* z1 = MULTIPLY(z1, -FIX_0_899976223); */
+    smull           v9.4s, v9.4h, XFIX_N_2_562    /* z2 = MULTIPLY(z2, -FIX_2_562915447); */
+    smull           v10.4s, v10.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560); */
+    smull           v11.4s, v11.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644); */
+
+    add             v10.4s, v10.4s, v4.4s
+    add             v14.4s, v14.4s, v5.4s
+    add             v11.4s, v11.4s, v4.4s
+    add             v15.4s, v15.4s, v5.4s
+
+    add             v28.4s, v28.4s, v8.4s   /* tmp4 += z1 */
+    add             v24.4s, v24.4s, v12.4s
+    add             v29.4s, v29.4s, v9.4s   /* tmp5 += z2 */
+    add             v25.4s, v25.4s, v13.4s
+    add             v30.4s, v30.4s, v10.4s  /* tmp6 += z3 */
+    add             v26.4s, v26.4s, v14.4s
+    add             v31.4s, v31.4s, v11.4s  /* tmp7 += z4 */
+    add             v27.4s, v27.4s, v15.4s
+
+    add             v28.4s, v28.4s, v10.4s  /* tmp4 += z3 */
+    add             v24.4s, v24.4s, v14.4s
+    add             v29.4s, v29.4s, v11.4s  /* tmp5 += z4 */
+    add             v25.4s, v25.4s, v15.4s
+    add             v30.4s, v30.4s, v9.4s   /* tmp6 += z2 */
+    add             v26.4s, v26.4s, v13.4s
+    add             v31.4s, v31.4s, v8.4s   /* tmp7 += z1 */
+    add             v27.4s, v27.4s, v12.4s
+
+    rshrn           v23.4h, v28.4s, #DESCALE_P2
+    rshrn           v21.4h, v29.4s, #DESCALE_P2
+    rshrn           v19.4h, v30.4s, #DESCALE_P2
+    rshrn           v17.4h, v31.4s, #DESCALE_P2
+    rshrn2          v23.8h, v24.4s, #DESCALE_P2  /* dataptr[7] = (DCTELEM)DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS); */
+    rshrn2          v21.8h, v25.4s, #DESCALE_P2  /* dataptr[5] = (DCTELEM)DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS); */
+    rshrn2          v19.8h, v26.4s, #DESCALE_P2  /* dataptr[3] = (DCTELEM)DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS); */
+    rshrn2          v17.8h, v27.4s, #DESCALE_P2  /* dataptr[1] = (DCTELEM)DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS); */
+
+    /* store results */
+    st1             {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
+    st1             {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
+
+    /* Restore Neon registers */
+    ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
+    ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
+
+    br              x30
+
+    .unreq          DATA
+    .unreq          TMP
+
+#undef XFIX_P_0_298
+#undef XFIX_N_0_390
+#undef XFIX_P_0_541
+#undef XFIX_P_0_765
+#undef XFIX_N_0_899
+#undef XFIX_P_1_175
+#undef XFIX_P_1_501
+#undef XFIX_N_1_847
+#undef XFIX_N_1_961
+#undef XFIX_P_2_053
+#undef XFIX_N_2_562
+#undef XFIX_P_3_072
+
+
+/*****************************************************************************/
+
+/*
+ * GLOBAL(JOCTET *)
+ * jsimd_huff_encode_one_block(working_state *state, JOCTET *buffer,
+ *                             JCOEFPTR block, int last_dc_val,
+ *                             c_derived_tbl *dctbl, c_derived_tbl *actbl)
+ *
+ */
+
+    BUFFER          .req x1
+    PUT_BUFFER      .req x6
+    PUT_BITS        .req x7
+    PUT_BITSw       .req w7
+
+.macro emit_byte
+    sub             PUT_BITS, PUT_BITS, #0x8
+    lsr             x19, PUT_BUFFER, PUT_BITS
+    uxtb            w19, w19
+    strb            w19, [BUFFER, #1]!
+    cmp             w19, #0xff
+    b.ne            14f
+    strb            wzr, [BUFFER, #1]!
+14:
+.endm
+.macro put_bits CODE, SIZE
+    lsl             PUT_BUFFER, PUT_BUFFER, \SIZE
+    add             PUT_BITS, PUT_BITS, \SIZE
+    orr             PUT_BUFFER, PUT_BUFFER, \CODE
+.endm
+.macro checkbuf31
+    cmp             PUT_BITS, #0x20
+    b.lt            31f
+    emit_byte
+    emit_byte
+    emit_byte
+    emit_byte
+31:
+.endm
+.macro checkbuf47
+    cmp             PUT_BITS, #0x30
+    b.lt            47f
+    emit_byte
+    emit_byte
+    emit_byte
+    emit_byte
+    emit_byte
+    emit_byte
+47:
+.endm
+
+.macro generate_jsimd_huff_encode_one_block fast_tbl
+
+.if \fast_tbl == 1
+asm_function jsimd_huff_encode_one_block_neon
+.else
+asm_function jsimd_huff_encode_one_block_neon_slowtbl
+.endif
+    sub             sp, sp, 272
+    sub             BUFFER, BUFFER, #0x1    /* BUFFER=buffer-- */
+    /* Save Arm registers */
+    stp             x19, x20, [sp]
+    get_symbol_loc  x15, Ljsimd_huff_encode_one_block_neon_consts
+    ldr             PUT_BUFFER, [x0, #0x10]
+    ldr             PUT_BITSw, [x0, #0x18]
+    ldrsh           w12, [x2]               /* load DC coeff in w12 */
+    /* prepare data */
+.if \fast_tbl == 1
+    ld1             {v23.16b}, [x15], #16
+    ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x15], #64
+    ld1             {v4.16b, v5.16b, v6.16b, v7.16b}, [x15], #64
+    ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x15], #64
+    ld1             {v24.16b, v25.16b, v26.16b, v27.16b}, [x2], #64
+    ld1             {v28.16b, v29.16b, v30.16b, v31.16b}, [x2], #64
+    sub             w12, w12, w3      /* last_dc_val, not used afterwards */
+    /* ZigZag 8x8 */
+    tbl             v0.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v0.16b
+    tbl             v1.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v1.16b
+    tbl             v2.16b, {v25.16b, v26.16b, v27.16b, v28.16b}, v2.16b
+    tbl             v3.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v3.16b
+    tbl             v4.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v4.16b
+    tbl             v5.16b, {v25.16b, v26.16b, v27.16b, v28.16b}, v5.16b
+    tbl             v6.16b, {v27.16b, v28.16b, v29.16b, v30.16b}, v6.16b
+    tbl             v7.16b, {v29.16b, v30.16b, v31.16b}, v7.16b
+    ins             v0.h[0], w12
+    tbx             v1.16b, {v28.16b}, v16.16b
+    tbx             v2.16b, {v29.16b, v30.16b}, v17.16b
+    tbx             v5.16b, {v29.16b, v30.16b}, v18.16b
+    tbx             v6.16b, {v31.16b}, v19.16b
+.else
+      add             x13, x2, #0x22
+      sub             w12, w12, w3    /* last_dc_val, not used afterwards */
+    ld1             {v23.16b}, [x15]
+      add             x14, x2, #0x18
+      add             x3, x2, #0x36
+    ins             v0.h[0], w12
+      add             x9, x2, #0x2
+    ld1             {v1.h}[0], [x13]
+      add             x15, x2, #0x30
+    ld1             {v2.h}[0], [x14]
+      add             x19, x2, #0x26
+    ld1             {v3.h}[0], [x3]
+      add             x20, x2, #0x28
+    ld1             {v0.h}[1], [x9]
+      add             x12, x2, #0x10
+    ld1             {v1.h}[1], [x15]
+      add             x13, x2, #0x40
+    ld1             {v2.h}[1], [x19]
+      add             x14, x2, #0x34
+    ld1             {v3.h}[1], [x20]
+      add             x3, x2, #0x1a
+    ld1             {v0.h}[2], [x12]
+      add             x9, x2, #0x20
+    ld1             {v1.h}[2], [x13]
+      add             x15, x2, #0x32
+    ld1             {v2.h}[2], [x14]
+      add             x19, x2, #0x42
+    ld1             {v3.h}[2], [x3]
+      add             x20, x2, #0xc
+    ld1             {v0.h}[3], [x9]
+      add             x12, x2, #0x12
+    ld1             {v1.h}[3], [x15]
+      add             x13, x2, #0x24
+    ld1             {v2.h}[3], [x19]
+      add             x14, x2, #0x50
+    ld1             {v3.h}[3], [x20]
+      add             x3, x2, #0xe
+    ld1             {v0.h}[4], [x12]
+      add             x9, x2, #0x4
+    ld1             {v1.h}[4], [x13]
+      add             x15, x2, #0x16
+    ld1             {v2.h}[4], [x14]
+      add             x19, x2, #0x60
+    ld1             {v3.h}[4], [x3]
+      add             x20, x2, #0x1c
+    ld1             {v0.h}[5], [x9]
+      add             x12, x2, #0x6
+    ld1             {v1.h}[5], [x15]
+      add             x13, x2, #0x8
+    ld1             {v2.h}[5], [x19]
+      add             x14, x2, #0x52
+    ld1             {v3.h}[5], [x20]
+      add             x3, x2, #0x2a
+    ld1             {v0.h}[6], [x12]
+      add             x9, x2, #0x14
+    ld1             {v1.h}[6], [x13]
+      add             x15, x2, #0xa
+    ld1             {v2.h}[6], [x14]
+      add             x19, x2, #0x44
+    ld1             {v3.h}[6], [x3]
+      add             x20, x2, #0x38
+    ld1             {v0.h}[7], [x9]
+      add             x12, x2, #0x46
+    ld1             {v1.h}[7], [x15]
+      add             x13, x2, #0x3a
+    ld1             {v2.h}[7], [x19]
+      add             x14, x2, #0x74
+    ld1             {v3.h}[7], [x20]
+      add             x3, x2, #0x6a
+    ld1             {v4.h}[0], [x12]
+      add             x9, x2, #0x54
+    ld1             {v5.h}[0], [x13]
+      add             x15, x2, #0x2c
+    ld1             {v6.h}[0], [x14]
+      add             x19, x2, #0x76
+    ld1             {v7.h}[0], [x3]
+      add             x20, x2, #0x78
+    ld1             {v4.h}[1], [x9]
+      add             x12, x2, #0x62
+    ld1             {v5.h}[1], [x15]
+      add             x13, x2, #0x1e
+    ld1             {v6.h}[1], [x19]
+      add             x14, x2, #0x68
+    ld1             {v7.h}[1], [x20]
+      add             x3, x2, #0x7a
+    ld1             {v4.h}[2], [x12]
+      add             x9, x2, #0x70
+    ld1             {v5.h}[2], [x13]
+      add             x15, x2, #0x2e
+    ld1             {v6.h}[2], [x14]
+      add             x19, x2, #0x5a
+    ld1             {v7.h}[2], [x3]
+      add             x20, x2, #0x6c
+    ld1             {v4.h}[3], [x9]
+      add             x12, x2, #0x72
+    ld1             {v5.h}[3], [x15]
+      add             x13, x2, #0x3c
+    ld1             {v6.h}[3], [x19]
+      add             x14, x2, #0x4c
+    ld1             {v7.h}[3], [x20]
+      add             x3, x2, #0x5e
+    ld1             {v4.h}[4], [x12]
+      add             x9, x2, #0x64
+    ld1             {v5.h}[4], [x13]
+      add             x15, x2, #0x4a
+    ld1             {v6.h}[4], [x14]
+      add             x19, x2, #0x3e
+    ld1             {v7.h}[4], [x3]
+      add             x20, x2, #0x6e
+    ld1             {v4.h}[5], [x9]
+      add             x12, x2, #0x56
+    ld1             {v5.h}[5], [x15]
+      add             x13, x2, #0x58
+    ld1             {v6.h}[5], [x19]
+      add             x14, x2, #0x4e
+    ld1             {v7.h}[5], [x20]
+      add             x3, x2, #0x7c
+    ld1             {v4.h}[6], [x12]
+      add             x9, x2, #0x48
+    ld1             {v5.h}[6], [x13]
+      add             x15, x2, #0x66
+    ld1             {v6.h}[6], [x14]
+      add             x19, x2, #0x5c
+    ld1             {v7.h}[6], [x3]
+      add             x20, x2, #0x7e
+    ld1             {v4.h}[7], [x9]
+    ld1             {v5.h}[7], [x15]
+    ld1             {v6.h}[7], [x19]
+    ld1             {v7.h}[7], [x20]
+.endif
+    cmlt            v24.8h, v0.8h, #0
+    cmlt            v25.8h, v1.8h, #0
+    cmlt            v26.8h, v2.8h, #0
+    cmlt            v27.8h, v3.8h, #0
+    cmlt            v28.8h, v4.8h, #0
+    cmlt            v29.8h, v5.8h, #0
+    cmlt            v30.8h, v6.8h, #0
+    cmlt            v31.8h, v7.8h, #0
+    abs             v0.8h, v0.8h
+    abs             v1.8h, v1.8h
+    abs             v2.8h, v2.8h
+    abs             v3.8h, v3.8h
+    abs             v4.8h, v4.8h
+    abs             v5.8h, v5.8h
+    abs             v6.8h, v6.8h
+    abs             v7.8h, v7.8h
+    eor             v24.16b, v24.16b, v0.16b
+    eor             v25.16b, v25.16b, v1.16b
+    eor             v26.16b, v26.16b, v2.16b
+    eor             v27.16b, v27.16b, v3.16b
+    eor             v28.16b, v28.16b, v4.16b
+    eor             v29.16b, v29.16b, v5.16b
+    eor             v30.16b, v30.16b, v6.16b
+    eor             v31.16b, v31.16b, v7.16b
+    cmeq            v16.8h, v0.8h, #0
+    cmeq            v17.8h, v1.8h, #0
+    cmeq            v18.8h, v2.8h, #0
+    cmeq            v19.8h, v3.8h, #0
+    cmeq            v20.8h, v4.8h, #0
+    cmeq            v21.8h, v5.8h, #0
+    cmeq            v22.8h, v6.8h, #0
+    xtn             v16.8b, v16.8h
+    xtn             v18.8b, v18.8h
+    xtn             v20.8b, v20.8h
+    xtn             v22.8b, v22.8h
+      umov            w14, v0.h[0]
+    xtn2            v16.16b, v17.8h
+      umov            w13, v24.h[0]
+    xtn2            v18.16b, v19.8h
+      clz             w14, w14
+    xtn2            v20.16b, v21.8h
+      lsl             w13, w13, w14
+    cmeq            v17.8h, v7.8h, #0
+      sub             w12, w14, #32
+    xtn2            v22.16b, v17.8h
+      lsr             w13, w13, w14
+    and             v16.16b, v16.16b, v23.16b
+      neg             w12, w12
+    and             v18.16b, v18.16b, v23.16b
+      add             x3, x4, #0x400           /* r1 = dctbl->ehufsi */
+    and             v20.16b, v20.16b, v23.16b
+      add             x15, sp, #0x90           /* x15 = t2 */
+    and             v22.16b, v22.16b, v23.16b
+      ldr             w10, [x4, x12, lsl #2]
+    addp            v16.16b, v16.16b, v18.16b
+      ldrb            w11, [x3, x12]
+    addp            v20.16b, v20.16b, v22.16b
+      checkbuf47
+    addp            v16.16b, v16.16b, v20.16b
+      put_bits        x10, x11
+    addp            v16.16b, v16.16b, v18.16b
+      checkbuf47
+    umov            x9, v16.D[0]
+      put_bits        x13, x12
+    cnt             v17.8b, v16.8b
+      mvn             x9, x9
+    addv            B18, v17.8b
+      add             x4, x5, #0x400   /* x4 = actbl->ehufsi */
+    umov            w12, v18.b[0]
+      lsr             x9, x9, #0x1     /* clear AC coeff */
+    ldr             w13, [x5, #0x3c0]  /* x13 = actbl->ehufco[0xf0] */
+    rbit            x9, x9             /* x9 = index0 */
+    ldrb            w14, [x4, #0xf0]   /* x14 = actbl->ehufsi[0xf0] */
+    cmp             w12, #(64-8)
+    add             x11, sp, #16
+    b.lt            4f
+    cbz             x9, 6f
+    st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x11], #64
+    st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x11], #64
+    st1             {v24.8h, v25.8h, v26.8h, v27.8h}, [x11], #64
+    st1             {v28.8h, v29.8h, v30.8h, v31.8h}, [x11], #64
+1:
+    clz             x2, x9
+    add             x15, x15, x2, lsl #1
+    lsl             x9, x9, x2
+    ldrh            w20, [x15, #-126]
+2:
+    cmp             x2, #0x10
+    b.lt            3f
+    sub             x2, x2, #0x10
+    checkbuf47
+    put_bits        x13, x14
+    b               2b
+3:
+    clz             w20, w20
+    ldrh            w3, [x15, #2]!
+    sub             w11, w20, #32
+    lsl             w3, w3, w20
+    neg             w11, w11
+    lsr             w3, w3, w20
+    add             x2, x11, x2, lsl #4
+    lsl             x9, x9, #0x1
+    ldr             w12, [x5, x2, lsl #2]
+    ldrb            w10, [x4, x2]
+    checkbuf31
+    put_bits        x12, x10
+    put_bits        x3, x11
+    cbnz            x9, 1b
+    b               6f
+4:
+    movi            v21.8h, #0x0010
+    clz             v0.8h, v0.8h
+    clz             v1.8h, v1.8h
+    clz             v2.8h, v2.8h
+    clz             v3.8h, v3.8h
+    clz             v4.8h, v4.8h
+    clz             v5.8h, v5.8h
+    clz             v6.8h, v6.8h
+    clz             v7.8h, v7.8h
+    ushl            v24.8h, v24.8h, v0.8h
+    ushl            v25.8h, v25.8h, v1.8h
+    ushl            v26.8h, v26.8h, v2.8h
+    ushl            v27.8h, v27.8h, v3.8h
+    ushl            v28.8h, v28.8h, v4.8h
+    ushl            v29.8h, v29.8h, v5.8h
+    ushl            v30.8h, v30.8h, v6.8h
+    ushl            v31.8h, v31.8h, v7.8h
+    neg             v0.8h, v0.8h
+    neg             v1.8h, v1.8h
+    neg             v2.8h, v2.8h
+    neg             v3.8h, v3.8h
+    neg             v4.8h, v4.8h
+    neg             v5.8h, v5.8h
+    neg             v6.8h, v6.8h
+    neg             v7.8h, v7.8h
+    ushl            v24.8h, v24.8h, v0.8h
+    ushl            v25.8h, v25.8h, v1.8h
+    ushl            v26.8h, v26.8h, v2.8h
+    ushl            v27.8h, v27.8h, v3.8h
+    ushl            v28.8h, v28.8h, v4.8h
+    ushl            v29.8h, v29.8h, v5.8h
+    ushl            v30.8h, v30.8h, v6.8h
+    ushl            v31.8h, v31.8h, v7.8h
+    add             v0.8h, v21.8h, v0.8h
+    add             v1.8h, v21.8h, v1.8h
+    add             v2.8h, v21.8h, v2.8h
+    add             v3.8h, v21.8h, v3.8h
+    add             v4.8h, v21.8h, v4.8h
+    add             v5.8h, v21.8h, v5.8h
+    add             v6.8h, v21.8h, v6.8h
+    add             v7.8h, v21.8h, v7.8h
+    st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x11], #64
+    st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x11], #64
+    st1             {v24.8h, v25.8h, v26.8h, v27.8h}, [x11], #64
+    st1             {v28.8h, v29.8h, v30.8h, v31.8h}, [x11], #64
+1:
+    clz             x2, x9
+    add             x15, x15, x2, lsl #1
+    lsl             x9, x9, x2
+    ldrh            w11, [x15, #-126]
+2:
+    cmp             x2, #0x10
+    b.lt            3f
+    sub             x2, x2, #0x10
+    checkbuf47
+    put_bits        x13, x14
+    b               2b
+3:
+    ldrh            w3, [x15, #2]!
+    add             x2, x11, x2, lsl #4
+    lsl             x9, x9, #0x1
+    ldr             w12, [x5, x2, lsl #2]
+    ldrb            w10, [x4, x2]
+    checkbuf31
+    put_bits        x12, x10
+    put_bits        x3, x11
+    cbnz            x9, 1b
+6:
+    add             x13, sp, #0x10e
+    cmp             x15, x13
+    b.hs            1f
+    ldr             w12, [x5]
+    ldrb            w14, [x4]
+    checkbuf47
+    put_bits        x12, x14
+1:
+    str             PUT_BUFFER, [x0, #0x10]
+    str             PUT_BITSw, [x0, #0x18]
+    ldp             x19, x20, [sp], 16
+    add             x0, BUFFER, #0x1
+    add             sp, sp, 256
+    br              x30
+
+.endm
+
+generate_jsimd_huff_encode_one_block 1
+generate_jsimd_huff_encode_one_block 0
+
+    .unreq          BUFFER
+    .unreq          PUT_BUFFER
+    .unreq          PUT_BITS
+    .unreq          PUT_BITSw
+
+.purgem emit_byte
+.purgem put_bits
+.purgem checkbuf31
+.purgem checkbuf47
diff --git a/media/libjpeg/simd/arm/align.h b/media/libjpeg/simd/arm/align.h
new file mode 100644
index 0000000000..cff4241e84
--- /dev/null
+++ b/media/libjpeg/simd/arm/align.h
@@ -0,0 +1,28 @@
+/*
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* How to obtain memory alignment for structures and variables */
+#if defined(_MSC_VER)
+#define ALIGN(alignment)  __declspec(align(alignment))
+#elif defined(__clang__) || defined(__GNUC__)
+#define ALIGN(alignment)  __attribute__((aligned(alignment)))
+#else
+#error "Unknown compiler"
+#endif
diff --git a/media/libjpeg/simd/arm/jccolor-neon.c b/media/libjpeg/simd/arm/jccolor-neon.c
new file mode 100644
index 0000000000..9fcc62dd25
--- /dev/null
+++ b/media/libjpeg/simd/arm/jccolor-neon.c
@@ -0,0 +1,160 @@
+/*
+ * jccolor-neon.c - colorspace conversion (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ * Copyright (C) 2020, D. R. Commander.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "align.h"
+#include "neon-compat.h"
+
+#include <arm_neon.h>
+
+
+/* RGB -> YCbCr conversion constants */
+
+#define F_0_298  19595
+#define F_0_587  38470
+#define F_0_113  7471
+#define F_0_168  11059
+#define F_0_331  21709
+#define F_0_500  32768
+#define F_0_418  27439
+#define F_0_081  5329
+
+ALIGN(16) static const uint16_t jsimd_rgb_ycc_neon_consts[] = {
+  F_0_298, F_0_587, F_0_113, F_0_168,
+  F_0_331, F_0_500, F_0_418, F_0_081
+};
+
+
+/* Include inline routines for colorspace extensions. */
+
+#if defined(__aarch64__) || defined(_M_ARM64)
+#include "aarch64/jccolext-neon.c"
+#else
+#include "aarch32/jccolext-neon.c"
+#endif
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+
+#define RGB_RED  EXT_RGB_RED
+#define RGB_GREEN  EXT_RGB_GREEN
+#define RGB_BLUE  EXT_RGB_BLUE
+#define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+#define jsimd_rgb_ycc_convert_neon  jsimd_extrgb_ycc_convert_neon
+#if defined(__aarch64__) || defined(_M_ARM64)
+#include "aarch64/jccolext-neon.c"
+#else
+#include "aarch32/jccolext-neon.c"
+#endif
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_neon
+
+#define RGB_RED  EXT_RGBX_RED
+#define RGB_GREEN  EXT_RGBX_GREEN
+#define RGB_BLUE  EXT_RGBX_BLUE
+#define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+#define jsimd_rgb_ycc_convert_neon  jsimd_extrgbx_ycc_convert_neon
+#if defined(__aarch64__) || defined(_M_ARM64)
+#include "aarch64/jccolext-neon.c"
+#else
+#include "aarch32/jccolext-neon.c"
+#endif
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_neon
+
+#define RGB_RED  EXT_BGR_RED
+#define RGB_GREEN  EXT_BGR_GREEN
+#define RGB_BLUE  EXT_BGR_BLUE
+#define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+#define jsimd_rgb_ycc_convert_neon  jsimd_extbgr_ycc_convert_neon
+#if defined(__aarch64__) || defined(_M_ARM64)
+#include "aarch64/jccolext-neon.c"
+#else
+#include "aarch32/jccolext-neon.c"
+#endif
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_neon
+
+#define RGB_RED  EXT_BGRX_RED
+#define RGB_GREEN  EXT_BGRX_GREEN
+#define RGB_BLUE  EXT_BGRX_BLUE
+#define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+#define jsimd_rgb_ycc_convert_neon  jsimd_extbgrx_ycc_convert_neon
+#if defined(__aarch64__) || defined(_M_ARM64)
+#include "aarch64/jccolext-neon.c"
+#else
+#include "aarch32/jccolext-neon.c"
+#endif
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_neon
+
+#define RGB_RED  EXT_XBGR_RED
+#define RGB_GREEN  EXT_XBGR_GREEN
+#define RGB_BLUE  EXT_XBGR_BLUE
+#define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+#define jsimd_rgb_ycc_convert_neon  jsimd_extxbgr_ycc_convert_neon
+#if defined(__aarch64__) || defined(_M_ARM64)
+#include "aarch64/jccolext-neon.c"
+#else
+#include "aarch32/jccolext-neon.c"
+#endif
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_neon
+
+#define RGB_RED  EXT_XRGB_RED
+#define RGB_GREEN  EXT_XRGB_GREEN
+#define RGB_BLUE  EXT_XRGB_BLUE
+#define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+#define jsimd_rgb_ycc_convert_neon  jsimd_extxrgb_ycc_convert_neon
+#if defined(__aarch64__) || defined(_M_ARM64)
+#include "aarch64/jccolext-neon.c"
+#else
+#include "aarch32/jccolext-neon.c"
+#endif
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_neon
diff --git a/media/libjpeg/simd/arm/jcgray-neon.c b/media/libjpeg/simd/arm/jcgray-neon.c
new file mode 100644
index 0000000000..71c7b2de21
--- /dev/null
+++ b/media/libjpeg/simd/arm/jcgray-neon.c
@@ -0,0 +1,120 @@
+/*
+ * jcgray-neon.c - grayscale colorspace conversion (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "align.h"
+
+#include <arm_neon.h>
+
+
+/* RGB -> Grayscale conversion constants */
+
+#define F_0_298  19595
+#define F_0_587  38470
+#define F_0_113  7471
+
+
+/* Include inline routines for colorspace extensions. */
+
+#include "jcgryext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+
+#define RGB_RED  EXT_RGB_RED
+#define RGB_GREEN  EXT_RGB_GREEN
+#define RGB_BLUE  EXT_RGB_BLUE
+#define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+#define jsimd_rgb_gray_convert_neon  jsimd_extrgb_gray_convert_neon
+#include "jcgryext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_neon
+
+#define RGB_RED  EXT_RGBX_RED
+#define RGB_GREEN  EXT_RGBX_GREEN
+#define RGB_BLUE  EXT_RGBX_BLUE
+#define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+#define jsimd_rgb_gray_convert_neon  jsimd_extrgbx_gray_convert_neon
+#include "jcgryext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_neon
+
+#define RGB_RED  EXT_BGR_RED
+#define RGB_GREEN  EXT_BGR_GREEN
+#define RGB_BLUE  EXT_BGR_BLUE
+#define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+#define jsimd_rgb_gray_convert_neon  jsimd_extbgr_gray_convert_neon
+#include "jcgryext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_neon
+
+#define RGB_RED  EXT_BGRX_RED
+#define RGB_GREEN  EXT_BGRX_GREEN
+#define RGB_BLUE  EXT_BGRX_BLUE
+#define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+#define jsimd_rgb_gray_convert_neon  jsimd_extbgrx_gray_convert_neon
+#include "jcgryext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_neon
+
+#define RGB_RED  EXT_XBGR_RED
+#define RGB_GREEN  EXT_XBGR_GREEN
+#define RGB_BLUE  EXT_XBGR_BLUE
+#define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+#define jsimd_rgb_gray_convert_neon  jsimd_extxbgr_gray_convert_neon
+#include "jcgryext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_neon
+
+#define RGB_RED  EXT_XRGB_RED
+#define RGB_GREEN  EXT_XRGB_GREEN
+#define RGB_BLUE  EXT_XRGB_BLUE
+#define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+#define jsimd_rgb_gray_convert_neon  jsimd_extxrgb_gray_convert_neon
+#include "jcgryext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_neon
diff --git a/media/libjpeg/simd/arm/jcgryext-neon.c b/media/libjpeg/simd/arm/jcgryext-neon.c
new file mode 100644
index 0000000000..416a7385df
--- /dev/null
+++ b/media/libjpeg/simd/arm/jcgryext-neon.c
@@ -0,0 +1,106 @@
+/*
+ * jcgryext-neon.c - grayscale colorspace conversion (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jcgray-neon.c */
+
+
+/* RGB -> Grayscale conversion is defined by the following equation:
+ *    Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
+ *
+ * Avoid floating point arithmetic by using shifted integer constants:
+ *    0.29899597 = 19595 * 2^-16
+ *    0.58700561 = 38470 * 2^-16
+ *    0.11399841 =  7471 * 2^-16
+ * These constants are defined in jcgray-neon.c
+ *
+ * This is the same computation as the RGB -> Y portion of RGB -> YCbCr.
+ */
+
+void jsimd_rgb_gray_convert_neon(JDIMENSION image_width, JSAMPARRAY input_buf,
+                                 JSAMPIMAGE output_buf, JDIMENSION output_row,
+                                 int num_rows)
+{
+  JSAMPROW inptr;
+  JSAMPROW outptr;
+  /* Allocate temporary buffer for final (image_width % 16) pixels in row. */
+  ALIGN(16) uint8_t tmp_buf[16 * RGB_PIXELSIZE];
+
+  while (--num_rows >= 0) {
+    inptr = *input_buf++;
+    outptr = output_buf[0][output_row];
+    output_row++;
+
+    int cols_remaining = image_width;
+    for (; cols_remaining > 0; cols_remaining -= 16) {
+
+      /* To prevent buffer overread by the vector load instructions, the last
+       * (image_width % 16) columns of data are first memcopied to a temporary
+       * buffer large enough to accommodate the vector load.
+       */
+      if (cols_remaining < 16) {
+        memcpy(tmp_buf, inptr, cols_remaining * RGB_PIXELSIZE);
+        inptr = tmp_buf;
+      }
+
+#if RGB_PIXELSIZE == 4
+      uint8x16x4_t input_pixels = vld4q_u8(inptr);
+#else
+      uint8x16x3_t input_pixels = vld3q_u8(inptr);
+#endif
+      uint16x8_t r_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_RED]));
+      uint16x8_t r_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_RED]));
+      uint16x8_t g_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_GREEN]));
+      uint16x8_t g_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_GREEN]));
+      uint16x8_t b_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_BLUE]));
+      uint16x8_t b_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_BLUE]));
+
+      /* Compute Y = 0.29900 * R + 0.58700 * G + 0.11400 * B */
+      uint32x4_t y_ll = vmull_n_u16(vget_low_u16(r_l), F_0_298);
+      uint32x4_t y_lh = vmull_n_u16(vget_high_u16(r_l), F_0_298);
+      uint32x4_t y_hl = vmull_n_u16(vget_low_u16(r_h), F_0_298);
+      uint32x4_t y_hh = vmull_n_u16(vget_high_u16(r_h), F_0_298);
+      y_ll = vmlal_n_u16(y_ll, vget_low_u16(g_l), F_0_587);
+      y_lh = vmlal_n_u16(y_lh, vget_high_u16(g_l), F_0_587);
+      y_hl = vmlal_n_u16(y_hl, vget_low_u16(g_h), F_0_587);
+      y_hh = vmlal_n_u16(y_hh, vget_high_u16(g_h), F_0_587);
+      y_ll = vmlal_n_u16(y_ll, vget_low_u16(b_l), F_0_113);
+      y_lh = vmlal_n_u16(y_lh, vget_high_u16(b_l), F_0_113);
+      y_hl = vmlal_n_u16(y_hl, vget_low_u16(b_h), F_0_113);
+      y_hh = vmlal_n_u16(y_hh, vget_high_u16(b_h), F_0_113);
+
+      /* Descale Y values (rounding right shift) and narrow to 16-bit. */
+      uint16x8_t y_l = vcombine_u16(vrshrn_n_u32(y_ll, 16),
+                                    vrshrn_n_u32(y_lh, 16));
+      uint16x8_t y_h = vcombine_u16(vrshrn_n_u32(y_hl, 16),
+                                    vrshrn_n_u32(y_hh, 16));
+
+      /* Narrow Y values to 8-bit and store to memory.  Buffer overwrite is
+       * permitted up to the next multiple of ALIGN_SIZE bytes.
+       */
+      vst1q_u8(outptr, vcombine_u8(vmovn_u16(y_l), vmovn_u16(y_h)));
+
+      /* Increment pointers. */
+      inptr += (16 * RGB_PIXELSIZE);
+      outptr += 16;
+    }
+  }
+}
diff --git a/media/libjpeg/simd/arm/jchuff.h b/media/libjpeg/simd/arm/jchuff.h
new file mode 100644
index 0000000000..2fbd252b9b
--- /dev/null
+++ b/media/libjpeg/simd/arm/jchuff.h
@@ -0,0 +1,131 @@
+/*
+ * jchuff.h
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1991-1997, Thomas G. Lane.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2009, 2018, 2021, D. R. Commander.
+ * Copyright (C) 2018, Matthias Räncker.
+ * Copyright (C) 2020-2021, Arm Limited.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ */
+
+/* Expanded entropy encoder object for Huffman encoding.
+ *
+ * The savable_state subrecord contains fields that change within an MCU,
+ * but must not be updated permanently until we complete the MCU.
+ */
+
+#if defined(__aarch64__) || defined(_M_ARM64)
+#define BIT_BUF_SIZE  64
+#else
+#define BIT_BUF_SIZE  32
+#endif
+
+typedef struct {
+  size_t put_buffer;                    /* current bit accumulation buffer */
+  int free_bits;                        /* # of bits available in it */
+  int last_dc_val[MAX_COMPS_IN_SCAN];   /* last DC coef for each component */
+} savable_state;
+
+typedef struct {
+  JOCTET *next_output_byte;     /* => next byte to write in buffer */
+  size_t free_in_buffer;        /* # of byte spaces remaining in buffer */
+  savable_state cur;            /* Current bit buffer & DC state */
+  j_compress_ptr cinfo;         /* dump_buffer needs access to this */
+  int simd;
+} working_state;
+
+/* Outputting bits to the file */
+
+/* Output byte b and, speculatively, an additional 0 byte. 0xFF must be encoded
+ * as 0xFF 0x00, so the output buffer pointer is advanced by 2 if the byte is
+ * 0xFF.  Otherwise, the output buffer pointer is advanced by 1, and the
+ * speculative 0 byte will be overwritten by the next byte.
+ */
+#define EMIT_BYTE(b) { \
+  buffer[0] = (JOCTET)(b); \
+  buffer[1] = 0; \
+  buffer -= -2 + ((JOCTET)(b) < 0xFF); \
+}
+
+/* Output the entire bit buffer.  If there are no 0xFF bytes in it, then write
+ * directly to the output buffer.  Otherwise, use the EMIT_BYTE() macro to
+ * encode 0xFF as 0xFF 0x00.
+ */
+#if defined(__aarch64__) || defined(_M_ARM64)
+
+#define FLUSH() { \
+  if (put_buffer & 0x8080808080808080 & ~(put_buffer + 0x0101010101010101)) { \
+    EMIT_BYTE(put_buffer >> 56) \
+    EMIT_BYTE(put_buffer >> 48) \
+    EMIT_BYTE(put_buffer >> 40) \
+    EMIT_BYTE(put_buffer >> 32) \
+    EMIT_BYTE(put_buffer >> 24) \
+    EMIT_BYTE(put_buffer >> 16) \
+    EMIT_BYTE(put_buffer >>  8) \
+    EMIT_BYTE(put_buffer      ) \
+  } else { \
+    *((uint64_t *)buffer) = BUILTIN_BSWAP64(put_buffer); \
+    buffer += 8; \
+  } \
+}
+
+#else
+
+#if defined(_MSC_VER) && !defined(__clang__)
+#define SPLAT() { \
+  buffer[0] = (JOCTET)(put_buffer >> 24); \
+  buffer[1] = (JOCTET)(put_buffer >> 16); \
+  buffer[2] = (JOCTET)(put_buffer >>  8); \
+  buffer[3] = (JOCTET)(put_buffer      ); \
+  buffer += 4; \
+}
+#else
+#define SPLAT() { \
+  put_buffer = __builtin_bswap32(put_buffer); \
+  __asm__("str %1, [%0], #4" : "+r" (buffer) : "r" (put_buffer)); \
+}
+#endif
+
+#define FLUSH() { \
+  if (put_buffer & 0x80808080 & ~(put_buffer + 0x01010101)) { \
+    EMIT_BYTE(put_buffer >> 24) \
+    EMIT_BYTE(put_buffer >> 16) \
+    EMIT_BYTE(put_buffer >>  8) \
+    EMIT_BYTE(put_buffer      ) \
+  } else { \
+    SPLAT(); \
+  } \
+}
+
+#endif
+
+/* Fill the bit buffer to capacity with the leading bits from code, then output
+ * the bit buffer and put the remaining bits from code into the bit buffer.
+ */
+#define PUT_AND_FLUSH(code, size) { \
+  put_buffer = (put_buffer << (size + free_bits)) | (code >> -free_bits); \
+  FLUSH() \
+  free_bits += BIT_BUF_SIZE; \
+  put_buffer = code; \
+}
+
+/* Insert code into the bit buffer and output the bit buffer if needed.
+ * NOTE: We can't flush with free_bits == 0, since the left shift in
+ * PUT_AND_FLUSH() would have undefined behavior.
+ */
+#define PUT_BITS(code, size) { \
+  free_bits -= size; \
+  if (free_bits < 0) \
+    PUT_AND_FLUSH(code, size) \
+  else \
+    put_buffer = (put_buffer << size) | code; \
+}
+
+#define PUT_CODE(code, size, diff) { \
+  diff |= code << nbits; \
+  nbits += size; \
+  PUT_BITS(diff, nbits) \
+}
diff --git a/media/libjpeg/simd/arm/jcphuff-neon.c b/media/libjpeg/simd/arm/jcphuff-neon.c
new file mode 100644
index 0000000000..51db3c5f39
--- /dev/null
+++ b/media/libjpeg/simd/arm/jcphuff-neon.c
@@ -0,0 +1,623 @@
+/*
+ * jcphuff-neon.c - prepare data for progressive Huffman encoding (Arm Neon)
+ *
+ * Copyright (C) 2020-2021, Arm Limited.  All Rights Reserved.
+ * Copyright (C) 2022, Matthieu Darbois.  All Rights Reserved.
+ * Copyright (C) 2022, D. R. Commander.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "neon-compat.h"
+
+#include <arm_neon.h>
+
+
+/* Data preparation for encode_mcu_AC_first().
+ *
+ * The equivalent scalar C function (encode_mcu_AC_first_prepare()) can be
+ * found in jcphuff.c.
+ */
+
+void jsimd_encode_mcu_AC_first_prepare_neon
+  (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
+   UJCOEF *values, size_t *zerobits)
+{
+  UJCOEF *values_ptr = values;
+  UJCOEF *diff_values_ptr = values + DCTSIZE2;
+
+  /* Rows of coefficients to zero (since they haven't been processed) */
+  int i, rows_to_zero = 8;
+
+  for (i = 0; i < Sl / 16; i++) {
+    int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
+    int16x8_t coefs2 = vld1q_dup_s16(block + jpeg_natural_order_start[8]);
+    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
+    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
+    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
+    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
+    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
+    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
+    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[15], coefs2, 7);
+
+    /* Isolate sign of coefficients. */
+    uint16x8_t sign_coefs1 = vreinterpretq_u16_s16(vshrq_n_s16(coefs1, 15));
+    uint16x8_t sign_coefs2 = vreinterpretq_u16_s16(vshrq_n_s16(coefs2, 15));
+    /* Compute absolute value of coefficients and apply point transform Al. */
+    uint16x8_t abs_coefs1 = vreinterpretq_u16_s16(vabsq_s16(coefs1));
+    uint16x8_t abs_coefs2 = vreinterpretq_u16_s16(vabsq_s16(coefs2));
+    abs_coefs1 = vshlq_u16(abs_coefs1, vdupq_n_s16(-Al));
+    abs_coefs2 = vshlq_u16(abs_coefs2, vdupq_n_s16(-Al));
+
+    /* Compute diff values. */
+    uint16x8_t diff1 = veorq_u16(abs_coefs1, sign_coefs1);
+    uint16x8_t diff2 = veorq_u16(abs_coefs2, sign_coefs2);
+
+    /* Store transformed coefficients and diff values. */
+    vst1q_u16(values_ptr, abs_coefs1);
+    vst1q_u16(values_ptr + DCTSIZE, abs_coefs2);
+    vst1q_u16(diff_values_ptr, diff1);
+    vst1q_u16(diff_values_ptr + DCTSIZE, diff2);
+    values_ptr += 16;
+    diff_values_ptr += 16;
+    jpeg_natural_order_start += 16;
+    rows_to_zero -= 2;
+  }
+
+  /* Same operation but for remaining partial vector */
+  int remaining_coefs = Sl % 16;
+  if (remaining_coefs > 8) {
+    int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
+    int16x8_t coefs2 = vdupq_n_s16(0);
+    switch (remaining_coefs) {
+    case 15:
+      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 14:
+      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 13:
+      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 12:
+      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 11:
+      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 10:
+      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 9:
+      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[8], coefs2, 0);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    default:
+      break;
+    }
+
+    /* Isolate sign of coefficients. */
+    uint16x8_t sign_coefs1 = vreinterpretq_u16_s16(vshrq_n_s16(coefs1, 15));
+    uint16x8_t sign_coefs2 = vreinterpretq_u16_s16(vshrq_n_s16(coefs2, 15));
+    /* Compute absolute value of coefficients and apply point transform Al. */
+    uint16x8_t abs_coefs1 = vreinterpretq_u16_s16(vabsq_s16(coefs1));
+    uint16x8_t abs_coefs2 = vreinterpretq_u16_s16(vabsq_s16(coefs2));
+    abs_coefs1 = vshlq_u16(abs_coefs1, vdupq_n_s16(-Al));
+    abs_coefs2 = vshlq_u16(abs_coefs2, vdupq_n_s16(-Al));
+
+    /* Compute diff values. */
+    uint16x8_t diff1 = veorq_u16(abs_coefs1, sign_coefs1);
+    uint16x8_t diff2 = veorq_u16(abs_coefs2, sign_coefs2);
+
+    /* Store transformed coefficients and diff values. */
+    vst1q_u16(values_ptr, abs_coefs1);
+    vst1q_u16(values_ptr + DCTSIZE, abs_coefs2);
+    vst1q_u16(diff_values_ptr, diff1);
+    vst1q_u16(diff_values_ptr + DCTSIZE, diff2);
+    values_ptr += 16;
+    diff_values_ptr += 16;
+    rows_to_zero -= 2;
+
+  } else if (remaining_coefs > 0) {
+    int16x8_t coefs = vdupq_n_s16(0);
+
+    switch (remaining_coefs) {
+    case 8:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs, 7);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 7:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs, 6);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 6:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs, 5);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 5:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs, 4);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 4:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs, 3);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 3:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs, 2);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 2:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs, 1);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 1:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[0], coefs, 0);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    default:
+      break;
+    }
+
+    /* Isolate sign of coefficients. */
+    uint16x8_t sign_coefs = vreinterpretq_u16_s16(vshrq_n_s16(coefs, 15));
+    /* Compute absolute value of coefficients and apply point transform Al. */
+    uint16x8_t abs_coefs = vreinterpretq_u16_s16(vabsq_s16(coefs));
+    abs_coefs = vshlq_u16(abs_coefs, vdupq_n_s16(-Al));
+
+    /* Compute diff values. */
+    uint16x8_t diff = veorq_u16(abs_coefs, sign_coefs);
+
+    /* Store transformed coefficients and diff values. */
+    vst1q_u16(values_ptr, abs_coefs);
+    vst1q_u16(diff_values_ptr, diff);
+    values_ptr += 8;
+    diff_values_ptr += 8;
+    rows_to_zero--;
+  }
+
+  /* Zero remaining memory in the values and diff_values blocks. */
+  for (i = 0; i < rows_to_zero; i++) {
+    vst1q_u16(values_ptr, vdupq_n_u16(0));
+    vst1q_u16(diff_values_ptr, vdupq_n_u16(0));
+    values_ptr += 8;
+    diff_values_ptr += 8;
+  }
+
+  /* Construct zerobits bitmap.  A set bit means that the corresponding
+   * coefficient != 0.
+   */
+  uint16x8_t row0 = vld1q_u16(values + 0 * DCTSIZE);
+  uint16x8_t row1 = vld1q_u16(values + 1 * DCTSIZE);
+  uint16x8_t row2 = vld1q_u16(values + 2 * DCTSIZE);
+  uint16x8_t row3 = vld1q_u16(values + 3 * DCTSIZE);
+  uint16x8_t row4 = vld1q_u16(values + 4 * DCTSIZE);
+  uint16x8_t row5 = vld1q_u16(values + 5 * DCTSIZE);
+  uint16x8_t row6 = vld1q_u16(values + 6 * DCTSIZE);
+  uint16x8_t row7 = vld1q_u16(values + 7 * DCTSIZE);
+
+  uint8x8_t row0_eq0 = vmovn_u16(vceqq_u16(row0, vdupq_n_u16(0)));
+  uint8x8_t row1_eq0 = vmovn_u16(vceqq_u16(row1, vdupq_n_u16(0)));
+  uint8x8_t row2_eq0 = vmovn_u16(vceqq_u16(row2, vdupq_n_u16(0)));
+  uint8x8_t row3_eq0 = vmovn_u16(vceqq_u16(row3, vdupq_n_u16(0)));
+  uint8x8_t row4_eq0 = vmovn_u16(vceqq_u16(row4, vdupq_n_u16(0)));
+  uint8x8_t row5_eq0 = vmovn_u16(vceqq_u16(row5, vdupq_n_u16(0)));
+  uint8x8_t row6_eq0 = vmovn_u16(vceqq_u16(row6, vdupq_n_u16(0)));
+  uint8x8_t row7_eq0 = vmovn_u16(vceqq_u16(row7, vdupq_n_u16(0)));
+
+  /* { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 } */
+  const uint8x8_t bitmap_mask =
+    vreinterpret_u8_u64(vmov_n_u64(0x8040201008040201));
+
+  row0_eq0 = vand_u8(row0_eq0, bitmap_mask);
+  row1_eq0 = vand_u8(row1_eq0, bitmap_mask);
+  row2_eq0 = vand_u8(row2_eq0, bitmap_mask);
+  row3_eq0 = vand_u8(row3_eq0, bitmap_mask);
+  row4_eq0 = vand_u8(row4_eq0, bitmap_mask);
+  row5_eq0 = vand_u8(row5_eq0, bitmap_mask);
+  row6_eq0 = vand_u8(row6_eq0, bitmap_mask);
+  row7_eq0 = vand_u8(row7_eq0, bitmap_mask);
+
+  uint8x8_t bitmap_rows_01 = vpadd_u8(row0_eq0, row1_eq0);
+  uint8x8_t bitmap_rows_23 = vpadd_u8(row2_eq0, row3_eq0);
+  uint8x8_t bitmap_rows_45 = vpadd_u8(row4_eq0, row5_eq0);
+  uint8x8_t bitmap_rows_67 = vpadd_u8(row6_eq0, row7_eq0);
+  uint8x8_t bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
+  uint8x8_t bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
+  uint8x8_t bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);
+
+#if defined(__aarch64__) || defined(_M_ARM64)
+  /* Move bitmap to a 64-bit scalar register. */
+  uint64_t bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
+  /* Store zerobits bitmap. */
+  *zerobits = ~bitmap;
+#else
+  /* Move bitmap to two 32-bit scalar registers. */
+  uint32_t bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0);
+  uint32_t bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1);
+  /* Store zerobits bitmap. */
+  zerobits[0] = ~bitmap0;
+  zerobits[1] = ~bitmap1;
+#endif
+}
+
+
+/* Data preparation for encode_mcu_AC_refine().
+ *
+ * The equivalent scalar C function (encode_mcu_AC_refine_prepare()) can be
+ * found in jcphuff.c.
+ */
+
+int jsimd_encode_mcu_AC_refine_prepare_neon
+  (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
+   UJCOEF *absvalues, size_t *bits)
+{
+  /* Temporary storage buffers for data used to compute the signbits bitmap and
+   * the end-of-block (EOB) position
+   */
+  uint8_t coef_sign_bits[64];
+  uint8_t coef_eq1_bits[64];
+
+  UJCOEF *absvalues_ptr = absvalues;
+  uint8_t *coef_sign_bits_ptr = coef_sign_bits;
+  uint8_t *eq1_bits_ptr = coef_eq1_bits;
+
+  /* Rows of coefficients to zero (since they haven't been processed) */
+  int i, rows_to_zero = 8;
+
+  for (i = 0; i < Sl / 16; i++) {
+    int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
+    int16x8_t coefs2 = vld1q_dup_s16(block + jpeg_natural_order_start[8]);
+    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
+    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
+    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
+    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
+    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
+    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
+    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[15], coefs2, 7);
+
+    /* Compute and store data for signbits bitmap. */
+    uint8x8_t sign_coefs1 =
+      vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs1, 15)));
+    uint8x8_t sign_coefs2 =
+      vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs2, 15)));
+    vst1_u8(coef_sign_bits_ptr, sign_coefs1);
+    vst1_u8(coef_sign_bits_ptr + DCTSIZE, sign_coefs2);
+
+    /* Compute absolute value of coefficients and apply point transform Al. */
+    uint16x8_t abs_coefs1 = vreinterpretq_u16_s16(vabsq_s16(coefs1));
+    uint16x8_t abs_coefs2 = vreinterpretq_u16_s16(vabsq_s16(coefs2));
+    abs_coefs1 = vshlq_u16(abs_coefs1, vdupq_n_s16(-Al));
+    abs_coefs2 = vshlq_u16(abs_coefs2, vdupq_n_s16(-Al));
+    vst1q_u16(absvalues_ptr, abs_coefs1);
+    vst1q_u16(absvalues_ptr + DCTSIZE, abs_coefs2);
+
+    /* Test whether transformed coefficient values == 1 (used to find EOB
+     * position.)
+     */
+    uint8x8_t coefs_eq11 = vmovn_u16(vceqq_u16(abs_coefs1, vdupq_n_u16(1)));
+    uint8x8_t coefs_eq12 = vmovn_u16(vceqq_u16(abs_coefs2, vdupq_n_u16(1)));
+    vst1_u8(eq1_bits_ptr, coefs_eq11);
+    vst1_u8(eq1_bits_ptr + DCTSIZE, coefs_eq12);
+
+    absvalues_ptr += 16;
+    coef_sign_bits_ptr += 16;
+    eq1_bits_ptr += 16;
+    jpeg_natural_order_start += 16;
+    rows_to_zero -= 2;
+  }
+
+  /* Same operation but for remaining partial vector */
+  int remaining_coefs = Sl % 16;
+  if (remaining_coefs > 8) {
+    int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
+    int16x8_t coefs2 = vdupq_n_s16(0);
+    switch (remaining_coefs) {
+    case 15:
+      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 14:
+      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 13:
+      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 12:
+      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 11:
+      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 10:
+      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 9:
+      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[8], coefs2, 0);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    default:
+      break;
+    }
+
+    /* Compute and store data for signbits bitmap. */
+    uint8x8_t sign_coefs1 =
+      vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs1, 15)));
+    uint8x8_t sign_coefs2 =
+      vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs2, 15)));
+    vst1_u8(coef_sign_bits_ptr, sign_coefs1);
+    vst1_u8(coef_sign_bits_ptr + DCTSIZE, sign_coefs2);
+
+    /* Compute absolute value of coefficients and apply point transform Al. */
+    uint16x8_t abs_coefs1 = vreinterpretq_u16_s16(vabsq_s16(coefs1));
+    uint16x8_t abs_coefs2 = vreinterpretq_u16_s16(vabsq_s16(coefs2));
+    abs_coefs1 = vshlq_u16(abs_coefs1, vdupq_n_s16(-Al));
+    abs_coefs2 = vshlq_u16(abs_coefs2, vdupq_n_s16(-Al));
+    vst1q_u16(absvalues_ptr, abs_coefs1);
+    vst1q_u16(absvalues_ptr + DCTSIZE, abs_coefs2);
+
+    /* Test whether transformed coefficient values == 1 (used to find EOB
+     * position.)
+     */
+    uint8x8_t coefs_eq11 = vmovn_u16(vceqq_u16(abs_coefs1, vdupq_n_u16(1)));
+    uint8x8_t coefs_eq12 = vmovn_u16(vceqq_u16(abs_coefs2, vdupq_n_u16(1)));
+    vst1_u8(eq1_bits_ptr, coefs_eq11);
+    vst1_u8(eq1_bits_ptr + DCTSIZE, coefs_eq12);
+
+    absvalues_ptr += 16;
+    coef_sign_bits_ptr += 16;
+    eq1_bits_ptr += 16;
+    jpeg_natural_order_start += 16;
+    rows_to_zero -= 2;
+
+  } else if (remaining_coefs > 0) {
+    int16x8_t coefs = vdupq_n_s16(0);
+
+    switch (remaining_coefs) {
+    case 8:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs, 7);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 7:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs, 6);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 6:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs, 5);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 5:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs, 4);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 4:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs, 3);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 3:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs, 2);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 2:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs, 1);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 1:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[0], coefs, 0);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    default:
+      break;
+    }
+
+    /* Compute and store data for signbits bitmap. */
+    uint8x8_t sign_coefs =
+      vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs, 15)));
+    vst1_u8(coef_sign_bits_ptr, sign_coefs);
+
+    /* Compute absolute value of coefficients and apply point transform Al. */
+    uint16x8_t abs_coefs = vreinterpretq_u16_s16(vabsq_s16(coefs));
+    abs_coefs = vshlq_u16(abs_coefs, vdupq_n_s16(-Al));
+    vst1q_u16(absvalues_ptr, abs_coefs);
+
+    /* Test whether transformed coefficient values == 1 (used to find EOB
+     * position.)
+     */
+    uint8x8_t coefs_eq1 = vmovn_u16(vceqq_u16(abs_coefs, vdupq_n_u16(1)));
+    vst1_u8(eq1_bits_ptr, coefs_eq1);
+
+    absvalues_ptr += 8;
+    coef_sign_bits_ptr += 8;
+    eq1_bits_ptr += 8;
+    rows_to_zero--;
+  }
+
+  /* Zero remaining memory in blocks. */
+  for (i = 0; i < rows_to_zero; i++) {
+    vst1q_u16(absvalues_ptr, vdupq_n_u16(0));
+    vst1_u8(coef_sign_bits_ptr, vdup_n_u8(0));
+    vst1_u8(eq1_bits_ptr, vdup_n_u8(0));
+    absvalues_ptr += 8;
+    coef_sign_bits_ptr += 8;
+    eq1_bits_ptr += 8;
+  }
+
+  /* Construct zerobits bitmap. */
+  uint16x8_t abs_row0 = vld1q_u16(absvalues + 0 * DCTSIZE);
+  uint16x8_t abs_row1 = vld1q_u16(absvalues + 1 * DCTSIZE);
+  uint16x8_t abs_row2 = vld1q_u16(absvalues + 2 * DCTSIZE);
+  uint16x8_t abs_row3 = vld1q_u16(absvalues + 3 * DCTSIZE);
+  uint16x8_t abs_row4 = vld1q_u16(absvalues + 4 * DCTSIZE);
+  uint16x8_t abs_row5 = vld1q_u16(absvalues + 5 * DCTSIZE);
+  uint16x8_t abs_row6 = vld1q_u16(absvalues + 6 * DCTSIZE);
+  uint16x8_t abs_row7 = vld1q_u16(absvalues + 7 * DCTSIZE);
+
+  uint8x8_t abs_row0_eq0 = vmovn_u16(vceqq_u16(abs_row0, vdupq_n_u16(0)));
+  uint8x8_t abs_row1_eq0 = vmovn_u16(vceqq_u16(abs_row1, vdupq_n_u16(0)));
+  uint8x8_t abs_row2_eq0 = vmovn_u16(vceqq_u16(abs_row2, vdupq_n_u16(0)));
+  uint8x8_t abs_row3_eq0 = vmovn_u16(vceqq_u16(abs_row3, vdupq_n_u16(0)));
+  uint8x8_t abs_row4_eq0 = vmovn_u16(vceqq_u16(abs_row4, vdupq_n_u16(0)));
+  uint8x8_t abs_row5_eq0 = vmovn_u16(vceqq_u16(abs_row5, vdupq_n_u16(0)));
+  uint8x8_t abs_row6_eq0 = vmovn_u16(vceqq_u16(abs_row6, vdupq_n_u16(0)));
+  uint8x8_t abs_row7_eq0 = vmovn_u16(vceqq_u16(abs_row7, vdupq_n_u16(0)));
+
+  /* { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 } */
+  const uint8x8_t bitmap_mask =
+    vreinterpret_u8_u64(vmov_n_u64(0x8040201008040201));
+
+  abs_row0_eq0 = vand_u8(abs_row0_eq0, bitmap_mask);
+  abs_row1_eq0 = vand_u8(abs_row1_eq0, bitmap_mask);
+  abs_row2_eq0 = vand_u8(abs_row2_eq0, bitmap_mask);
+  abs_row3_eq0 = vand_u8(abs_row3_eq0, bitmap_mask);
+  abs_row4_eq0 = vand_u8(abs_row4_eq0, bitmap_mask);
+  abs_row5_eq0 = vand_u8(abs_row5_eq0, bitmap_mask);
+  abs_row6_eq0 = vand_u8(abs_row6_eq0, bitmap_mask);
+  abs_row7_eq0 = vand_u8(abs_row7_eq0, bitmap_mask);
+
+  uint8x8_t bitmap_rows_01 = vpadd_u8(abs_row0_eq0, abs_row1_eq0);
+  uint8x8_t bitmap_rows_23 = vpadd_u8(abs_row2_eq0, abs_row3_eq0);
+  uint8x8_t bitmap_rows_45 = vpadd_u8(abs_row4_eq0, abs_row5_eq0);
+  uint8x8_t bitmap_rows_67 = vpadd_u8(abs_row6_eq0, abs_row7_eq0);
+  uint8x8_t bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
+  uint8x8_t bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
+  uint8x8_t bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);
+
+#if defined(__aarch64__) || defined(_M_ARM64)
+  /* Move bitmap to a 64-bit scalar register. */
+  uint64_t bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
+  /* Store zerobits bitmap. */
+  bits[0] = ~bitmap;
+#else
+  /* Move bitmap to two 32-bit scalar registers. */
+  uint32_t bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0);
+  uint32_t bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1);
+  /* Store zerobits bitmap. */
+  bits[0] = ~bitmap0;
+  bits[1] = ~bitmap1;
+#endif
+
+  /* Construct signbits bitmap. */
+  uint8x8_t signbits_row0 = vld1_u8(coef_sign_bits + 0 * DCTSIZE);
+  uint8x8_t signbits_row1 = vld1_u8(coef_sign_bits + 1 * DCTSIZE);
+  uint8x8_t signbits_row2 = vld1_u8(coef_sign_bits + 2 * DCTSIZE);
+  uint8x8_t signbits_row3 = vld1_u8(coef_sign_bits + 3 * DCTSIZE);
+  uint8x8_t signbits_row4 = vld1_u8(coef_sign_bits + 4 * DCTSIZE);
+  uint8x8_t signbits_row5 = vld1_u8(coef_sign_bits + 5 * DCTSIZE);
+  uint8x8_t signbits_row6 = vld1_u8(coef_sign_bits + 6 * DCTSIZE);
+  uint8x8_t signbits_row7 = vld1_u8(coef_sign_bits + 7 * DCTSIZE);
+
+  signbits_row0 = vand_u8(signbits_row0, bitmap_mask);
+  signbits_row1 = vand_u8(signbits_row1, bitmap_mask);
+  signbits_row2 = vand_u8(signbits_row2, bitmap_mask);
+  signbits_row3 = vand_u8(signbits_row3, bitmap_mask);
+  signbits_row4 = vand_u8(signbits_row4, bitmap_mask);
+  signbits_row5 = vand_u8(signbits_row5, bitmap_mask);
+  signbits_row6 = vand_u8(signbits_row6, bitmap_mask);
+  signbits_row7 = vand_u8(signbits_row7, bitmap_mask);
+
+  bitmap_rows_01 = vpadd_u8(signbits_row0, signbits_row1);
+  bitmap_rows_23 = vpadd_u8(signbits_row2, signbits_row3);
+  bitmap_rows_45 = vpadd_u8(signbits_row4, signbits_row5);
+  bitmap_rows_67 = vpadd_u8(signbits_row6, signbits_row7);
+  bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
+  bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
+  bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);
+
+#if defined(__aarch64__) || defined(_M_ARM64)
+  /* Move bitmap to a 64-bit scalar register. */
+  bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
+  /* Store signbits bitmap. */
+  bits[1] = ~bitmap;
+#else
+  /* Move bitmap to two 32-bit scalar registers. */
+  bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0);
+  bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1);
+  /* Store signbits bitmap. */
+  bits[2] = ~bitmap0;
+  bits[3] = ~bitmap1;
+#endif
+
+  /* Construct bitmap to find EOB position (the index of the last coefficient
+   * equal to 1.)
+   */
+  uint8x8_t row0_eq1 = vld1_u8(coef_eq1_bits + 0 * DCTSIZE);
+  uint8x8_t row1_eq1 = vld1_u8(coef_eq1_bits + 1 * DCTSIZE);
+  uint8x8_t row2_eq1 = vld1_u8(coef_eq1_bits + 2 * DCTSIZE);
+  uint8x8_t row3_eq1 = vld1_u8(coef_eq1_bits + 3 * DCTSIZE);
+  uint8x8_t row4_eq1 = vld1_u8(coef_eq1_bits + 4 * DCTSIZE);
+  uint8x8_t row5_eq1 = vld1_u8(coef_eq1_bits + 5 * DCTSIZE);
+  uint8x8_t row6_eq1 = vld1_u8(coef_eq1_bits + 6 * DCTSIZE);
+  uint8x8_t row7_eq1 = vld1_u8(coef_eq1_bits + 7 * DCTSIZE);
+
+  row0_eq1 = vand_u8(row0_eq1, bitmap_mask);
+  row1_eq1 = vand_u8(row1_eq1, bitmap_mask);
+  row2_eq1 = vand_u8(row2_eq1, bitmap_mask);
+  row3_eq1 = vand_u8(row3_eq1, bitmap_mask);
+  row4_eq1 = vand_u8(row4_eq1, bitmap_mask);
+  row5_eq1 = vand_u8(row5_eq1, bitmap_mask);
+  row6_eq1 = vand_u8(row6_eq1, bitmap_mask);
+  row7_eq1 = vand_u8(row7_eq1, bitmap_mask);
+
+  bitmap_rows_01 = vpadd_u8(row0_eq1, row1_eq1);
+  bitmap_rows_23 = vpadd_u8(row2_eq1, row3_eq1);
+  bitmap_rows_45 = vpadd_u8(row4_eq1, row5_eq1);
+  bitmap_rows_67 = vpadd_u8(row6_eq1, row7_eq1);
+  bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
+  bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
+  bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);
+
+#if defined(__aarch64__) || defined(_M_ARM64)
+  /* Move bitmap to a 64-bit scalar register. */
+  bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
+
+  /* Return EOB position. */
+  if (bitmap == 0) {
+    /* EOB position is defined to be 0 if all coefficients != 1. */
+    return 0;
+  } else {
+    return 63 - BUILTIN_CLZLL(bitmap);
+  }
+#else
+  /* Move bitmap to two 32-bit scalar registers. */
+  bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0);
+  bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1);
+
+  /* Return EOB position. */
+  if (bitmap0 == 0 && bitmap1 == 0) {
+    return 0;
+  } else if (bitmap1 != 0) {
+    return 63 - BUILTIN_CLZ(bitmap1);
+  } else {
+    return 31 - BUILTIN_CLZ(bitmap0);
+  }
+#endif
+}
diff --git a/media/libjpeg/simd/arm/jcsample-neon.c b/media/libjpeg/simd/arm/jcsample-neon.c
new file mode 100644
index 0000000000..8a3e237838
--- /dev/null
+++ b/media/libjpeg/simd/arm/jcsample-neon.c
@@ -0,0 +1,192 @@
+/*
+ * jcsample-neon.c - downsampling (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "align.h"
+
+#include <arm_neon.h>
+
+
+ALIGN(16) static const uint8_t jsimd_h2_downsample_consts[] = {
+  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,   /* Pad 0 */
+  0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
+  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,   /* Pad 1 */
+  0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0E,
+  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,   /* Pad 2 */
+  0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0D, 0x0D,
+  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,   /* Pad 3 */
+  0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0C, 0x0C, 0x0C,
+  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,   /* Pad 4 */
+  0x08, 0x09, 0x0A, 0x0B, 0x0B, 0x0B, 0x0B, 0x0B,
+  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,   /* Pad 5 */
+  0x08, 0x09, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A,
+  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,   /* Pad 6 */
+  0x08, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09,
+  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,   /* Pad 7 */
+  0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08,
+  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,   /* Pad 8 */
+  0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07,
+  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x06,   /* Pad 9 */
+  0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06,
+  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x05, 0x05,   /* Pad 10 */
+  0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
+  0x00, 0x01, 0x02, 0x03, 0x04, 0x04, 0x04, 0x04,   /* Pad 11 */
+  0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04,
+  0x00, 0x01, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03,   /* Pad 12 */
+  0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03,
+  0x00, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,   /* Pad 13 */
+  0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
+  0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,   /* Pad 14 */
+  0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,   /* Pad 15 */
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+
+/* Downsample pixel values of a single component.
+ * This version handles the common case of 2:1 horizontal and 1:1 vertical,
+ * without smoothing.
+ */
+
+void jsimd_h2v1_downsample_neon(JDIMENSION image_width, int max_v_samp_factor,
+                                JDIMENSION v_samp_factor,
+                                JDIMENSION width_in_blocks,
+                                JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  JSAMPROW inptr, outptr;
+  /* Load expansion mask to pad remaining elements of last DCT block. */
+  const int mask_offset = 16 * ((width_in_blocks * 2 * DCTSIZE) - image_width);
+  const uint8x16_t expand_mask =
+    vld1q_u8(&jsimd_h2_downsample_consts[mask_offset]);
+  /* Load bias pattern (alternating every pixel.) */
+  /* { 0, 1, 0, 1, 0, 1, 0, 1 } */
+  const uint16x8_t bias = vreinterpretq_u16_u32(vdupq_n_u32(0x00010000));
+  unsigned i, outrow;
+
+  for (outrow = 0; outrow < v_samp_factor; outrow++) {
+    outptr = output_data[outrow];
+    inptr = input_data[outrow];
+
+    /* Downsample all but the last DCT block of pixels. */
+    for (i = 0; i < width_in_blocks - 1; i++) {
+      uint8x16_t pixels = vld1q_u8(inptr + i * 2 * DCTSIZE);
+      /* Add adjacent pixel values, widen to 16-bit, and add bias. */
+      uint16x8_t samples_u16 = vpadalq_u8(bias, pixels);
+      /* Divide total by 2 and narrow to 8-bit. */
+      uint8x8_t samples_u8 = vshrn_n_u16(samples_u16, 1);
+      /* Store samples to memory. */
+      vst1_u8(outptr + i * DCTSIZE, samples_u8);
+    }
+
+    /* Load pixels in last DCT block into a table. */
+    uint8x16_t pixels = vld1q_u8(inptr + (width_in_blocks - 1) * 2 * DCTSIZE);
+#if defined(__aarch64__) || defined(_M_ARM64)
+    /* Pad the empty elements with the value of the last pixel. */
+    pixels = vqtbl1q_u8(pixels, expand_mask);
+#else
+    uint8x8x2_t table = { { vget_low_u8(pixels), vget_high_u8(pixels) } };
+    pixels = vcombine_u8(vtbl2_u8(table, vget_low_u8(expand_mask)),
+                         vtbl2_u8(table, vget_high_u8(expand_mask)));
+#endif
+    /* Add adjacent pixel values, widen to 16-bit, and add bias. */
+    uint16x8_t samples_u16 = vpadalq_u8(bias, pixels);
+    /* Divide total by 2, narrow to 8-bit, and store. */
+    uint8x8_t samples_u8 = vshrn_n_u16(samples_u16, 1);
+    vst1_u8(outptr + (width_in_blocks - 1) * DCTSIZE, samples_u8);
+  }
+}
+
+
+/* Downsample pixel values of a single component.
+ * This version handles the standard case of 2:1 horizontal and 2:1 vertical,
+ * without smoothing.
+ */
+
+void jsimd_h2v2_downsample_neon(JDIMENSION image_width, int max_v_samp_factor,
+                                JDIMENSION v_samp_factor,
+                                JDIMENSION width_in_blocks,
+                                JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  JSAMPROW inptr0, inptr1, outptr;
+  /* Load expansion mask to pad remaining elements of last DCT block. */
+  const int mask_offset = 16 * ((width_in_blocks * 2 * DCTSIZE) - image_width);
+  const uint8x16_t expand_mask =
+    vld1q_u8(&jsimd_h2_downsample_consts[mask_offset]);
+  /* Load bias pattern (alternating every pixel.) */
+  /* { 1, 2, 1, 2, 1, 2, 1, 2 } */
+  const uint16x8_t bias = vreinterpretq_u16_u32(vdupq_n_u32(0x00020001));
+  unsigned i, outrow;
+
+  for (outrow = 0; outrow < v_samp_factor; outrow++) {
+    outptr = output_data[outrow];
+    inptr0 = input_data[outrow];
+    inptr1 = input_data[outrow + 1];
+
+    /* Downsample all but the last DCT block of pixels. */
+    for (i = 0; i < width_in_blocks - 1; i++) {
+      uint8x16_t pixels_r0 = vld1q_u8(inptr0 + i * 2 * DCTSIZE);
+      uint8x16_t pixels_r1 = vld1q_u8(inptr1 + i * 2 * DCTSIZE);
+      /* Add adjacent pixel values in row 0, widen to 16-bit, and add bias. */
+      uint16x8_t samples_u16 = vpadalq_u8(bias, pixels_r0);
+      /* Add adjacent pixel values in row 1, widen to 16-bit, and accumulate.
+       */
+      samples_u16 = vpadalq_u8(samples_u16, pixels_r1);
+      /* Divide total by 4 and narrow to 8-bit. */
+      uint8x8_t samples_u8 = vshrn_n_u16(samples_u16, 2);
+      /* Store samples to memory and increment pointers. */
+      vst1_u8(outptr + i * DCTSIZE, samples_u8);
+    }
+
+    /* Load pixels in last DCT block into a table. */
+    uint8x16_t pixels_r0 =
+      vld1q_u8(inptr0 + (width_in_blocks - 1) * 2 * DCTSIZE);
+    uint8x16_t pixels_r1 =
+      vld1q_u8(inptr1 + (width_in_blocks - 1) * 2 * DCTSIZE);
+#if defined(__aarch64__) || defined(_M_ARM64)
+    /* Pad the empty elements with the value of the last pixel. */
+    pixels_r0 = vqtbl1q_u8(pixels_r0, expand_mask);
+    pixels_r1 = vqtbl1q_u8(pixels_r1, expand_mask);
+#else
+    uint8x8x2_t table_r0 =
+      { { vget_low_u8(pixels_r0), vget_high_u8(pixels_r0) } };
+    uint8x8x2_t table_r1 =
+      { { vget_low_u8(pixels_r1), vget_high_u8(pixels_r1) } };
+    pixels_r0 = vcombine_u8(vtbl2_u8(table_r0, vget_low_u8(expand_mask)),
+                            vtbl2_u8(table_r0, vget_high_u8(expand_mask)));
+    pixels_r1 = vcombine_u8(vtbl2_u8(table_r1, vget_low_u8(expand_mask)),
+                            vtbl2_u8(table_r1, vget_high_u8(expand_mask)));
+#endif
+    /* Add adjacent pixel values in row 0, widen to 16-bit, and add bias. */
+    uint16x8_t samples_u16 = vpadalq_u8(bias, pixels_r0);
+    /* Add adjacent pixel values in row 1, widen to 16-bit, and accumulate. */
+    samples_u16 = vpadalq_u8(samples_u16, pixels_r1);
+    /* Divide total by 4, narrow to 8-bit, and store. */
+    uint8x8_t samples_u8 = vshrn_n_u16(samples_u16, 2);
+    vst1_u8(outptr + (width_in_blocks - 1) * DCTSIZE, samples_u8);
+  }
+}
diff --git a/media/libjpeg/simd/arm/jdcolext-neon.c b/media/libjpeg/simd/arm/jdcolext-neon.c
new file mode 100644
index 0000000000..c3c07a1964
--- /dev/null
+++ b/media/libjpeg/simd/arm/jdcolext-neon.c
@@ -0,0 +1,374 @@
+/*
+ * jdcolext-neon.c - colorspace conversion (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ * Copyright (C) 2020, D. R. Commander.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jdcolor-neon.c. */
+
+
+/* YCbCr -> RGB conversion is defined by the following equations:
+ *    R = Y                        + 1.40200 * (Cr - 128)
+ *    G = Y - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128)
+ *    B = Y + 1.77200 * (Cb - 128)
+ *
+ * Scaled integer constants are used to avoid floating-point arithmetic:
+ *    0.3441467 = 11277 * 2^-15
+ *    0.7141418 = 23401 * 2^-15
+ *    1.4020386 = 22971 * 2^-14
+ *    1.7720337 = 29033 * 2^-14
+ * These constants are defined in jdcolor-neon.c.
+ *
+ * To ensure correct results, rounding is used when descaling.
+ */
+
+/* Notes on safe memory access for YCbCr -> RGB conversion routines:
+ *
+ * Input memory buffers can be safely overread up to the next multiple of
+ * ALIGN_SIZE bytes, since they are always allocated by alloc_sarray() in
+ * jmemmgr.c.
+ *
+ * The output buffer cannot safely be written beyond output_width, since
+ * output_buf points to a possibly unpadded row in the decompressed image
+ * buffer allocated by the calling program.
+ */
+
+void jsimd_ycc_rgb_convert_neon(JDIMENSION output_width, JSAMPIMAGE input_buf,
+                                JDIMENSION input_row, JSAMPARRAY output_buf,
+                                int num_rows)
+{
+  JSAMPROW outptr;
+  /* Pointers to Y, Cb, and Cr data */
+  JSAMPROW inptr0, inptr1, inptr2;
+
+  const int16x4_t consts = vld1_s16(jsimd_ycc_rgb_convert_neon_consts);
+  const int16x8_t neg_128 = vdupq_n_s16(-128);
+
+  while (--num_rows >= 0) {
+    inptr0 = input_buf[0][input_row];
+    inptr1 = input_buf[1][input_row];
+    inptr2 = input_buf[2][input_row];
+    input_row++;
+    outptr = *output_buf++;
+    int cols_remaining = output_width;
+    for (; cols_remaining >= 16; cols_remaining -= 16) {
+      uint8x16_t y  = vld1q_u8(inptr0);
+      uint8x16_t cb = vld1q_u8(inptr1);
+      uint8x16_t cr = vld1q_u8(inptr2);
+      /* Subtract 128 from Cb and Cr. */
+      int16x8_t cr_128_l =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128),
+                                       vget_low_u8(cr)));
+      int16x8_t cr_128_h =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128),
+                                       vget_high_u8(cr)));
+      int16x8_t cb_128_l =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128),
+                                       vget_low_u8(cb)));
+      int16x8_t cb_128_h =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128),
+                                       vget_high_u8(cb)));
+      /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
+      int32x4_t g_sub_y_ll = vmull_lane_s16(vget_low_s16(cb_128_l), consts, 0);
+      int32x4_t g_sub_y_lh = vmull_lane_s16(vget_high_s16(cb_128_l),
+                                            consts, 0);
+      int32x4_t g_sub_y_hl = vmull_lane_s16(vget_low_s16(cb_128_h), consts, 0);
+      int32x4_t g_sub_y_hh = vmull_lane_s16(vget_high_s16(cb_128_h),
+                                            consts, 0);
+      g_sub_y_ll = vmlsl_lane_s16(g_sub_y_ll, vget_low_s16(cr_128_l),
+                                  consts, 1);
+      g_sub_y_lh = vmlsl_lane_s16(g_sub_y_lh, vget_high_s16(cr_128_l),
+                                  consts, 1);
+      g_sub_y_hl = vmlsl_lane_s16(g_sub_y_hl, vget_low_s16(cr_128_h),
+                                  consts, 1);
+      g_sub_y_hh = vmlsl_lane_s16(g_sub_y_hh, vget_high_s16(cr_128_h),
+                                  consts, 1);
+      /* Descale G components: shift right 15, round, and narrow to 16-bit. */
+      int16x8_t g_sub_y_l = vcombine_s16(vrshrn_n_s32(g_sub_y_ll, 15),
+                                         vrshrn_n_s32(g_sub_y_lh, 15));
+      int16x8_t g_sub_y_h = vcombine_s16(vrshrn_n_s32(g_sub_y_hl, 15),
+                                         vrshrn_n_s32(g_sub_y_hh, 15));
+      /* Compute R-Y: 1.40200 * (Cr - 128) */
+      int16x8_t r_sub_y_l = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128_l, 1),
+                                               consts, 2);
+      int16x8_t r_sub_y_h = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128_h, 1),
+                                               consts, 2);
+      /* Compute B-Y: 1.77200 * (Cb - 128) */
+      int16x8_t b_sub_y_l = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128_l, 1),
+                                               consts, 3);
+      int16x8_t b_sub_y_h = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128_h, 1),
+                                               consts, 3);
+      /* Add Y. */
+      int16x8_t r_l =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y_l),
+                                       vget_low_u8(y)));
+      int16x8_t r_h =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y_h),
+                                       vget_high_u8(y)));
+      int16x8_t b_l =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y_l),
+                                       vget_low_u8(y)));
+      int16x8_t b_h =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y_h),
+                                       vget_high_u8(y)));
+      int16x8_t g_l =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y_l),
+                                       vget_low_u8(y)));
+      int16x8_t g_h =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y_h),
+                                       vget_high_u8(y)));
+
+#if RGB_PIXELSIZE == 4
+      uint8x16x4_t rgba;
+      /* Convert each component to unsigned and narrow, clamping to [0-255]. */
+      rgba.val[RGB_RED] = vcombine_u8(vqmovun_s16(r_l), vqmovun_s16(r_h));
+      rgba.val[RGB_GREEN] = vcombine_u8(vqmovun_s16(g_l), vqmovun_s16(g_h));
+      rgba.val[RGB_BLUE] = vcombine_u8(vqmovun_s16(b_l), vqmovun_s16(b_h));
+      /* Set alpha channel to opaque (0xFF). */
+      rgba.val[RGB_ALPHA] = vdupq_n_u8(0xFF);
+      /* Store RGBA pixel data to memory. */
+      vst4q_u8(outptr, rgba);
+#elif RGB_PIXELSIZE == 3
+      uint8x16x3_t rgb;
+      /* Convert each component to unsigned and narrow, clamping to [0-255]. */
+      rgb.val[RGB_RED] = vcombine_u8(vqmovun_s16(r_l), vqmovun_s16(r_h));
+      rgb.val[RGB_GREEN] = vcombine_u8(vqmovun_s16(g_l), vqmovun_s16(g_h));
+      rgb.val[RGB_BLUE] = vcombine_u8(vqmovun_s16(b_l), vqmovun_s16(b_h));
+      /* Store RGB pixel data to memory. */
+      vst3q_u8(outptr, rgb);
+#else
+      /* Pack R, G, and B values in ratio 5:6:5. */
+      uint16x8_t rgb565_l = vqshluq_n_s16(r_l, 8);
+      rgb565_l = vsriq_n_u16(rgb565_l, vqshluq_n_s16(g_l, 8), 5);
+      rgb565_l = vsriq_n_u16(rgb565_l, vqshluq_n_s16(b_l, 8), 11);
+      uint16x8_t rgb565_h = vqshluq_n_s16(r_h, 8);
+      rgb565_h = vsriq_n_u16(rgb565_h, vqshluq_n_s16(g_h, 8), 5);
+      rgb565_h = vsriq_n_u16(rgb565_h, vqshluq_n_s16(b_h, 8), 11);
+      /* Store RGB pixel data to memory. */
+      vst1q_u16((uint16_t *)outptr, rgb565_l);
+      vst1q_u16(((uint16_t *)outptr) + 8, rgb565_h);
+#endif
+
+      /* Increment pointers. */
+      inptr0 += 16;
+      inptr1 += 16;
+      inptr2 += 16;
+      outptr += (RGB_PIXELSIZE * 16);
+    }
+
+    if (cols_remaining >= 8) {
+      uint8x8_t y  = vld1_u8(inptr0);
+      uint8x8_t cb = vld1_u8(inptr1);
+      uint8x8_t cr = vld1_u8(inptr2);
+      /* Subtract 128 from Cb and Cr. */
+      int16x8_t cr_128 =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
+      int16x8_t cb_128 =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
+      /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
+      int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
+      int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
+      g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
+      g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
+      /* Descale G components: shift right 15, round, and narrow to 16-bit. */
+      int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
+                                       vrshrn_n_s32(g_sub_y_h, 15));
+      /* Compute R-Y: 1.40200 * (Cr - 128) */
+      int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1),
+                                             consts, 2);
+      /* Compute B-Y: 1.77200 * (Cb - 128) */
+      int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1),
+                                             consts, 3);
+      /* Add Y. */
+      int16x8_t r =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y), y));
+      int16x8_t b =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y), y));
+      int16x8_t g =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y), y));
+
+#if RGB_PIXELSIZE == 4
+      uint8x8x4_t rgba;
+      /* Convert each component to unsigned and narrow, clamping to [0-255]. */
+      rgba.val[RGB_RED] = vqmovun_s16(r);
+      rgba.val[RGB_GREEN] = vqmovun_s16(g);
+      rgba.val[RGB_BLUE] = vqmovun_s16(b);
+      /* Set alpha channel to opaque (0xFF). */
+      rgba.val[RGB_ALPHA] = vdup_n_u8(0xFF);
+      /* Store RGBA pixel data to memory. */
+      vst4_u8(outptr, rgba);
+#elif RGB_PIXELSIZE == 3
+      uint8x8x3_t rgb;
+      /* Convert each component to unsigned and narrow, clamping to [0-255]. */
+      rgb.val[RGB_RED] = vqmovun_s16(r);
+      rgb.val[RGB_GREEN] = vqmovun_s16(g);
+      rgb.val[RGB_BLUE] = vqmovun_s16(b);
+      /* Store RGB pixel data to memory. */
+      vst3_u8(outptr, rgb);
+#else
+      /* Pack R, G, and B values in ratio 5:6:5. */
+      uint16x8_t rgb565 = vqshluq_n_s16(r, 8);
+      rgb565 = vsriq_n_u16(rgb565, vqshluq_n_s16(g, 8), 5);
+      rgb565 = vsriq_n_u16(rgb565, vqshluq_n_s16(b, 8), 11);
+      /* Store RGB pixel data to memory. */
+      vst1q_u16((uint16_t *)outptr, rgb565);
+#endif
+
+      /* Increment pointers. */
+      inptr0 += 8;
+      inptr1 += 8;
+      inptr2 += 8;
+      outptr += (RGB_PIXELSIZE * 8);
+      cols_remaining -= 8;
+    }
+
+    /* Handle the tail elements. */
+    if (cols_remaining > 0) {
+      uint8x8_t y  = vld1_u8(inptr0);
+      uint8x8_t cb = vld1_u8(inptr1);
+      uint8x8_t cr = vld1_u8(inptr2);
+      /* Subtract 128 from Cb and Cr. */
+      int16x8_t cr_128 =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
+      int16x8_t cb_128 =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
+      /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
+      int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
+      int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
+      g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
+      g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
+      /* Descale G components: shift right 15, round, and narrow to 16-bit. */
+      int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
+                                       vrshrn_n_s32(g_sub_y_h, 15));
+      /* Compute R-Y: 1.40200 * (Cr - 128) */
+      int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1),
+                                             consts, 2);
+      /* Compute B-Y: 1.77200 * (Cb - 128) */
+      int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1),
+                                             consts, 3);
+      /* Add Y. */
+      int16x8_t r =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y), y));
+      int16x8_t b =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y), y));
+      int16x8_t g =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y), y));
+
+#if RGB_PIXELSIZE == 4
+      uint8x8x4_t rgba;
+      /* Convert each component to unsigned and narrow, clamping to [0-255]. */
+      rgba.val[RGB_RED] = vqmovun_s16(r);
+      rgba.val[RGB_GREEN] = vqmovun_s16(g);
+      rgba.val[RGB_BLUE] = vqmovun_s16(b);
+      /* Set alpha channel to opaque (0xFF). */
+      rgba.val[RGB_ALPHA] = vdup_n_u8(0xFF);
+      /* Store RGBA pixel data to memory. */
+      switch (cols_remaining) {
+      case 7:
+        vst4_lane_u8(outptr + 6 * RGB_PIXELSIZE, rgba, 6);
+        FALLTHROUGH             /*FALLTHROUGH*/
+      case 6:
+        vst4_lane_u8(outptr + 5 * RGB_PIXELSIZE, rgba, 5);
+        FALLTHROUGH             /*FALLTHROUGH*/
+      case 5:
+        vst4_lane_u8(outptr + 4 * RGB_PIXELSIZE, rgba, 4);
+        FALLTHROUGH             /*FALLTHROUGH*/
+      case 4:
+        vst4_lane_u8(outptr + 3 * RGB_PIXELSIZE, rgba, 3);
+        FALLTHROUGH             /*FALLTHROUGH*/
+      case 3:
+        vst4_lane_u8(outptr + 2 * RGB_PIXELSIZE, rgba, 2);
+        FALLTHROUGH             /*FALLTHROUGH*/
+      case 2:
+        vst4_lane_u8(outptr + RGB_PIXELSIZE, rgba, 1);
+        FALLTHROUGH             /*FALLTHROUGH*/
+      case 1:
+        vst4_lane_u8(outptr, rgba, 0);
+        FALLTHROUGH             /*FALLTHROUGH*/
+      default:
+        break;
+      }
+#elif RGB_PIXELSIZE == 3
+      uint8x8x3_t rgb;
+      /* Convert each component to unsigned and narrow, clamping to [0-255]. */
+      rgb.val[RGB_RED] = vqmovun_s16(r);
+      rgb.val[RGB_GREEN] = vqmovun_s16(g);
+      rgb.val[RGB_BLUE] = vqmovun_s16(b);
+      /* Store RGB pixel data to memory. */
+      switch (cols_remaining) {
+      case 7:
+        vst3_lane_u8(outptr + 6 * RGB_PIXELSIZE, rgb, 6);
+        FALLTHROUGH             /*FALLTHROUGH*/
+      case 6:
+        vst3_lane_u8(outptr + 5 * RGB_PIXELSIZE, rgb, 5);
+        FALLTHROUGH             /*FALLTHROUGH*/
+      case 5:
+        vst3_lane_u8(outptr + 4 * RGB_PIXELSIZE, rgb, 4);
+        FALLTHROUGH             /*FALLTHROUGH*/
+      case 4:
+        vst3_lane_u8(outptr + 3 * RGB_PIXELSIZE, rgb, 3);
+        FALLTHROUGH             /*FALLTHROUGH*/
+      case 3:
+        vst3_lane_u8(outptr + 2 * RGB_PIXELSIZE, rgb, 2);
+        FALLTHROUGH             /*FALLTHROUGH*/
+      case 2:
+        vst3_lane_u8(outptr + RGB_PIXELSIZE, rgb, 1);
+        FALLTHROUGH             /*FALLTHROUGH*/
+      case 1:
+        vst3_lane_u8(outptr, rgb, 0);
+        FALLTHROUGH             /*FALLTHROUGH*/
+      default:
+        break;
+      }
+#else
+      /* Pack R, G, and B values in ratio 5:6:5. */
+      uint16x8_t rgb565 = vqshluq_n_s16(r, 8);
+      rgb565 = vsriq_n_u16(rgb565, vqshluq_n_s16(g, 8), 5);
+      rgb565 = vsriq_n_u16(rgb565, vqshluq_n_s16(b, 8), 11);
+      /* Store RGB565 pixel data to memory. */
+      switch (cols_remaining) {
+      case 7:
+        vst1q_lane_u16((uint16_t *)(outptr + 6 * RGB_PIXELSIZE), rgb565, 6);
+        FALLTHROUGH             /*FALLTHROUGH*/
+      case 6:
+        vst1q_lane_u16((uint16_t *)(outptr + 5 * RGB_PIXELSIZE), rgb565, 5);
+        FALLTHROUGH             /*FALLTHROUGH*/
+      case 5:
+        vst1q_lane_u16((uint16_t *)(outptr + 4 * RGB_PIXELSIZE), rgb565, 4);
+        FALLTHROUGH             /*FALLTHROUGH*/
+      case 4:
+        vst1q_lane_u16((uint16_t *)(outptr + 3 * RGB_PIXELSIZE), rgb565, 3);
+        FALLTHROUGH             /*FALLTHROUGH*/
+      case 3:
+        vst1q_lane_u16((uint16_t *)(outptr + 2 * RGB_PIXELSIZE), rgb565, 2);
+        FALLTHROUGH             /*FALLTHROUGH*/
+      case 2:
+        vst1q_lane_u16((uint16_t *)(outptr + RGB_PIXELSIZE), rgb565, 1);
+        FALLTHROUGH             /*FALLTHROUGH*/
+      case 1:
+        vst1q_lane_u16((uint16_t *)outptr, rgb565, 0);
+        FALLTHROUGH             /*FALLTHROUGH*/
+      default:
+        break;
+      }
+#endif
+    }
+  }
+}
diff --git a/media/libjpeg/simd/arm/jdcolor-neon.c b/media/libjpeg/simd/arm/jdcolor-neon.c
new file mode 100644
index 0000000000..28dbc57243
--- /dev/null
+++ b/media/libjpeg/simd/arm/jdcolor-neon.c
@@ -0,0 +1,141 @@
+/*
+ * jdcolor-neon.c - colorspace conversion (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "align.h"
+
+#include <arm_neon.h>
+
+
+/* YCbCr -> RGB conversion constants */
+
+#define F_0_344  11277  /* 0.3441467 = 11277 * 2^-15 */
+#define F_0_714  23401  /* 0.7141418 = 23401 * 2^-15 */
+#define F_1_402  22971  /* 1.4020386 = 22971 * 2^-14 */
+#define F_1_772  29033  /* 1.7720337 = 29033 * 2^-14 */
+
+ALIGN(16) static const int16_t jsimd_ycc_rgb_convert_neon_consts[] = {
+  -F_0_344, F_0_714, F_1_402, F_1_772
+};
+
+
+/* Include inline routines for colorspace extensions. */
+
+#include "jdcolext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+
+#define RGB_RED  EXT_RGB_RED
+#define RGB_GREEN  EXT_RGB_GREEN
+#define RGB_BLUE  EXT_RGB_BLUE
+#define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+#define jsimd_ycc_rgb_convert_neon  jsimd_ycc_extrgb_convert_neon
+#include "jdcolext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_neon
+
+#define RGB_RED  EXT_RGBX_RED
+#define RGB_GREEN  EXT_RGBX_GREEN
+#define RGB_BLUE  EXT_RGBX_BLUE
+#define RGB_ALPHA  3
+#define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+#define jsimd_ycc_rgb_convert_neon  jsimd_ycc_extrgbx_convert_neon
+#include "jdcolext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_ALPHA
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_neon
+
+#define RGB_RED  EXT_BGR_RED
+#define RGB_GREEN  EXT_BGR_GREEN
+#define RGB_BLUE  EXT_BGR_BLUE
+#define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+#define jsimd_ycc_rgb_convert_neon  jsimd_ycc_extbgr_convert_neon
+#include "jdcolext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_neon
+
+#define RGB_RED  EXT_BGRX_RED
+#define RGB_GREEN  EXT_BGRX_GREEN
+#define RGB_BLUE  EXT_BGRX_BLUE
+#define RGB_ALPHA  3
+#define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+#define jsimd_ycc_rgb_convert_neon  jsimd_ycc_extbgrx_convert_neon
+#include "jdcolext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_ALPHA
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_neon
+
+#define RGB_RED  EXT_XBGR_RED
+#define RGB_GREEN  EXT_XBGR_GREEN
+#define RGB_BLUE  EXT_XBGR_BLUE
+#define RGB_ALPHA  0
+#define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+#define jsimd_ycc_rgb_convert_neon  jsimd_ycc_extxbgr_convert_neon
+#include "jdcolext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_ALPHA
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_neon
+
+#define RGB_RED  EXT_XRGB_RED
+#define RGB_GREEN  EXT_XRGB_GREEN
+#define RGB_BLUE  EXT_XRGB_BLUE
+#define RGB_ALPHA  0
+#define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+#define jsimd_ycc_rgb_convert_neon  jsimd_ycc_extxrgb_convert_neon
+#include "jdcolext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_ALPHA
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_neon
+
+/* YCbCr -> RGB565 Conversion */
+
+#define RGB_PIXELSIZE  2
+#define jsimd_ycc_rgb_convert_neon  jsimd_ycc_rgb565_convert_neon
+#include "jdcolext-neon.c"
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_neon
diff --git a/media/libjpeg/simd/arm/jdmerge-neon.c b/media/libjpeg/simd/arm/jdmerge-neon.c
new file mode 100644
index 0000000000..18fb9d8a55
--- /dev/null
+++ b/media/libjpeg/simd/arm/jdmerge-neon.c
@@ -0,0 +1,144 @@
+/*
+ * jdmerge-neon.c - merged upsampling/color conversion (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "align.h"
+
+#include <arm_neon.h>
+
+
+/* YCbCr -> RGB conversion constants */
+
+#define F_0_344  11277  /* 0.3441467 = 11277 * 2^-15 */
+#define F_0_714  23401  /* 0.7141418 = 23401 * 2^-15 */
+#define F_1_402  22971  /* 1.4020386 = 22971 * 2^-14 */
+#define F_1_772  29033  /* 1.7720337 = 29033 * 2^-14 */
+
+ALIGN(16) static const int16_t jsimd_ycc_rgb_convert_neon_consts[] = {
+  -F_0_344, F_0_714, F_1_402, F_1_772
+};
+
+
+/* Include inline routines for colorspace extensions. */
+
+#include "jdmrgext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+
+#define RGB_RED  EXT_RGB_RED
+#define RGB_GREEN  EXT_RGB_GREEN
+#define RGB_BLUE  EXT_RGB_BLUE
+#define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_neon  jsimd_h2v1_extrgb_merged_upsample_neon
+#define jsimd_h2v2_merged_upsample_neon  jsimd_h2v2_extrgb_merged_upsample_neon
+#include "jdmrgext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_neon
+#undef jsimd_h2v2_merged_upsample_neon
+
+#define RGB_RED  EXT_RGBX_RED
+#define RGB_GREEN  EXT_RGBX_GREEN
+#define RGB_BLUE  EXT_RGBX_BLUE
+#define RGB_ALPHA  3
+#define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_neon  jsimd_h2v1_extrgbx_merged_upsample_neon
+#define jsimd_h2v2_merged_upsample_neon  jsimd_h2v2_extrgbx_merged_upsample_neon
+#include "jdmrgext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_ALPHA
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_neon
+#undef jsimd_h2v2_merged_upsample_neon
+
+#define RGB_RED  EXT_BGR_RED
+#define RGB_GREEN  EXT_BGR_GREEN
+#define RGB_BLUE  EXT_BGR_BLUE
+#define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_neon  jsimd_h2v1_extbgr_merged_upsample_neon
+#define jsimd_h2v2_merged_upsample_neon  jsimd_h2v2_extbgr_merged_upsample_neon
+#include "jdmrgext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_neon
+#undef jsimd_h2v2_merged_upsample_neon
+
+#define RGB_RED  EXT_BGRX_RED
+#define RGB_GREEN  EXT_BGRX_GREEN
+#define RGB_BLUE  EXT_BGRX_BLUE
+#define RGB_ALPHA  3
+#define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_neon  jsimd_h2v1_extbgrx_merged_upsample_neon
+#define jsimd_h2v2_merged_upsample_neon  jsimd_h2v2_extbgrx_merged_upsample_neon
+#include "jdmrgext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_ALPHA
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_neon
+#undef jsimd_h2v2_merged_upsample_neon
+
+#define RGB_RED  EXT_XBGR_RED
+#define RGB_GREEN  EXT_XBGR_GREEN
+#define RGB_BLUE  EXT_XBGR_BLUE
+#define RGB_ALPHA  0
+#define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_neon  jsimd_h2v1_extxbgr_merged_upsample_neon
+#define jsimd_h2v2_merged_upsample_neon  jsimd_h2v2_extxbgr_merged_upsample_neon
+#include "jdmrgext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_ALPHA
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_neon
+#undef jsimd_h2v2_merged_upsample_neon
+
+#define RGB_RED  EXT_XRGB_RED
+#define RGB_GREEN  EXT_XRGB_GREEN
+#define RGB_BLUE  EXT_XRGB_BLUE
+#define RGB_ALPHA  0
+#define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_neon  jsimd_h2v1_extxrgb_merged_upsample_neon
+#define jsimd_h2v2_merged_upsample_neon  jsimd_h2v2_extxrgb_merged_upsample_neon
+#include "jdmrgext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_ALPHA
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_neon
diff --git a/media/libjpeg/simd/arm/jdmrgext-neon.c b/media/libjpeg/simd/arm/jdmrgext-neon.c
new file mode 100644
index 0000000000..5b89bdb339
--- /dev/null
+++ b/media/libjpeg/simd/arm/jdmrgext-neon.c
@@ -0,0 +1,723 @@
+/*
+ * jdmrgext-neon.c - merged upsampling/color conversion (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ * Copyright (C) 2020, D. R. Commander.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jdmerge-neon.c. */
+
+
+/* These routines combine simple (non-fancy, i.e. non-smooth) h2v1 or h2v2
+ * chroma upsampling and YCbCr -> RGB color conversion into a single function.
+ *
+ * As with the standalone functions, YCbCr -> RGB conversion is defined by the
+ * following equations:
+ *    R = Y                        + 1.40200 * (Cr - 128)
+ *    G = Y - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128)
+ *    B = Y + 1.77200 * (Cb - 128)
+ *
+ * Scaled integer constants are used to avoid floating-point arithmetic:
+ *    0.3441467 = 11277 * 2^-15
+ *    0.7141418 = 23401 * 2^-15
+ *    1.4020386 = 22971 * 2^-14
+ *    1.7720337 = 29033 * 2^-14
+ * These constants are defined in jdmerge-neon.c.
+ *
+ * To ensure correct results, rounding is used when descaling.
+ */
+
+/* Notes on safe memory access for merged upsampling/YCbCr -> RGB conversion
+ * routines:
+ *
+ * Input memory buffers can be safely overread up to the next multiple of
+ * ALIGN_SIZE bytes, since they are always allocated by alloc_sarray() in
+ * jmemmgr.c.
+ *
+ * The output buffer cannot safely be written beyond output_width, since
+ * output_buf points to a possibly unpadded row in the decompressed image
+ * buffer allocated by the calling program.
+ */
+
+/* Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
+ */
+
+void jsimd_h2v1_merged_upsample_neon(JDIMENSION output_width,
+                                     JSAMPIMAGE input_buf,
+                                     JDIMENSION in_row_group_ctr,
+                                     JSAMPARRAY output_buf)
+{
+  JSAMPROW outptr;
+  /* Pointers to Y, Cb, and Cr data */
+  JSAMPROW inptr0, inptr1, inptr2;
+
+  const int16x4_t consts = vld1_s16(jsimd_ycc_rgb_convert_neon_consts);
+  const int16x8_t neg_128 = vdupq_n_s16(-128);
+
+  inptr0 = input_buf[0][in_row_group_ctr];
+  inptr1 = input_buf[1][in_row_group_ctr];
+  inptr2 = input_buf[2][in_row_group_ctr];
+  outptr = output_buf[0];
+
+  int cols_remaining = output_width;
+  for (; cols_remaining >= 16; cols_remaining -= 16) {
+    /* De-interleave Y component values into two separate vectors, one
+     * containing the component values with even-numbered indices and one
+     * containing the component values with odd-numbered indices.
+     */
+    uint8x8x2_t y = vld2_u8(inptr0);
+    uint8x8_t cb = vld1_u8(inptr1);
+    uint8x8_t cr = vld1_u8(inptr2);
+    /* Subtract 128 from Cb and Cr. */
+    int16x8_t cr_128 =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
+    int16x8_t cb_128 =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
+    /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
+    int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
+    int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
+    g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
+    g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
+    /* Descale G components: shift right 15, round, and narrow to 16-bit. */
+    int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
+                                     vrshrn_n_s32(g_sub_y_h, 15));
+    /* Compute R-Y: 1.40200 * (Cr - 128) */
+    int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1), consts, 2);
+    /* Compute B-Y: 1.77200 * (Cb - 128) */
+    int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1), consts, 3);
+    /* Add the chroma-derived values (G-Y, R-Y, and B-Y) to both the "even" and
+     * "odd" Y component values.  This effectively upsamples the chroma
+     * components horizontally.
+     */
+    int16x8_t g_even =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+                                     y.val[0]));
+    int16x8_t r_even =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+                                     y.val[0]));
+    int16x8_t b_even =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+                                     y.val[0]));
+    int16x8_t g_odd =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+                                     y.val[1]));
+    int16x8_t r_odd =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+                                     y.val[1]));
+    int16x8_t b_odd =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+                                     y.val[1]));
+    /* Convert each component to unsigned and narrow, clamping to [0-255].
+     * Re-interleave the "even" and "odd" component values.
+     */
+    uint8x8x2_t r = vzip_u8(vqmovun_s16(r_even), vqmovun_s16(r_odd));
+    uint8x8x2_t g = vzip_u8(vqmovun_s16(g_even), vqmovun_s16(g_odd));
+    uint8x8x2_t b = vzip_u8(vqmovun_s16(b_even), vqmovun_s16(b_odd));
+
+#ifdef RGB_ALPHA
+    uint8x16x4_t rgba;
+    rgba.val[RGB_RED] = vcombine_u8(r.val[0], r.val[1]);
+    rgba.val[RGB_GREEN] = vcombine_u8(g.val[0], g.val[1]);
+    rgba.val[RGB_BLUE] = vcombine_u8(b.val[0], b.val[1]);
+    /* Set alpha channel to opaque (0xFF). */
+    rgba.val[RGB_ALPHA] = vdupq_n_u8(0xFF);
+    /* Store RGBA pixel data to memory. */
+    vst4q_u8(outptr, rgba);
+#else
+    uint8x16x3_t rgb;
+    rgb.val[RGB_RED] = vcombine_u8(r.val[0], r.val[1]);
+    rgb.val[RGB_GREEN] = vcombine_u8(g.val[0], g.val[1]);
+    rgb.val[RGB_BLUE] = vcombine_u8(b.val[0], b.val[1]);
+    /* Store RGB pixel data to memory. */
+    vst3q_u8(outptr, rgb);
+#endif
+
+    /* Increment pointers. */
+    inptr0 += 16;
+    inptr1 += 8;
+    inptr2 += 8;
+    outptr += (RGB_PIXELSIZE * 16);
+  }
+
+  if (cols_remaining > 0) {
+    /* De-interleave Y component values into two separate vectors, one
+     * containing the component values with even-numbered indices and one
+     * containing the component values with odd-numbered indices.
+     */
+    uint8x8x2_t y = vld2_u8(inptr0);
+    uint8x8_t cb = vld1_u8(inptr1);
+    uint8x8_t cr = vld1_u8(inptr2);
+    /* Subtract 128 from Cb and Cr. */
+    int16x8_t cr_128 =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
+    int16x8_t cb_128 =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
+    /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
+    int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
+    int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
+    g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
+    g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
+    /* Descale G components: shift right 15, round, and narrow to 16-bit. */
+    int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
+                                     vrshrn_n_s32(g_sub_y_h, 15));
+    /* Compute R-Y: 1.40200 * (Cr - 128) */
+    int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1), consts, 2);
+    /* Compute B-Y: 1.77200 * (Cb - 128) */
+    int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1), consts, 3);
+    /* Add the chroma-derived values (G-Y, R-Y, and B-Y) to both the "even" and
+     * "odd" Y component values.  This effectively upsamples the chroma
+     * components horizontally.
+     */
+    int16x8_t g_even =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+                                     y.val[0]));
+    int16x8_t r_even =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+                                     y.val[0]));
+    int16x8_t b_even =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+                                     y.val[0]));
+    int16x8_t g_odd =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+                                     y.val[1]));
+    int16x8_t r_odd =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+                                     y.val[1]));
+    int16x8_t b_odd =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+                                     y.val[1]));
+    /* Convert each component to unsigned and narrow, clamping to [0-255].
+     * Re-interleave the "even" and "odd" component values.
+     */
+    uint8x8x2_t r = vzip_u8(vqmovun_s16(r_even), vqmovun_s16(r_odd));
+    uint8x8x2_t g = vzip_u8(vqmovun_s16(g_even), vqmovun_s16(g_odd));
+    uint8x8x2_t b = vzip_u8(vqmovun_s16(b_even), vqmovun_s16(b_odd));
+
+#ifdef RGB_ALPHA
+    uint8x8x4_t rgba_h;
+    rgba_h.val[RGB_RED] = r.val[1];
+    rgba_h.val[RGB_GREEN] = g.val[1];
+    rgba_h.val[RGB_BLUE] = b.val[1];
+    /* Set alpha channel to opaque (0xFF). */
+    rgba_h.val[RGB_ALPHA] = vdup_n_u8(0xFF);
+    uint8x8x4_t rgba_l;
+    rgba_l.val[RGB_RED] = r.val[0];
+    rgba_l.val[RGB_GREEN] = g.val[0];
+    rgba_l.val[RGB_BLUE] = b.val[0];
+    /* Set alpha channel to opaque (0xFF). */
+    rgba_l.val[RGB_ALPHA] = vdup_n_u8(0xFF);
+    /* Store RGBA pixel data to memory. */
+    switch (cols_remaining) {
+    case 15:
+      vst4_lane_u8(outptr + 14 * RGB_PIXELSIZE, rgba_h, 6);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 14:
+      vst4_lane_u8(outptr + 13 * RGB_PIXELSIZE, rgba_h, 5);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 13:
+      vst4_lane_u8(outptr + 12 * RGB_PIXELSIZE, rgba_h, 4);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 12:
+      vst4_lane_u8(outptr + 11 * RGB_PIXELSIZE, rgba_h, 3);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 11:
+      vst4_lane_u8(outptr + 10 * RGB_PIXELSIZE, rgba_h, 2);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 10:
+      vst4_lane_u8(outptr + 9 * RGB_PIXELSIZE, rgba_h, 1);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 9:
+      vst4_lane_u8(outptr + 8 * RGB_PIXELSIZE, rgba_h, 0);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 8:
+      vst4_u8(outptr, rgba_l);
+      break;
+    case 7:
+      vst4_lane_u8(outptr + 6 * RGB_PIXELSIZE, rgba_l, 6);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 6:
+      vst4_lane_u8(outptr + 5 * RGB_PIXELSIZE, rgba_l, 5);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 5:
+      vst4_lane_u8(outptr + 4 * RGB_PIXELSIZE, rgba_l, 4);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 4:
+      vst4_lane_u8(outptr + 3 * RGB_PIXELSIZE, rgba_l, 3);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 3:
+      vst4_lane_u8(outptr + 2 * RGB_PIXELSIZE, rgba_l, 2);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 2:
+      vst4_lane_u8(outptr + RGB_PIXELSIZE, rgba_l, 1);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 1:
+      vst4_lane_u8(outptr, rgba_l, 0);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    default:
+      break;
+    }
+#else
+    uint8x8x3_t rgb_h;
+    rgb_h.val[RGB_RED] = r.val[1];
+    rgb_h.val[RGB_GREEN] = g.val[1];
+    rgb_h.val[RGB_BLUE] = b.val[1];
+    uint8x8x3_t rgb_l;
+    rgb_l.val[RGB_RED] = r.val[0];
+    rgb_l.val[RGB_GREEN] = g.val[0];
+    rgb_l.val[RGB_BLUE] = b.val[0];
+    /* Store RGB pixel data to memory. */
+    switch (cols_remaining) {
+    case 15:
+      vst3_lane_u8(outptr + 14 * RGB_PIXELSIZE, rgb_h, 6);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 14:
+      vst3_lane_u8(outptr + 13 * RGB_PIXELSIZE, rgb_h, 5);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 13:
+      vst3_lane_u8(outptr + 12 * RGB_PIXELSIZE, rgb_h, 4);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 12:
+      vst3_lane_u8(outptr + 11 * RGB_PIXELSIZE, rgb_h, 3);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 11:
+      vst3_lane_u8(outptr + 10 * RGB_PIXELSIZE, rgb_h, 2);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 10:
+      vst3_lane_u8(outptr + 9 * RGB_PIXELSIZE, rgb_h, 1);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 9:
+      vst3_lane_u8(outptr + 8 * RGB_PIXELSIZE, rgb_h, 0);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 8:
+      vst3_u8(outptr, rgb_l);
+      break;
+    case 7:
+      vst3_lane_u8(outptr + 6 * RGB_PIXELSIZE, rgb_l, 6);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 6:
+      vst3_lane_u8(outptr + 5 * RGB_PIXELSIZE, rgb_l, 5);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 5:
+      vst3_lane_u8(outptr + 4 * RGB_PIXELSIZE, rgb_l, 4);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 4:
+      vst3_lane_u8(outptr + 3 * RGB_PIXELSIZE, rgb_l, 3);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 3:
+      vst3_lane_u8(outptr + 2 * RGB_PIXELSIZE, rgb_l, 2);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 2:
+      vst3_lane_u8(outptr + RGB_PIXELSIZE, rgb_l, 1);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 1:
+      vst3_lane_u8(outptr, rgb_l, 0);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    default:
+      break;
+    }
+#endif
+  }
+}
+
+
+/* Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
+ *
+ * See comments above for details regarding color conversion and safe memory
+ * access.
+ */
+
+void jsimd_h2v2_merged_upsample_neon(JDIMENSION output_width,
+                                     JSAMPIMAGE input_buf,
+                                     JDIMENSION in_row_group_ctr,
+                                     JSAMPARRAY output_buf)
+{
+  JSAMPROW outptr0, outptr1;
+  /* Pointers to Y (both rows), Cb, and Cr data */
+  JSAMPROW inptr0_0, inptr0_1, inptr1, inptr2;
+
+  const int16x4_t consts = vld1_s16(jsimd_ycc_rgb_convert_neon_consts);
+  const int16x8_t neg_128 = vdupq_n_s16(-128);
+
+  inptr0_0 = input_buf[0][in_row_group_ctr * 2];
+  inptr0_1 = input_buf[0][in_row_group_ctr * 2 + 1];
+  inptr1 = input_buf[1][in_row_group_ctr];
+  inptr2 = input_buf[2][in_row_group_ctr];
+  outptr0 = output_buf[0];
+  outptr1 = output_buf[1];
+
+  int cols_remaining = output_width;
+  for (; cols_remaining >= 16; cols_remaining -= 16) {
+    /* For each row, de-interleave Y component values into two separate
+     * vectors, one containing the component values with even-numbered indices
+     * and one containing the component values with odd-numbered indices.
+     */
+    uint8x8x2_t y0 = vld2_u8(inptr0_0);
+    uint8x8x2_t y1 = vld2_u8(inptr0_1);
+    uint8x8_t cb = vld1_u8(inptr1);
+    uint8x8_t cr = vld1_u8(inptr2);
+    /* Subtract 128 from Cb and Cr. */
+    int16x8_t cr_128 =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
+    int16x8_t cb_128 =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
+    /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
+    int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
+    int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
+    g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
+    g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
+    /* Descale G components: shift right 15, round, and narrow to 16-bit. */
+    int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
+                                     vrshrn_n_s32(g_sub_y_h, 15));
+    /* Compute R-Y: 1.40200 * (Cr - 128) */
+    int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1), consts, 2);
+    /* Compute B-Y: 1.77200 * (Cb - 128) */
+    int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1), consts, 3);
+    /* For each row, add the chroma-derived values (G-Y, R-Y, and B-Y) to both
+     * the "even" and "odd" Y component values.  This effectively upsamples the
+     * chroma components both horizontally and vertically.
+     */
+    int16x8_t g0_even =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+                                     y0.val[0]));
+    int16x8_t r0_even =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+                                     y0.val[0]));
+    int16x8_t b0_even =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+                                     y0.val[0]));
+    int16x8_t g0_odd =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+                                     y0.val[1]));
+    int16x8_t r0_odd =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+                                     y0.val[1]));
+    int16x8_t b0_odd =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+                                     y0.val[1]));
+    int16x8_t g1_even =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+                                     y1.val[0]));
+    int16x8_t r1_even =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+                                     y1.val[0]));
+    int16x8_t b1_even =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+                                     y1.val[0]));
+    int16x8_t g1_odd =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+                                     y1.val[1]));
+    int16x8_t r1_odd =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+                                     y1.val[1]));
+    int16x8_t b1_odd =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+                                     y1.val[1]));
+    /* Convert each component to unsigned and narrow, clamping to [0-255].
+     * Re-interleave the "even" and "odd" component values.
+     */
+    uint8x8x2_t r0 = vzip_u8(vqmovun_s16(r0_even), vqmovun_s16(r0_odd));
+    uint8x8x2_t r1 = vzip_u8(vqmovun_s16(r1_even), vqmovun_s16(r1_odd));
+    uint8x8x2_t g0 = vzip_u8(vqmovun_s16(g0_even), vqmovun_s16(g0_odd));
+    uint8x8x2_t g1 = vzip_u8(vqmovun_s16(g1_even), vqmovun_s16(g1_odd));
+    uint8x8x2_t b0 = vzip_u8(vqmovun_s16(b0_even), vqmovun_s16(b0_odd));
+    uint8x8x2_t b1 = vzip_u8(vqmovun_s16(b1_even), vqmovun_s16(b1_odd));
+
+#ifdef RGB_ALPHA
+    uint8x16x4_t rgba0, rgba1;
+    rgba0.val[RGB_RED] = vcombine_u8(r0.val[0], r0.val[1]);
+    rgba1.val[RGB_RED] = vcombine_u8(r1.val[0], r1.val[1]);
+    rgba0.val[RGB_GREEN] = vcombine_u8(g0.val[0], g0.val[1]);
+    rgba1.val[RGB_GREEN] = vcombine_u8(g1.val[0], g1.val[1]);
+    rgba0.val[RGB_BLUE] = vcombine_u8(b0.val[0], b0.val[1]);
+    rgba1.val[RGB_BLUE] = vcombine_u8(b1.val[0], b1.val[1]);
+    /* Set alpha channel to opaque (0xFF). */
+    rgba0.val[RGB_ALPHA] = vdupq_n_u8(0xFF);
+    rgba1.val[RGB_ALPHA] = vdupq_n_u8(0xFF);
+    /* Store RGBA pixel data to memory. */
+    vst4q_u8(outptr0, rgba0);
+    vst4q_u8(outptr1, rgba1);
+#else
+    uint8x16x3_t rgb0, rgb1;
+    rgb0.val[RGB_RED] = vcombine_u8(r0.val[0], r0.val[1]);
+    rgb1.val[RGB_RED] = vcombine_u8(r1.val[0], r1.val[1]);
+    rgb0.val[RGB_GREEN] = vcombine_u8(g0.val[0], g0.val[1]);
+    rgb1.val[RGB_GREEN] = vcombine_u8(g1.val[0], g1.val[1]);
+    rgb0.val[RGB_BLUE] = vcombine_u8(b0.val[0], b0.val[1]);
+    rgb1.val[RGB_BLUE] = vcombine_u8(b1.val[0], b1.val[1]);
+    /* Store RGB pixel data to memory. */
+    vst3q_u8(outptr0, rgb0);
+    vst3q_u8(outptr1, rgb1);
+#endif
+
+    /* Increment pointers. */
+    inptr0_0 += 16;
+    inptr0_1 += 16;
+    inptr1 += 8;
+    inptr2 += 8;
+    outptr0 += (RGB_PIXELSIZE * 16);
+    outptr1 += (RGB_PIXELSIZE * 16);
+  }
+
+  if (cols_remaining > 0) {
+    /* For each row, de-interleave Y component values into two separate
+     * vectors, one containing the component values with even-numbered indices
+     * and one containing the component values with odd-numbered indices.
+     */
+    uint8x8x2_t y0 = vld2_u8(inptr0_0);
+    uint8x8x2_t y1 = vld2_u8(inptr0_1);
+    uint8x8_t cb = vld1_u8(inptr1);
+    uint8x8_t cr = vld1_u8(inptr2);
+    /* Subtract 128 from Cb and Cr. */
+    int16x8_t cr_128 =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
+    int16x8_t cb_128 =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
+    /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
+    int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
+    int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
+    g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
+    g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
+    /* Descale G components: shift right 15, round, and narrow to 16-bit. */
+    int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
+                                     vrshrn_n_s32(g_sub_y_h, 15));
+    /* Compute R-Y: 1.40200 * (Cr - 128) */
+    int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1), consts, 2);
+    /* Compute B-Y: 1.77200 * (Cb - 128) */
+    int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1), consts, 3);
+    /* For each row, add the chroma-derived values (G-Y, R-Y, and B-Y) to both
+     * the "even" and "odd" Y component values.  This effectively upsamples the
+     * chroma components both horizontally and vertically.
+     */
+    int16x8_t g0_even =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+                                     y0.val[0]));
+    int16x8_t r0_even =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+                                     y0.val[0]));
+    int16x8_t b0_even =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+                                     y0.val[0]));
+    int16x8_t g0_odd =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+                                     y0.val[1]));
+    int16x8_t r0_odd =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+                                     y0.val[1]));
+    int16x8_t b0_odd =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+                                     y0.val[1]));
+    int16x8_t g1_even =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+                                     y1.val[0]));
+    int16x8_t r1_even =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+                                     y1.val[0]));
+    int16x8_t b1_even =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+                                     y1.val[0]));
+    int16x8_t g1_odd =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+                                     y1.val[1]));
+    int16x8_t r1_odd =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+                                     y1.val[1]));
+    int16x8_t b1_odd =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+                                     y1.val[1]));
+    /* Convert each component to unsigned and narrow, clamping to [0-255].
+     * Re-interleave the "even" and "odd" component values.
+     */
+    uint8x8x2_t r0 = vzip_u8(vqmovun_s16(r0_even), vqmovun_s16(r0_odd));
+    uint8x8x2_t r1 = vzip_u8(vqmovun_s16(r1_even), vqmovun_s16(r1_odd));
+    uint8x8x2_t g0 = vzip_u8(vqmovun_s16(g0_even), vqmovun_s16(g0_odd));
+    uint8x8x2_t g1 = vzip_u8(vqmovun_s16(g1_even), vqmovun_s16(g1_odd));
+    uint8x8x2_t b0 = vzip_u8(vqmovun_s16(b0_even), vqmovun_s16(b0_odd));
+    uint8x8x2_t b1 = vzip_u8(vqmovun_s16(b1_even), vqmovun_s16(b1_odd));
+
+#ifdef RGB_ALPHA
+    uint8x8x4_t rgba0_h, rgba1_h;
+    rgba0_h.val[RGB_RED] = r0.val[1];
+    rgba1_h.val[RGB_RED] = r1.val[1];
+    rgba0_h.val[RGB_GREEN] = g0.val[1];
+    rgba1_h.val[RGB_GREEN] = g1.val[1];
+    rgba0_h.val[RGB_BLUE] = b0.val[1];
+    rgba1_h.val[RGB_BLUE] = b1.val[1];
+    /* Set alpha channel to opaque (0xFF). */
+    rgba0_h.val[RGB_ALPHA] = vdup_n_u8(0xFF);
+    rgba1_h.val[RGB_ALPHA] = vdup_n_u8(0xFF);
+
+    uint8x8x4_t rgba0_l, rgba1_l;
+    rgba0_l.val[RGB_RED] = r0.val[0];
+    rgba1_l.val[RGB_RED] = r1.val[0];
+    rgba0_l.val[RGB_GREEN] = g0.val[0];
+    rgba1_l.val[RGB_GREEN] = g1.val[0];
+    rgba0_l.val[RGB_BLUE] = b0.val[0];
+    rgba1_l.val[RGB_BLUE] = b1.val[0];
+    /* Set alpha channel to opaque (0xFF). */
+    rgba0_l.val[RGB_ALPHA] = vdup_n_u8(0xFF);
+    rgba1_l.val[RGB_ALPHA] = vdup_n_u8(0xFF);
+    /* Store RGBA pixel data to memory. */
+    switch (cols_remaining) {
+    case 15:
+      vst4_lane_u8(outptr0 + 14 * RGB_PIXELSIZE, rgba0_h, 6);
+      vst4_lane_u8(outptr1 + 14 * RGB_PIXELSIZE, rgba1_h, 6);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 14:
+      vst4_lane_u8(outptr0 + 13 * RGB_PIXELSIZE, rgba0_h, 5);
+      vst4_lane_u8(outptr1 + 13 * RGB_PIXELSIZE, rgba1_h, 5);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 13:
+      vst4_lane_u8(outptr0 + 12 * RGB_PIXELSIZE, rgba0_h, 4);
+      vst4_lane_u8(outptr1 + 12 * RGB_PIXELSIZE, rgba1_h, 4);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 12:
+      vst4_lane_u8(outptr0 + 11 * RGB_PIXELSIZE, rgba0_h, 3);
+      vst4_lane_u8(outptr1 + 11 * RGB_PIXELSIZE, rgba1_h, 3);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 11:
+      vst4_lane_u8(outptr0 + 10 * RGB_PIXELSIZE, rgba0_h, 2);
+      vst4_lane_u8(outptr1 + 10 * RGB_PIXELSIZE, rgba1_h, 2);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 10:
+      vst4_lane_u8(outptr0 + 9 * RGB_PIXELSIZE, rgba0_h, 1);
+      vst4_lane_u8(outptr1 + 9 * RGB_PIXELSIZE, rgba1_h, 1);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 9:
+      vst4_lane_u8(outptr0 + 8 * RGB_PIXELSIZE, rgba0_h, 0);
+      vst4_lane_u8(outptr1 + 8 * RGB_PIXELSIZE, rgba1_h, 0);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 8:
+      vst4_u8(outptr0, rgba0_l);
+      vst4_u8(outptr1, rgba1_l);
+      break;
+    case 7:
+      vst4_lane_u8(outptr0 + 6 * RGB_PIXELSIZE, rgba0_l, 6);
+      vst4_lane_u8(outptr1 + 6 * RGB_PIXELSIZE, rgba1_l, 6);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 6:
+      vst4_lane_u8(outptr0 + 5 * RGB_PIXELSIZE, rgba0_l, 5);
+      vst4_lane_u8(outptr1 + 5 * RGB_PIXELSIZE, rgba1_l, 5);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 5:
+      vst4_lane_u8(outptr0 + 4 * RGB_PIXELSIZE, rgba0_l, 4);
+      vst4_lane_u8(outptr1 + 4 * RGB_PIXELSIZE, rgba1_l, 4);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 4:
+      vst4_lane_u8(outptr0 + 3 * RGB_PIXELSIZE, rgba0_l, 3);
+      vst4_lane_u8(outptr1 + 3 * RGB_PIXELSIZE, rgba1_l, 3);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 3:
+      vst4_lane_u8(outptr0 + 2 * RGB_PIXELSIZE, rgba0_l, 2);
+      vst4_lane_u8(outptr1 + 2 * RGB_PIXELSIZE, rgba1_l, 2);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 2:
+      vst4_lane_u8(outptr0 + 1 * RGB_PIXELSIZE, rgba0_l, 1);
+      vst4_lane_u8(outptr1 + 1 * RGB_PIXELSIZE, rgba1_l, 1);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 1:
+      vst4_lane_u8(outptr0, rgba0_l, 0);
+      vst4_lane_u8(outptr1, rgba1_l, 0);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    default:
+      break;
+    }
+#else
+    uint8x8x3_t rgb0_h, rgb1_h;
+    rgb0_h.val[RGB_RED] = r0.val[1];
+    rgb1_h.val[RGB_RED] = r1.val[1];
+    rgb0_h.val[RGB_GREEN] = g0.val[1];
+    rgb1_h.val[RGB_GREEN] = g1.val[1];
+    rgb0_h.val[RGB_BLUE] = b0.val[1];
+    rgb1_h.val[RGB_BLUE] = b1.val[1];
+
+    uint8x8x3_t rgb0_l, rgb1_l;
+    rgb0_l.val[RGB_RED] = r0.val[0];
+    rgb1_l.val[RGB_RED] = r1.val[0];
+    rgb0_l.val[RGB_GREEN] = g0.val[0];
+    rgb1_l.val[RGB_GREEN] = g1.val[0];
+    rgb0_l.val[RGB_BLUE] = b0.val[0];
+    rgb1_l.val[RGB_BLUE] = b1.val[0];
+    /* Store RGB pixel data to memory. */
+    switch (cols_remaining) {
+    case 15:
+      vst3_lane_u8(outptr0 + 14 * RGB_PIXELSIZE, rgb0_h, 6);
+      vst3_lane_u8(outptr1 + 14 * RGB_PIXELSIZE, rgb1_h, 6);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 14:
+      vst3_lane_u8(outptr0 + 13 * RGB_PIXELSIZE, rgb0_h, 5);
+      vst3_lane_u8(outptr1 + 13 * RGB_PIXELSIZE, rgb1_h, 5);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 13:
+      vst3_lane_u8(outptr0 + 12 * RGB_PIXELSIZE, rgb0_h, 4);
+      vst3_lane_u8(outptr1 + 12 * RGB_PIXELSIZE, rgb1_h, 4);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 12:
+      vst3_lane_u8(outptr0 + 11 * RGB_PIXELSIZE, rgb0_h, 3);
+      vst3_lane_u8(outptr1 + 11 * RGB_PIXELSIZE, rgb1_h, 3);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 11:
+      vst3_lane_u8(outptr0 + 10 * RGB_PIXELSIZE, rgb0_h, 2);
+      vst3_lane_u8(outptr1 + 10 * RGB_PIXELSIZE, rgb1_h, 2);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 10:
+      vst3_lane_u8(outptr0 + 9 * RGB_PIXELSIZE, rgb0_h, 1);
+      vst3_lane_u8(outptr1 + 9 * RGB_PIXELSIZE, rgb1_h, 1);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 9:
+      vst3_lane_u8(outptr0 + 8 * RGB_PIXELSIZE, rgb0_h, 0);
+      vst3_lane_u8(outptr1 + 8 * RGB_PIXELSIZE, rgb1_h, 0);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 8:
+      vst3_u8(outptr0, rgb0_l);
+      vst3_u8(outptr1, rgb1_l);
+      break;
+    case 7:
+      vst3_lane_u8(outptr0 + 6 * RGB_PIXELSIZE, rgb0_l, 6);
+      vst3_lane_u8(outptr1 + 6 * RGB_PIXELSIZE, rgb1_l, 6);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 6:
+      vst3_lane_u8(outptr0 + 5 * RGB_PIXELSIZE, rgb0_l, 5);
+      vst3_lane_u8(outptr1 + 5 * RGB_PIXELSIZE, rgb1_l, 5);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 5:
+      vst3_lane_u8(outptr0 + 4 * RGB_PIXELSIZE, rgb0_l, 4);
+      vst3_lane_u8(outptr1 + 4 * RGB_PIXELSIZE, rgb1_l, 4);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 4:
+      vst3_lane_u8(outptr0 + 3 * RGB_PIXELSIZE, rgb0_l, 3);
+      vst3_lane_u8(outptr1 + 3 * RGB_PIXELSIZE, rgb1_l, 3);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 3:
+      vst3_lane_u8(outptr0 + 2 * RGB_PIXELSIZE, rgb0_l, 2);
+      vst3_lane_u8(outptr1 + 2 * RGB_PIXELSIZE, rgb1_l, 2);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 2:
+      vst3_lane_u8(outptr0 + 1 * RGB_PIXELSIZE, rgb0_l, 1);
+      vst3_lane_u8(outptr1 + 1 * RGB_PIXELSIZE, rgb1_l, 1);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 1:
+      vst3_lane_u8(outptr0, rgb0_l, 0);
+      vst3_lane_u8(outptr1, rgb1_l, 0);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    default:
+      break;
+    }
+#endif
+  }
+}
diff --git a/media/libjpeg/simd/arm/jdsample-neon.c b/media/libjpeg/simd/arm/jdsample-neon.c
new file mode 100644
index 0000000000..90ec6782c4
--- /dev/null
+++ b/media/libjpeg/simd/arm/jdsample-neon.c
@@ -0,0 +1,569 @@
+/*
+ * jdsample-neon.c - upsampling (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ * Copyright (C) 2020, D. R. Commander.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+
+#include <arm_neon.h>
+
+
+/* The diagram below shows a row of samples produced by h2v1 downsampling.
+ *
+ *                s0        s1        s2
+ *            +---------+---------+---------+
+ *            |         |         |         |
+ *            | p0   p1 | p2   p3 | p4   p5 |
+ *            |         |         |         |
+ *            +---------+---------+---------+
+ *
+ * Samples s0-s2 were created by averaging the original pixel component values
+ * centered at positions p0-p5 above.  To approximate those original pixel
+ * component values, we proportionally blend the adjacent samples in each row.
+ *
+ * An upsampled pixel component value is computed by blending the sample
+ * containing the pixel center with the nearest neighboring sample, in the
+ * ratio 3:1.  For example:
+ *     p1(upsampled) = 3/4 * s0 + 1/4 * s1
+ *     p2(upsampled) = 3/4 * s1 + 1/4 * s0
+ * When computing the first and last pixel component values in the row, there
+ * is no adjacent sample to blend, so:
+ *     p0(upsampled) = s0
+ *     p5(upsampled) = s2
+ */
+
+void jsimd_h2v1_fancy_upsample_neon(int max_v_samp_factor,
+                                    JDIMENSION downsampled_width,
+                                    JSAMPARRAY input_data,
+                                    JSAMPARRAY *output_data_ptr)
+{
+  JSAMPARRAY output_data = *output_data_ptr;
+  JSAMPROW inptr, outptr;
+  int inrow;
+  unsigned colctr;
+  /* Set up constants. */
+  const uint16x8_t one_u16 = vdupq_n_u16(1);
+  const uint8x8_t three_u8 = vdup_n_u8(3);
+
+  for (inrow = 0; inrow < max_v_samp_factor; inrow++) {
+    inptr = input_data[inrow];
+    outptr = output_data[inrow];
+    /* First pixel component value in this row of the original image */
+    *outptr = (JSAMPLE)GETJSAMPLE(*inptr);
+
+    /*    3/4 * containing sample + 1/4 * nearest neighboring sample
+     * For p1: containing sample = s0, nearest neighboring sample = s1
+     * For p2: containing sample = s1, nearest neighboring sample = s0
+     */
+    uint8x16_t s0 = vld1q_u8(inptr);
+    uint8x16_t s1 = vld1q_u8(inptr + 1);
+    /* Multiplication makes vectors twice as wide.  '_l' and '_h' suffixes
+     * denote low half and high half respectively.
+     */
+    uint16x8_t s1_add_3s0_l =
+      vmlal_u8(vmovl_u8(vget_low_u8(s1)), vget_low_u8(s0), three_u8);
+    uint16x8_t s1_add_3s0_h =
+      vmlal_u8(vmovl_u8(vget_high_u8(s1)), vget_high_u8(s0), three_u8);
+    uint16x8_t s0_add_3s1_l =
+      vmlal_u8(vmovl_u8(vget_low_u8(s0)), vget_low_u8(s1), three_u8);
+    uint16x8_t s0_add_3s1_h =
+      vmlal_u8(vmovl_u8(vget_high_u8(s0)), vget_high_u8(s1), three_u8);
+    /* Add ordered dithering bias to odd pixel values. */
+    s0_add_3s1_l = vaddq_u16(s0_add_3s1_l, one_u16);
+    s0_add_3s1_h = vaddq_u16(s0_add_3s1_h, one_u16);
+
+    /* The offset is initially 1, because the first pixel component has already
+     * been stored.  However, in subsequent iterations of the SIMD loop, this
+     * offset is (2 * colctr - 1) to stay within the bounds of the sample
+     * buffers without having to resort to a slow scalar tail case for the last
+     * (downsampled_width % 16) samples.  See "Creation of 2-D sample arrays"
+     * in jmemmgr.c for more details.
+     */
+    unsigned outptr_offset = 1;
+    uint8x16x2_t output_pixels;
+
+    /* We use software pipelining to maximise performance.  The code indented
+     * an extra two spaces begins the next iteration of the loop.
+     */
+    for (colctr = 16; colctr < downsampled_width; colctr += 16) {
+
+        s0 = vld1q_u8(inptr + colctr - 1);
+        s1 = vld1q_u8(inptr + colctr);
+
+      /* Right-shift by 2 (divide by 4), narrow to 8-bit, and combine. */
+      output_pixels.val[0] = vcombine_u8(vrshrn_n_u16(s1_add_3s0_l, 2),
+                                         vrshrn_n_u16(s1_add_3s0_h, 2));
+      output_pixels.val[1] = vcombine_u8(vshrn_n_u16(s0_add_3s1_l, 2),
+                                         vshrn_n_u16(s0_add_3s1_h, 2));
+
+        /* Multiplication makes vectors twice as wide.  '_l' and '_h' suffixes
+         * denote low half and high half respectively.
+         */
+        s1_add_3s0_l =
+          vmlal_u8(vmovl_u8(vget_low_u8(s1)), vget_low_u8(s0), three_u8);
+        s1_add_3s0_h =
+          vmlal_u8(vmovl_u8(vget_high_u8(s1)), vget_high_u8(s0), three_u8);
+        s0_add_3s1_l =
+          vmlal_u8(vmovl_u8(vget_low_u8(s0)), vget_low_u8(s1), three_u8);
+        s0_add_3s1_h =
+          vmlal_u8(vmovl_u8(vget_high_u8(s0)), vget_high_u8(s1), three_u8);
+        /* Add ordered dithering bias to odd pixel values. */
+        s0_add_3s1_l = vaddq_u16(s0_add_3s1_l, one_u16);
+        s0_add_3s1_h = vaddq_u16(s0_add_3s1_h, one_u16);
+
+      /* Store pixel component values to memory. */
+      vst2q_u8(outptr + outptr_offset, output_pixels);
+      outptr_offset = 2 * colctr - 1;
+    }
+
+    /* Complete the last iteration of the loop. */
+
+    /* Right-shift by 2 (divide by 4), narrow to 8-bit, and combine. */
+    output_pixels.val[0] = vcombine_u8(vrshrn_n_u16(s1_add_3s0_l, 2),
+                                       vrshrn_n_u16(s1_add_3s0_h, 2));
+    output_pixels.val[1] = vcombine_u8(vshrn_n_u16(s0_add_3s1_l, 2),
+                                       vshrn_n_u16(s0_add_3s1_h, 2));
+    /* Store pixel component values to memory. */
+    vst2q_u8(outptr + outptr_offset, output_pixels);
+
+    /* Last pixel component value in this row of the original image */
+    outptr[2 * downsampled_width - 1] =
+      GETJSAMPLE(inptr[downsampled_width - 1]);
+  }
+}
+
+
+/* The diagram below shows an array of samples produced by h2v2 downsampling.
+ *
+ *                s0        s1        s2
+ *            +---------+---------+---------+
+ *            | p0   p1 | p2   p3 | p4   p5 |
+ *       sA   |         |         |         |
+ *            | p6   p7 | p8   p9 | p10  p11|
+ *            +---------+---------+---------+
+ *            | p12  p13| p14  p15| p16  p17|
+ *       sB   |         |         |         |
+ *            | p18  p19| p20  p21| p22  p23|
+ *            +---------+---------+---------+
+ *            | p24  p25| p26  p27| p28  p29|
+ *       sC   |         |         |         |
+ *            | p30  p31| p32  p33| p34  p35|
+ *            +---------+---------+---------+
+ *
+ * Samples s0A-s2C were created by averaging the original pixel component
+ * values centered at positions p0-p35 above.  To approximate one of those
+ * original pixel component values, we proportionally blend the sample
+ * containing the pixel center with the nearest neighboring samples in each
+ * row, column, and diagonal.
+ *
+ * An upsampled pixel component value is computed by first blending the sample
+ * containing the pixel center with the nearest neighboring samples in the
+ * same column, in the ratio 3:1, and then blending each column sum with the
+ * nearest neighboring column sum, in the ratio 3:1.  For example:
+ *     p14(upsampled) = 3/4 * (3/4 * s1B + 1/4 * s1A) +
+ *                      1/4 * (3/4 * s0B + 1/4 * s0A)
+ *                    = 9/16 * s1B + 3/16 * s1A + 3/16 * s0B + 1/16 * s0A
+ * When computing the first and last pixel component values in the row, there
+ * is no horizontally adjacent sample to blend, so:
+ *     p12(upsampled) = 3/4 * s0B + 1/4 * s0A
+ *     p23(upsampled) = 3/4 * s2B + 1/4 * s2C
+ * When computing the first and last pixel component values in the column,
+ * there is no vertically adjacent sample to blend, so:
+ *     p2(upsampled) = 3/4 * s1A + 1/4 * s0A
+ *     p33(upsampled) = 3/4 * s1C + 1/4 * s2C
+ * When computing the corner pixel component values, there is no adjacent
+ * sample to blend, so:
+ *     p0(upsampled) = s0A
+ *     p35(upsampled) = s2C
+ */
+
+void jsimd_h2v2_fancy_upsample_neon(int max_v_samp_factor,
+                                    JDIMENSION downsampled_width,
+                                    JSAMPARRAY input_data,
+                                    JSAMPARRAY *output_data_ptr)
+{
+  JSAMPARRAY output_data = *output_data_ptr;
+  JSAMPROW inptr0, inptr1, inptr2, outptr0, outptr1;
+  int inrow, outrow;
+  unsigned colctr;
+  /* Set up constants. */
+  const uint16x8_t seven_u16 = vdupq_n_u16(7);
+  const uint8x8_t three_u8 = vdup_n_u8(3);
+  const uint16x8_t three_u16 = vdupq_n_u16(3);
+
+  inrow = outrow = 0;
+  while (outrow < max_v_samp_factor) {
+    inptr0 = input_data[inrow - 1];
+    inptr1 = input_data[inrow];
+    inptr2 = input_data[inrow + 1];
+    /* Suffixes 0 and 1 denote the upper and lower rows of output pixels,
+     * respectively.
+     */
+    outptr0 = output_data[outrow++];
+    outptr1 = output_data[outrow++];
+
+    /* First pixel component value in this row of the original image */
+    int s0colsum0 = GETJSAMPLE(*inptr1) * 3 + GETJSAMPLE(*inptr0);
+    *outptr0 = (JSAMPLE)((s0colsum0 * 4 + 8) >> 4);
+    int s0colsum1 = GETJSAMPLE(*inptr1) * 3 + GETJSAMPLE(*inptr2);
+    *outptr1 = (JSAMPLE)((s0colsum1 * 4 + 8) >> 4);
+
+    /* Step 1: Blend samples vertically in columns s0 and s1.
+     * Leave the divide by 4 until the end, when it can be done for both
+     * dimensions at once, right-shifting by 4.
+     */
+
+    /* Load and compute s0colsum0 and s0colsum1. */
+    uint8x16_t s0A = vld1q_u8(inptr0);
+    uint8x16_t s0B = vld1q_u8(inptr1);
+    uint8x16_t s0C = vld1q_u8(inptr2);
+    /* Multiplication makes vectors twice as wide.  '_l' and '_h' suffixes
+     * denote low half and high half respectively.
+     */
+    uint16x8_t s0colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s0A)),
+                                      vget_low_u8(s0B), three_u8);
+    uint16x8_t s0colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s0A)),
+                                      vget_high_u8(s0B), three_u8);
+    uint16x8_t s0colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s0C)),
+                                      vget_low_u8(s0B), three_u8);
+    uint16x8_t s0colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s0C)),
+                                      vget_high_u8(s0B), three_u8);
+    /* Load and compute s1colsum0 and s1colsum1. */
+    uint8x16_t s1A = vld1q_u8(inptr0 + 1);
+    uint8x16_t s1B = vld1q_u8(inptr1 + 1);
+    uint8x16_t s1C = vld1q_u8(inptr2 + 1);
+    uint16x8_t s1colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s1A)),
+                                      vget_low_u8(s1B), three_u8);
+    uint16x8_t s1colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s1A)),
+                                      vget_high_u8(s1B), three_u8);
+    uint16x8_t s1colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s1C)),
+                                      vget_low_u8(s1B), three_u8);
+    uint16x8_t s1colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s1C)),
+                                      vget_high_u8(s1B), three_u8);
+
+    /* Step 2: Blend the already-blended columns. */
+
+    uint16x8_t output0_p1_l = vmlaq_u16(s1colsum0_l, s0colsum0_l, three_u16);
+    uint16x8_t output0_p1_h = vmlaq_u16(s1colsum0_h, s0colsum0_h, three_u16);
+    uint16x8_t output0_p2_l = vmlaq_u16(s0colsum0_l, s1colsum0_l, three_u16);
+    uint16x8_t output0_p2_h = vmlaq_u16(s0colsum0_h, s1colsum0_h, three_u16);
+    uint16x8_t output1_p1_l = vmlaq_u16(s1colsum1_l, s0colsum1_l, three_u16);
+    uint16x8_t output1_p1_h = vmlaq_u16(s1colsum1_h, s0colsum1_h, three_u16);
+    uint16x8_t output1_p2_l = vmlaq_u16(s0colsum1_l, s1colsum1_l, three_u16);
+    uint16x8_t output1_p2_h = vmlaq_u16(s0colsum1_h, s1colsum1_h, three_u16);
+    /* Add ordered dithering bias to odd pixel values. */
+    output0_p1_l = vaddq_u16(output0_p1_l, seven_u16);
+    output0_p1_h = vaddq_u16(output0_p1_h, seven_u16);
+    output1_p1_l = vaddq_u16(output1_p1_l, seven_u16);
+    output1_p1_h = vaddq_u16(output1_p1_h, seven_u16);
+    /* Right-shift by 4 (divide by 16), narrow to 8-bit, and combine. */
+    uint8x16x2_t output_pixels0 = { {
+      vcombine_u8(vshrn_n_u16(output0_p1_l, 4), vshrn_n_u16(output0_p1_h, 4)),
+      vcombine_u8(vrshrn_n_u16(output0_p2_l, 4), vrshrn_n_u16(output0_p2_h, 4))
+    } };
+    uint8x16x2_t output_pixels1 = { {
+      vcombine_u8(vshrn_n_u16(output1_p1_l, 4), vshrn_n_u16(output1_p1_h, 4)),
+      vcombine_u8(vrshrn_n_u16(output1_p2_l, 4), vrshrn_n_u16(output1_p2_h, 4))
+    } };
+
+    /* Store pixel component values to memory.
+     * The minimum size of the output buffer for each row is 64 bytes => no
+     * need to worry about buffer overflow here.  See "Creation of 2-D sample
+     * arrays" in jmemmgr.c for more details.
+     */
+    vst2q_u8(outptr0 + 1, output_pixels0);
+    vst2q_u8(outptr1 + 1, output_pixels1);
+
+    /* The first pixel of the image shifted our loads and stores by one byte.
+     * We have to re-align on a 32-byte boundary at some point before the end
+     * of the row (we do it now on the 32/33 pixel boundary) to stay within the
+     * bounds of the sample buffers without having to resort to a slow scalar
+     * tail case for the last (downsampled_width % 16) samples.  See "Creation
+     * of 2-D sample arrays" in jmemmgr.c for more details.
+     */
+    for (colctr = 16; colctr < downsampled_width; colctr += 16) {
+      /* Step 1: Blend samples vertically in columns s0 and s1. */
+
+      /* Load and compute s0colsum0 and s0colsum1. */
+      s0A = vld1q_u8(inptr0 + colctr - 1);
+      s0B = vld1q_u8(inptr1 + colctr - 1);
+      s0C = vld1q_u8(inptr2 + colctr - 1);
+      s0colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s0A)), vget_low_u8(s0B),
+                             three_u8);
+      s0colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s0A)), vget_high_u8(s0B),
+                             three_u8);
+      s0colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s0C)), vget_low_u8(s0B),
+                             three_u8);
+      s0colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s0C)), vget_high_u8(s0B),
+                             three_u8);
+      /* Load and compute s1colsum0 and s1colsum1. */
+      s1A = vld1q_u8(inptr0 + colctr);
+      s1B = vld1q_u8(inptr1 + colctr);
+      s1C = vld1q_u8(inptr2 + colctr);
+      s1colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s1A)), vget_low_u8(s1B),
+                             three_u8);
+      s1colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s1A)), vget_high_u8(s1B),
+                             three_u8);
+      s1colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s1C)), vget_low_u8(s1B),
+                             three_u8);
+      s1colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s1C)), vget_high_u8(s1B),
+                             three_u8);
+
+      /* Step 2: Blend the already-blended columns. */
+
+      output0_p1_l = vmlaq_u16(s1colsum0_l, s0colsum0_l, three_u16);
+      output0_p1_h = vmlaq_u16(s1colsum0_h, s0colsum0_h, three_u16);
+      output0_p2_l = vmlaq_u16(s0colsum0_l, s1colsum0_l, three_u16);
+      output0_p2_h = vmlaq_u16(s0colsum0_h, s1colsum0_h, three_u16);
+      output1_p1_l = vmlaq_u16(s1colsum1_l, s0colsum1_l, three_u16);
+      output1_p1_h = vmlaq_u16(s1colsum1_h, s0colsum1_h, three_u16);
+      output1_p2_l = vmlaq_u16(s0colsum1_l, s1colsum1_l, three_u16);
+      output1_p2_h = vmlaq_u16(s0colsum1_h, s1colsum1_h, three_u16);
+      /* Add ordered dithering bias to odd pixel values. */
+      output0_p1_l = vaddq_u16(output0_p1_l, seven_u16);
+      output0_p1_h = vaddq_u16(output0_p1_h, seven_u16);
+      output1_p1_l = vaddq_u16(output1_p1_l, seven_u16);
+      output1_p1_h = vaddq_u16(output1_p1_h, seven_u16);
+      /* Right-shift by 4 (divide by 16), narrow to 8-bit, and combine. */
+      output_pixels0.val[0] = vcombine_u8(vshrn_n_u16(output0_p1_l, 4),
+                                          vshrn_n_u16(output0_p1_h, 4));
+      output_pixels0.val[1] = vcombine_u8(vrshrn_n_u16(output0_p2_l, 4),
+                                          vrshrn_n_u16(output0_p2_h, 4));
+      output_pixels1.val[0] = vcombine_u8(vshrn_n_u16(output1_p1_l, 4),
+                                          vshrn_n_u16(output1_p1_h, 4));
+      output_pixels1.val[1] = vcombine_u8(vrshrn_n_u16(output1_p2_l, 4),
+                                          vrshrn_n_u16(output1_p2_h, 4));
+      /* Store pixel component values to memory. */
+      vst2q_u8(outptr0 + 2 * colctr - 1, output_pixels0);
+      vst2q_u8(outptr1 + 2 * colctr - 1, output_pixels1);
+    }
+
+    /* Last pixel component value in this row of the original image */
+    int s1colsum0 = GETJSAMPLE(inptr1[downsampled_width - 1]) * 3 +
+                    GETJSAMPLE(inptr0[downsampled_width - 1]);
+    outptr0[2 * downsampled_width - 1] = (JSAMPLE)((s1colsum0 * 4 + 7) >> 4);
+    int s1colsum1 = GETJSAMPLE(inptr1[downsampled_width - 1]) * 3 +
+                    GETJSAMPLE(inptr2[downsampled_width - 1]);
+    outptr1[2 * downsampled_width - 1] = (JSAMPLE)((s1colsum1 * 4 + 7) >> 4);
+    inrow++;
+  }
+}
+
+
+/* The diagram below shows a column of samples produced by h1v2 downsampling
+ * (or by losslessly rotating or transposing an h2v1-downsampled image.)
+ *
+ *            +---------+
+ *            |   p0    |
+ *     sA     |         |
+ *            |   p1    |
+ *            +---------+
+ *            |   p2    |
+ *     sB     |         |
+ *            |   p3    |
+ *            +---------+
+ *            |   p4    |
+ *     sC     |         |
+ *            |   p5    |
+ *            +---------+
+ *
+ * Samples sA-sC were created by averaging the original pixel component values
+ * centered at positions p0-p5 above.  To approximate those original pixel
+ * component values, we proportionally blend the adjacent samples in each
+ * column.
+ *
+ * An upsampled pixel component value is computed by blending the sample
+ * containing the pixel center with the nearest neighboring sample, in the
+ * ratio 3:1.  For example:
+ *     p1(upsampled) = 3/4 * sA + 1/4 * sB
+ *     p2(upsampled) = 3/4 * sB + 1/4 * sA
+ * When computing the first and last pixel component values in the column,
+ * there is no adjacent sample to blend, so:
+ *     p0(upsampled) = sA
+ *     p5(upsampled) = sC
+ */
+
+void jsimd_h1v2_fancy_upsample_neon(int max_v_samp_factor,
+                                    JDIMENSION downsampled_width,
+                                    JSAMPARRAY input_data,
+                                    JSAMPARRAY *output_data_ptr)
+{
+  JSAMPARRAY output_data = *output_data_ptr;
+  JSAMPROW inptr0, inptr1, inptr2, outptr0, outptr1;
+  int inrow, outrow;
+  unsigned colctr;
+  /* Set up constants. */
+  const uint16x8_t one_u16 = vdupq_n_u16(1);
+  const uint8x8_t three_u8 = vdup_n_u8(3);
+
+  inrow = outrow = 0;
+  while (outrow < max_v_samp_factor) {
+    inptr0 = input_data[inrow - 1];
+    inptr1 = input_data[inrow];
+    inptr2 = input_data[inrow + 1];
+    /* Suffixes 0 and 1 denote the upper and lower rows of output pixels,
+     * respectively.
+     */
+    outptr0 = output_data[outrow++];
+    outptr1 = output_data[outrow++];
+    inrow++;
+
+    /* The size of the input and output buffers is always a multiple of 32
+     * bytes => no need to worry about buffer overflow when reading/writing
+     * memory.  See "Creation of 2-D sample arrays" in jmemmgr.c for more
+     * details.
+     */
+    for (colctr = 0; colctr < downsampled_width; colctr += 16) {
+      /* Load samples. */
+      uint8x16_t sA = vld1q_u8(inptr0 + colctr);
+      uint8x16_t sB = vld1q_u8(inptr1 + colctr);
+      uint8x16_t sC = vld1q_u8(inptr2 + colctr);
+      /* Blend samples vertically. */
+      uint16x8_t colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(sA)),
+                                      vget_low_u8(sB), three_u8);
+      uint16x8_t colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(sA)),
+                                      vget_high_u8(sB), three_u8);
+      uint16x8_t colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(sC)),
+                                      vget_low_u8(sB), three_u8);
+      uint16x8_t colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(sC)),
+                                      vget_high_u8(sB), three_u8);
+      /* Add ordered dithering bias to pixel values in even output rows. */
+      colsum0_l = vaddq_u16(colsum0_l, one_u16);
+      colsum0_h = vaddq_u16(colsum0_h, one_u16);
+      /* Right-shift by 2 (divide by 4), narrow to 8-bit, and combine. */
+      uint8x16_t output_pixels0 = vcombine_u8(vshrn_n_u16(colsum0_l, 2),
+                                              vshrn_n_u16(colsum0_h, 2));
+      uint8x16_t output_pixels1 = vcombine_u8(vrshrn_n_u16(colsum1_l, 2),
+                                              vrshrn_n_u16(colsum1_h, 2));
+      /* Store pixel component values to memory. */
+      vst1q_u8(outptr0 + colctr, output_pixels0);
+      vst1q_u8(outptr1 + colctr, output_pixels1);
+    }
+  }
+}
+
+
+/* The diagram below shows a row of samples produced by h2v1 downsampling.
+ *
+ *                s0        s1
+ *            +---------+---------+
+ *            |         |         |
+ *            | p0   p1 | p2   p3 |
+ *            |         |         |
+ *            +---------+---------+
+ *
+ * Samples s0 and s1 were created by averaging the original pixel component
+ * values centered at positions p0-p3 above.  To approximate those original
+ * pixel component values, we duplicate the samples horizontally:
+ *     p0(upsampled) = p1(upsampled) = s0
+ *     p2(upsampled) = p3(upsampled) = s1
+ */
+
+void jsimd_h2v1_upsample_neon(int max_v_samp_factor, JDIMENSION output_width,
+                              JSAMPARRAY input_data,
+                              JSAMPARRAY *output_data_ptr)
+{
+  JSAMPARRAY output_data = *output_data_ptr;
+  JSAMPROW inptr, outptr;
+  int inrow;
+  unsigned colctr;
+
+  for (inrow = 0; inrow < max_v_samp_factor; inrow++) {
+    inptr = input_data[inrow];
+    outptr = output_data[inrow];
+    for (colctr = 0; 2 * colctr < output_width; colctr += 16) {
+      uint8x16_t samples = vld1q_u8(inptr + colctr);
+      /* Duplicate the samples.  The store operation below interleaves them so
+       * that adjacent pixel component values take on the same sample value,
+       * per above.
+       */
+      uint8x16x2_t output_pixels = { { samples, samples } };
+      /* Store pixel component values to memory.
+       * Due to the way sample buffers are allocated, we don't need to worry
+       * about tail cases when output_width is not a multiple of 32.  See
+       * "Creation of 2-D sample arrays" in jmemmgr.c for details.
+       */
+      vst2q_u8(outptr + 2 * colctr, output_pixels);
+    }
+  }
+}
+
+
+/* The diagram below shows an array of samples produced by h2v2 downsampling.
+ *
+ *                s0        s1
+ *            +---------+---------+
+ *            | p0   p1 | p2   p3 |
+ *       sA   |         |         |
+ *            | p4   p5 | p6   p7 |
+ *            +---------+---------+
+ *            | p8   p9 | p10  p11|
+ *       sB   |         |         |
+ *            | p12  p13| p14  p15|
+ *            +---------+---------+
+ *
+ * Samples s0A-s1B were created by averaging the original pixel component
+ * values centered at positions p0-p15 above.  To approximate those original
+ * pixel component values, we duplicate the samples both horizontally and
+ * vertically:
+ *     p0(upsampled) = p1(upsampled) = p4(upsampled) = p5(upsampled) = s0A
+ *     p2(upsampled) = p3(upsampled) = p6(upsampled) = p7(upsampled) = s1A
+ *     p8(upsampled) = p9(upsampled) = p12(upsampled) = p13(upsampled) = s0B
+ *     p10(upsampled) = p11(upsampled) = p14(upsampled) = p15(upsampled) = s1B
+ */
+
+void jsimd_h2v2_upsample_neon(int max_v_samp_factor, JDIMENSION output_width,
+                              JSAMPARRAY input_data,
+                              JSAMPARRAY *output_data_ptr)
+{
+  JSAMPARRAY output_data = *output_data_ptr;
+  JSAMPROW inptr, outptr0, outptr1;
+  int inrow, outrow;
+  unsigned colctr;
+
+  for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) {
+    inptr = input_data[inrow];
+    outptr0 = output_data[outrow++];
+    outptr1 = output_data[outrow++];
+
+    for (colctr = 0; 2 * colctr < output_width; colctr += 16) {
+      uint8x16_t samples = vld1q_u8(inptr + colctr);
+      /* Duplicate the samples.  The store operation below interleaves them so
+       * that adjacent pixel component values take on the same sample value,
+       * per above.
+       */
+      uint8x16x2_t output_pixels = { { samples, samples } };
+      /* Store pixel component values for both output rows to memory.
+       * Due to the way sample buffers are allocated, we don't need to worry
+       * about tail cases when output_width is not a multiple of 32.  See
+       * "Creation of 2-D sample arrays" in jmemmgr.c for details.
+       */
+      vst2q_u8(outptr0 + 2 * colctr, output_pixels);
+      vst2q_u8(outptr1 + 2 * colctr, output_pixels);
+    }
+  }
+}
diff --git a/media/libjpeg/simd/arm/jfdctfst-neon.c b/media/libjpeg/simd/arm/jfdctfst-neon.c
new file mode 100644
index 0000000000..bb371be399
--- /dev/null
+++ b/media/libjpeg/simd/arm/jfdctfst-neon.c
@@ -0,0 +1,214 @@
+/*
+ * jfdctfst-neon.c - fast integer FDCT (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "align.h"
+
+#include <arm_neon.h>
+
+
+/* jsimd_fdct_ifast_neon() performs a fast, not so accurate forward DCT
+ * (Discrete Cosine Transform) on one block of samples.  It uses the same
+ * calculations and produces exactly the same output as IJG's original
+ * jpeg_fdct_ifast() function, which can be found in jfdctfst.c.
+ *
+ * Scaled integer constants are used to avoid floating-point arithmetic:
+ *    0.382683433 = 12544 * 2^-15
+ *    0.541196100 = 17795 * 2^-15
+ *    0.707106781 = 23168 * 2^-15
+ *    0.306562965 =  9984 * 2^-15
+ *
+ * See jfdctfst.c for further details of the DCT algorithm.  Where possible,
+ * the variable names and comments here in jsimd_fdct_ifast_neon() match up
+ * with those in jpeg_fdct_ifast().
+ */
+
+#define F_0_382  12544
+#define F_0_541  17792
+#define F_0_707  23168
+#define F_0_306  9984
+
+
+ALIGN(16) static const int16_t jsimd_fdct_ifast_neon_consts[] = {
+  F_0_382, F_0_541, F_0_707, F_0_306
+};
+
+void jsimd_fdct_ifast_neon(DCTELEM *data)
+{
+  /* Load an 8x8 block of samples into Neon registers.  De-interleaving loads
+   * are used, followed by vuzp to transpose the block such that we have a
+   * column of samples per vector - allowing all rows to be processed at once.
+   */
+  int16x8x4_t data1 = vld4q_s16(data);
+  int16x8x4_t data2 = vld4q_s16(data + 4 * DCTSIZE);
+
+  int16x8x2_t cols_04 = vuzpq_s16(data1.val[0], data2.val[0]);
+  int16x8x2_t cols_15 = vuzpq_s16(data1.val[1], data2.val[1]);
+  int16x8x2_t cols_26 = vuzpq_s16(data1.val[2], data2.val[2]);
+  int16x8x2_t cols_37 = vuzpq_s16(data1.val[3], data2.val[3]);
+
+  int16x8_t col0 = cols_04.val[0];
+  int16x8_t col1 = cols_15.val[0];
+  int16x8_t col2 = cols_26.val[0];
+  int16x8_t col3 = cols_37.val[0];
+  int16x8_t col4 = cols_04.val[1];
+  int16x8_t col5 = cols_15.val[1];
+  int16x8_t col6 = cols_26.val[1];
+  int16x8_t col7 = cols_37.val[1];
+
+  /* Pass 1: process rows. */
+
+  /* Load DCT conversion constants. */
+  const int16x4_t consts = vld1_s16(jsimd_fdct_ifast_neon_consts);
+
+  int16x8_t tmp0 = vaddq_s16(col0, col7);
+  int16x8_t tmp7 = vsubq_s16(col0, col7);
+  int16x8_t tmp1 = vaddq_s16(col1, col6);
+  int16x8_t tmp6 = vsubq_s16(col1, col6);
+  int16x8_t tmp2 = vaddq_s16(col2, col5);
+  int16x8_t tmp5 = vsubq_s16(col2, col5);
+  int16x8_t tmp3 = vaddq_s16(col3, col4);
+  int16x8_t tmp4 = vsubq_s16(col3, col4);
+
+  /* Even part */
+  int16x8_t tmp10 = vaddq_s16(tmp0, tmp3);    /* phase 2 */
+  int16x8_t tmp13 = vsubq_s16(tmp0, tmp3);
+  int16x8_t tmp11 = vaddq_s16(tmp1, tmp2);
+  int16x8_t tmp12 = vsubq_s16(tmp1, tmp2);
+
+  col0 = vaddq_s16(tmp10, tmp11);             /* phase 3 */
+  col4 = vsubq_s16(tmp10, tmp11);
+
+  int16x8_t z1 = vqdmulhq_lane_s16(vaddq_s16(tmp12, tmp13), consts, 2);
+  col2 = vaddq_s16(tmp13, z1);                /* phase 5 */
+  col6 = vsubq_s16(tmp13, z1);
+
+  /* Odd part */
+  tmp10 = vaddq_s16(tmp4, tmp5);              /* phase 2 */
+  tmp11 = vaddq_s16(tmp5, tmp6);
+  tmp12 = vaddq_s16(tmp6, tmp7);
+
+  int16x8_t z5 = vqdmulhq_lane_s16(vsubq_s16(tmp10, tmp12), consts, 0);
+  int16x8_t z2 = vqdmulhq_lane_s16(tmp10, consts, 1);
+  z2 = vaddq_s16(z2, z5);
+  int16x8_t z4 = vqdmulhq_lane_s16(tmp12, consts, 3);
+  z5 = vaddq_s16(tmp12, z5);
+  z4 = vaddq_s16(z4, z5);
+  int16x8_t z3 = vqdmulhq_lane_s16(tmp11, consts, 2);
+
+  int16x8_t z11 = vaddq_s16(tmp7, z3);        /* phase 5 */
+  int16x8_t z13 = vsubq_s16(tmp7, z3);
+
+  col5 = vaddq_s16(z13, z2);                  /* phase 6 */
+  col3 = vsubq_s16(z13, z2);
+  col1 = vaddq_s16(z11, z4);
+  col7 = vsubq_s16(z11, z4);
+
+  /* Transpose to work on columns in pass 2. */
+  int16x8x2_t cols_01 = vtrnq_s16(col0, col1);
+  int16x8x2_t cols_23 = vtrnq_s16(col2, col3);
+  int16x8x2_t cols_45 = vtrnq_s16(col4, col5);
+  int16x8x2_t cols_67 = vtrnq_s16(col6, col7);
+
+  int32x4x2_t cols_0145_l = vtrnq_s32(vreinterpretq_s32_s16(cols_01.val[0]),
+                                      vreinterpretq_s32_s16(cols_45.val[0]));
+  int32x4x2_t cols_0145_h = vtrnq_s32(vreinterpretq_s32_s16(cols_01.val[1]),
+                                      vreinterpretq_s32_s16(cols_45.val[1]));
+  int32x4x2_t cols_2367_l = vtrnq_s32(vreinterpretq_s32_s16(cols_23.val[0]),
+                                      vreinterpretq_s32_s16(cols_67.val[0]));
+  int32x4x2_t cols_2367_h = vtrnq_s32(vreinterpretq_s32_s16(cols_23.val[1]),
+                                      vreinterpretq_s32_s16(cols_67.val[1]));
+
+  int32x4x2_t rows_04 = vzipq_s32(cols_0145_l.val[0], cols_2367_l.val[0]);
+  int32x4x2_t rows_15 = vzipq_s32(cols_0145_h.val[0], cols_2367_h.val[0]);
+  int32x4x2_t rows_26 = vzipq_s32(cols_0145_l.val[1], cols_2367_l.val[1]);
+  int32x4x2_t rows_37 = vzipq_s32(cols_0145_h.val[1], cols_2367_h.val[1]);
+
+  int16x8_t row0 = vreinterpretq_s16_s32(rows_04.val[0]);
+  int16x8_t row1 = vreinterpretq_s16_s32(rows_15.val[0]);
+  int16x8_t row2 = vreinterpretq_s16_s32(rows_26.val[0]);
+  int16x8_t row3 = vreinterpretq_s16_s32(rows_37.val[0]);
+  int16x8_t row4 = vreinterpretq_s16_s32(rows_04.val[1]);
+  int16x8_t row5 = vreinterpretq_s16_s32(rows_15.val[1]);
+  int16x8_t row6 = vreinterpretq_s16_s32(rows_26.val[1]);
+  int16x8_t row7 = vreinterpretq_s16_s32(rows_37.val[1]);
+
+  /* Pass 2: process columns. */
+
+  tmp0 = vaddq_s16(row0, row7);
+  tmp7 = vsubq_s16(row0, row7);
+  tmp1 = vaddq_s16(row1, row6);
+  tmp6 = vsubq_s16(row1, row6);
+  tmp2 = vaddq_s16(row2, row5);
+  tmp5 = vsubq_s16(row2, row5);
+  tmp3 = vaddq_s16(row3, row4);
+  tmp4 = vsubq_s16(row3, row4);
+
+  /* Even part */
+  tmp10 = vaddq_s16(tmp0, tmp3);              /* phase 2 */
+  tmp13 = vsubq_s16(tmp0, tmp3);
+  tmp11 = vaddq_s16(tmp1, tmp2);
+  tmp12 = vsubq_s16(tmp1, tmp2);
+
+  row0 = vaddq_s16(tmp10, tmp11);             /* phase 3 */
+  row4 = vsubq_s16(tmp10, tmp11);
+
+  z1 = vqdmulhq_lane_s16(vaddq_s16(tmp12, tmp13), consts, 2);
+  row2 = vaddq_s16(tmp13, z1);                /* phase 5 */
+  row6 = vsubq_s16(tmp13, z1);
+
+  /* Odd part */
+  tmp10 = vaddq_s16(tmp4, tmp5);              /* phase 2 */
+  tmp11 = vaddq_s16(tmp5, tmp6);
+  tmp12 = vaddq_s16(tmp6, tmp7);
+
+  z5 = vqdmulhq_lane_s16(vsubq_s16(tmp10, tmp12), consts, 0);
+  z2 = vqdmulhq_lane_s16(tmp10, consts, 1);
+  z2 = vaddq_s16(z2, z5);
+  z4 = vqdmulhq_lane_s16(tmp12, consts, 3);
+  z5 = vaddq_s16(tmp12, z5);
+  z4 = vaddq_s16(z4, z5);
+  z3 = vqdmulhq_lane_s16(tmp11, consts, 2);
+
+  z11 = vaddq_s16(tmp7, z3);                  /* phase 5 */
+  z13 = vsubq_s16(tmp7, z3);
+
+  row5 = vaddq_s16(z13, z2);                  /* phase 6 */
+  row3 = vsubq_s16(z13, z2);
+  row1 = vaddq_s16(z11, z4);
+  row7 = vsubq_s16(z11, z4);
+
+  vst1q_s16(data + 0 * DCTSIZE, row0);
+  vst1q_s16(data + 1 * DCTSIZE, row1);
+  vst1q_s16(data + 2 * DCTSIZE, row2);
+  vst1q_s16(data + 3 * DCTSIZE, row3);
+  vst1q_s16(data + 4 * DCTSIZE, row4);
+  vst1q_s16(data + 5 * DCTSIZE, row5);
+  vst1q_s16(data + 6 * DCTSIZE, row6);
+  vst1q_s16(data + 7 * DCTSIZE, row7);
+}
diff --git a/media/libjpeg/simd/arm/jfdctint-neon.c b/media/libjpeg/simd/arm/jfdctint-neon.c
new file mode 100644
index 0000000000..ccfc07b15d
--- /dev/null
+++ b/media/libjpeg/simd/arm/jfdctint-neon.c
@@ -0,0 +1,376 @@
+/*
+ * jfdctint-neon.c - accurate integer FDCT (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ * Copyright (C) 2020, D. R. Commander.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "align.h"
+#include "neon-compat.h"
+
+#include <arm_neon.h>
+
+
+/* jsimd_fdct_islow_neon() performs a slower but more accurate forward DCT
+ * (Discrete Cosine Transform) on one block of samples.  It uses the same
+ * calculations and produces exactly the same output as IJG's original
+ * jpeg_fdct_islow() function, which can be found in jfdctint.c.
+ *
+ * Scaled integer constants are used to avoid floating-point arithmetic:
+ *    0.298631336 =  2446 * 2^-13
+ *    0.390180644 =  3196 * 2^-13
+ *    0.541196100 =  4433 * 2^-13
+ *    0.765366865 =  6270 * 2^-13
+ *    0.899976223 =  7373 * 2^-13
+ *    1.175875602 =  9633 * 2^-13
+ *    1.501321110 = 12299 * 2^-13
+ *    1.847759065 = 15137 * 2^-13
+ *    1.961570560 = 16069 * 2^-13
+ *    2.053119869 = 16819 * 2^-13
+ *    2.562915447 = 20995 * 2^-13
+ *    3.072711026 = 25172 * 2^-13
+ *
+ * See jfdctint.c for further details of the DCT algorithm.  Where possible,
+ * the variable names and comments here in jsimd_fdct_islow_neon() match up
+ * with those in jpeg_fdct_islow().
+ */
+
+#define CONST_BITS  13
+#define PASS1_BITS  2
+
+#define DESCALE_P1  (CONST_BITS - PASS1_BITS)
+#define DESCALE_P2  (CONST_BITS + PASS1_BITS)
+
+#define F_0_298  2446
+#define F_0_390  3196
+#define F_0_541  4433
+#define F_0_765  6270
+#define F_0_899  7373
+#define F_1_175  9633
+#define F_1_501  12299
+#define F_1_847  15137
+#define F_1_961  16069
+#define F_2_053  16819
+#define F_2_562  20995
+#define F_3_072  25172
+
+
+ALIGN(16) static const int16_t jsimd_fdct_islow_neon_consts[] = {
+  F_0_298, -F_0_390,  F_0_541,  F_0_765,
+ -F_0_899,  F_1_175,  F_1_501, -F_1_847,
+ -F_1_961,  F_2_053, -F_2_562,  F_3_072
+};
+
+void jsimd_fdct_islow_neon(DCTELEM *data)
+{
+  /* Load DCT constants. */
+#ifdef HAVE_VLD1_S16_X3
+  const int16x4x3_t consts = vld1_s16_x3(jsimd_fdct_islow_neon_consts);
+#else
+  /* GCC does not currently support the intrinsic vld1_<type>_x3(). */
+  const int16x4_t consts1 = vld1_s16(jsimd_fdct_islow_neon_consts);
+  const int16x4_t consts2 = vld1_s16(jsimd_fdct_islow_neon_consts + 4);
+  const int16x4_t consts3 = vld1_s16(jsimd_fdct_islow_neon_consts + 8);
+  const int16x4x3_t consts = { { consts1, consts2, consts3 } };
+#endif
+
+  /* Load an 8x8 block of samples into Neon registers.  De-interleaving loads
+   * are used, followed by vuzp to transpose the block such that we have a
+   * column of samples per vector - allowing all rows to be processed at once.
+   */
+  int16x8x4_t s_rows_0123 = vld4q_s16(data);
+  int16x8x4_t s_rows_4567 = vld4q_s16(data + 4 * DCTSIZE);
+
+  int16x8x2_t cols_04 = vuzpq_s16(s_rows_0123.val[0], s_rows_4567.val[0]);
+  int16x8x2_t cols_15 = vuzpq_s16(s_rows_0123.val[1], s_rows_4567.val[1]);
+  int16x8x2_t cols_26 = vuzpq_s16(s_rows_0123.val[2], s_rows_4567.val[2]);
+  int16x8x2_t cols_37 = vuzpq_s16(s_rows_0123.val[3], s_rows_4567.val[3]);
+
+  int16x8_t col0 = cols_04.val[0];
+  int16x8_t col1 = cols_15.val[0];
+  int16x8_t col2 = cols_26.val[0];
+  int16x8_t col3 = cols_37.val[0];
+  int16x8_t col4 = cols_04.val[1];
+  int16x8_t col5 = cols_15.val[1];
+  int16x8_t col6 = cols_26.val[1];
+  int16x8_t col7 = cols_37.val[1];
+
+  /* Pass 1: process rows. */
+
+  int16x8_t tmp0 = vaddq_s16(col0, col7);
+  int16x8_t tmp7 = vsubq_s16(col0, col7);
+  int16x8_t tmp1 = vaddq_s16(col1, col6);
+  int16x8_t tmp6 = vsubq_s16(col1, col6);
+  int16x8_t tmp2 = vaddq_s16(col2, col5);
+  int16x8_t tmp5 = vsubq_s16(col2, col5);
+  int16x8_t tmp3 = vaddq_s16(col3, col4);
+  int16x8_t tmp4 = vsubq_s16(col3, col4);
+
+  /* Even part */
+  int16x8_t tmp10 = vaddq_s16(tmp0, tmp3);
+  int16x8_t tmp13 = vsubq_s16(tmp0, tmp3);
+  int16x8_t tmp11 = vaddq_s16(tmp1, tmp2);
+  int16x8_t tmp12 = vsubq_s16(tmp1, tmp2);
+
+  col0 = vshlq_n_s16(vaddq_s16(tmp10, tmp11), PASS1_BITS);
+  col4 = vshlq_n_s16(vsubq_s16(tmp10, tmp11), PASS1_BITS);
+
+  int16x8_t tmp12_add_tmp13 = vaddq_s16(tmp12, tmp13);
+  int32x4_t z1_l =
+    vmull_lane_s16(vget_low_s16(tmp12_add_tmp13), consts.val[0], 2);
+  int32x4_t z1_h =
+    vmull_lane_s16(vget_high_s16(tmp12_add_tmp13), consts.val[0], 2);
+
+  int32x4_t col2_scaled_l =
+    vmlal_lane_s16(z1_l, vget_low_s16(tmp13), consts.val[0], 3);
+  int32x4_t col2_scaled_h =
+    vmlal_lane_s16(z1_h, vget_high_s16(tmp13), consts.val[0], 3);
+  col2 = vcombine_s16(vrshrn_n_s32(col2_scaled_l, DESCALE_P1),
+                      vrshrn_n_s32(col2_scaled_h, DESCALE_P1));
+
+  int32x4_t col6_scaled_l =
+    vmlal_lane_s16(z1_l, vget_low_s16(tmp12), consts.val[1], 3);
+  int32x4_t col6_scaled_h =
+    vmlal_lane_s16(z1_h, vget_high_s16(tmp12), consts.val[1], 3);
+  col6 = vcombine_s16(vrshrn_n_s32(col6_scaled_l, DESCALE_P1),
+                      vrshrn_n_s32(col6_scaled_h, DESCALE_P1));
+
+  /* Odd part */
+  int16x8_t z1 = vaddq_s16(tmp4, tmp7);
+  int16x8_t z2 = vaddq_s16(tmp5, tmp6);
+  int16x8_t z3 = vaddq_s16(tmp4, tmp6);
+  int16x8_t z4 = vaddq_s16(tmp5, tmp7);
+  /* sqrt(2) * c3 */
+  int32x4_t z5_l = vmull_lane_s16(vget_low_s16(z3), consts.val[1], 1);
+  int32x4_t z5_h = vmull_lane_s16(vget_high_s16(z3), consts.val[1], 1);
+  z5_l = vmlal_lane_s16(z5_l, vget_low_s16(z4), consts.val[1], 1);
+  z5_h = vmlal_lane_s16(z5_h, vget_high_s16(z4), consts.val[1], 1);
+
+  /* sqrt(2) * (-c1+c3+c5-c7) */
+  int32x4_t tmp4_l = vmull_lane_s16(vget_low_s16(tmp4), consts.val[0], 0);
+  int32x4_t tmp4_h = vmull_lane_s16(vget_high_s16(tmp4), consts.val[0], 0);
+  /* sqrt(2) * ( c1+c3-c5+c7) */
+  int32x4_t tmp5_l = vmull_lane_s16(vget_low_s16(tmp5), consts.val[2], 1);
+  int32x4_t tmp5_h = vmull_lane_s16(vget_high_s16(tmp5), consts.val[2], 1);
+  /* sqrt(2) * ( c1+c3+c5-c7) */
+  int32x4_t tmp6_l = vmull_lane_s16(vget_low_s16(tmp6), consts.val[2], 3);
+  int32x4_t tmp6_h = vmull_lane_s16(vget_high_s16(tmp6), consts.val[2], 3);
+  /* sqrt(2) * ( c1+c3-c5-c7) */
+  int32x4_t tmp7_l = vmull_lane_s16(vget_low_s16(tmp7), consts.val[1], 2);
+  int32x4_t tmp7_h = vmull_lane_s16(vget_high_s16(tmp7), consts.val[1], 2);
+
+  /* sqrt(2) * (c7-c3) */
+  z1_l = vmull_lane_s16(vget_low_s16(z1), consts.val[1], 0);
+  z1_h = vmull_lane_s16(vget_high_s16(z1), consts.val[1], 0);
+  /* sqrt(2) * (-c1-c3) */
+  int32x4_t z2_l = vmull_lane_s16(vget_low_s16(z2), consts.val[2], 2);
+  int32x4_t z2_h = vmull_lane_s16(vget_high_s16(z2), consts.val[2], 2);
+  /* sqrt(2) * (-c3-c5) */
+  int32x4_t z3_l = vmull_lane_s16(vget_low_s16(z3), consts.val[2], 0);
+  int32x4_t z3_h = vmull_lane_s16(vget_high_s16(z3), consts.val[2], 0);
+  /* sqrt(2) * (c5-c3) */
+  int32x4_t z4_l = vmull_lane_s16(vget_low_s16(z4), consts.val[0], 1);
+  int32x4_t z4_h = vmull_lane_s16(vget_high_s16(z4), consts.val[0], 1);
+
+  z3_l = vaddq_s32(z3_l, z5_l);
+  z3_h = vaddq_s32(z3_h, z5_h);
+  z4_l = vaddq_s32(z4_l, z5_l);
+  z4_h = vaddq_s32(z4_h, z5_h);
+
+  tmp4_l = vaddq_s32(tmp4_l, z1_l);
+  tmp4_h = vaddq_s32(tmp4_h, z1_h);
+  tmp4_l = vaddq_s32(tmp4_l, z3_l);
+  tmp4_h = vaddq_s32(tmp4_h, z3_h);
+  col7 = vcombine_s16(vrshrn_n_s32(tmp4_l, DESCALE_P1),
+                      vrshrn_n_s32(tmp4_h, DESCALE_P1));
+
+  tmp5_l = vaddq_s32(tmp5_l, z2_l);
+  tmp5_h = vaddq_s32(tmp5_h, z2_h);
+  tmp5_l = vaddq_s32(tmp5_l, z4_l);
+  tmp5_h = vaddq_s32(tmp5_h, z4_h);
+  col5 = vcombine_s16(vrshrn_n_s32(tmp5_l, DESCALE_P1),
+                      vrshrn_n_s32(tmp5_h, DESCALE_P1));
+
+  tmp6_l = vaddq_s32(tmp6_l, z2_l);
+  tmp6_h = vaddq_s32(tmp6_h, z2_h);
+  tmp6_l = vaddq_s32(tmp6_l, z3_l);
+  tmp6_h = vaddq_s32(tmp6_h, z3_h);
+  col3 = vcombine_s16(vrshrn_n_s32(tmp6_l, DESCALE_P1),
+                      vrshrn_n_s32(tmp6_h, DESCALE_P1));
+
+  tmp7_l = vaddq_s32(tmp7_l, z1_l);
+  tmp7_h = vaddq_s32(tmp7_h, z1_h);
+  tmp7_l = vaddq_s32(tmp7_l, z4_l);
+  tmp7_h = vaddq_s32(tmp7_h, z4_h);
+  col1 = vcombine_s16(vrshrn_n_s32(tmp7_l, DESCALE_P1),
+                      vrshrn_n_s32(tmp7_h, DESCALE_P1));
+
+  /* Transpose to work on columns in pass 2. */
+  int16x8x2_t cols_01 = vtrnq_s16(col0, col1);
+  int16x8x2_t cols_23 = vtrnq_s16(col2, col3);
+  int16x8x2_t cols_45 = vtrnq_s16(col4, col5);
+  int16x8x2_t cols_67 = vtrnq_s16(col6, col7);
+
+  int32x4x2_t cols_0145_l = vtrnq_s32(vreinterpretq_s32_s16(cols_01.val[0]),
+                                      vreinterpretq_s32_s16(cols_45.val[0]));
+  int32x4x2_t cols_0145_h = vtrnq_s32(vreinterpretq_s32_s16(cols_01.val[1]),
+                                      vreinterpretq_s32_s16(cols_45.val[1]));
+  int32x4x2_t cols_2367_l = vtrnq_s32(vreinterpretq_s32_s16(cols_23.val[0]),
+                                      vreinterpretq_s32_s16(cols_67.val[0]));
+  int32x4x2_t cols_2367_h = vtrnq_s32(vreinterpretq_s32_s16(cols_23.val[1]),
+                                      vreinterpretq_s32_s16(cols_67.val[1]));
+
+  int32x4x2_t rows_04 = vzipq_s32(cols_0145_l.val[0], cols_2367_l.val[0]);
+  int32x4x2_t rows_15 = vzipq_s32(cols_0145_h.val[0], cols_2367_h.val[0]);
+  int32x4x2_t rows_26 = vzipq_s32(cols_0145_l.val[1], cols_2367_l.val[1]);
+  int32x4x2_t rows_37 = vzipq_s32(cols_0145_h.val[1], cols_2367_h.val[1]);
+
+  int16x8_t row0 = vreinterpretq_s16_s32(rows_04.val[0]);
+  int16x8_t row1 = vreinterpretq_s16_s32(rows_15.val[0]);
+  int16x8_t row2 = vreinterpretq_s16_s32(rows_26.val[0]);
+  int16x8_t row3 = vreinterpretq_s16_s32(rows_37.val[0]);
+  int16x8_t row4 = vreinterpretq_s16_s32(rows_04.val[1]);
+  int16x8_t row5 = vreinterpretq_s16_s32(rows_15.val[1]);
+  int16x8_t row6 = vreinterpretq_s16_s32(rows_26.val[1]);
+  int16x8_t row7 = vreinterpretq_s16_s32(rows_37.val[1]);
+
+  /* Pass 2: process columns. */
+
+  tmp0 = vaddq_s16(row0, row7);
+  tmp7 = vsubq_s16(row0, row7);
+  tmp1 = vaddq_s16(row1, row6);
+  tmp6 = vsubq_s16(row1, row6);
+  tmp2 = vaddq_s16(row2, row5);
+  tmp5 = vsubq_s16(row2, row5);
+  tmp3 = vaddq_s16(row3, row4);
+  tmp4 = vsubq_s16(row3, row4);
+
+  /* Even part */
+  tmp10 = vaddq_s16(tmp0, tmp3);
+  tmp13 = vsubq_s16(tmp0, tmp3);
+  tmp11 = vaddq_s16(tmp1, tmp2);
+  tmp12 = vsubq_s16(tmp1, tmp2);
+
+  row0 = vrshrq_n_s16(vaddq_s16(tmp10, tmp11), PASS1_BITS);
+  row4 = vrshrq_n_s16(vsubq_s16(tmp10, tmp11), PASS1_BITS);
+
+  tmp12_add_tmp13 = vaddq_s16(tmp12, tmp13);
+  z1_l = vmull_lane_s16(vget_low_s16(tmp12_add_tmp13), consts.val[0], 2);
+  z1_h = vmull_lane_s16(vget_high_s16(tmp12_add_tmp13), consts.val[0], 2);
+
+  int32x4_t row2_scaled_l =
+    vmlal_lane_s16(z1_l, vget_low_s16(tmp13), consts.val[0], 3);
+  int32x4_t row2_scaled_h =
+    vmlal_lane_s16(z1_h, vget_high_s16(tmp13), consts.val[0], 3);
+  row2 = vcombine_s16(vrshrn_n_s32(row2_scaled_l, DESCALE_P2),
+                      vrshrn_n_s32(row2_scaled_h, DESCALE_P2));
+
+  int32x4_t row6_scaled_l =
+    vmlal_lane_s16(z1_l, vget_low_s16(tmp12), consts.val[1], 3);
+  int32x4_t row6_scaled_h =
+    vmlal_lane_s16(z1_h, vget_high_s16(tmp12), consts.val[1], 3);
+  row6 = vcombine_s16(vrshrn_n_s32(row6_scaled_l, DESCALE_P2),
+                      vrshrn_n_s32(row6_scaled_h, DESCALE_P2));
+
+  /* Odd part */
+  z1 = vaddq_s16(tmp4, tmp7);
+  z2 = vaddq_s16(tmp5, tmp6);
+  z3 = vaddq_s16(tmp4, tmp6);
+  z4 = vaddq_s16(tmp5, tmp7);
+  /* sqrt(2) * c3 */
+  z5_l = vmull_lane_s16(vget_low_s16(z3), consts.val[1], 1);
+  z5_h = vmull_lane_s16(vget_high_s16(z3), consts.val[1], 1);
+  z5_l = vmlal_lane_s16(z5_l, vget_low_s16(z4), consts.val[1], 1);
+  z5_h = vmlal_lane_s16(z5_h, vget_high_s16(z4), consts.val[1], 1);
+
+  /* sqrt(2) * (-c1+c3+c5-c7) */
+  tmp4_l = vmull_lane_s16(vget_low_s16(tmp4), consts.val[0], 0);
+  tmp4_h = vmull_lane_s16(vget_high_s16(tmp4), consts.val[0], 0);
+  /* sqrt(2) * ( c1+c3-c5+c7) */
+  tmp5_l = vmull_lane_s16(vget_low_s16(tmp5), consts.val[2], 1);
+  tmp5_h = vmull_lane_s16(vget_high_s16(tmp5), consts.val[2], 1);
+  /* sqrt(2) * ( c1+c3+c5-c7) */
+  tmp6_l = vmull_lane_s16(vget_low_s16(tmp6), consts.val[2], 3);
+  tmp6_h = vmull_lane_s16(vget_high_s16(tmp6), consts.val[2], 3);
+  /* sqrt(2) * ( c1+c3-c5-c7) */
+  tmp7_l = vmull_lane_s16(vget_low_s16(tmp7), consts.val[1], 2);
+  tmp7_h = vmull_lane_s16(vget_high_s16(tmp7), consts.val[1], 2);
+
+  /* sqrt(2) * (c7-c3) */
+  z1_l = vmull_lane_s16(vget_low_s16(z1), consts.val[1], 0);
+  z1_h = vmull_lane_s16(vget_high_s16(z1), consts.val[1], 0);
+  /* sqrt(2) * (-c1-c3) */
+  z2_l = vmull_lane_s16(vget_low_s16(z2), consts.val[2], 2);
+  z2_h = vmull_lane_s16(vget_high_s16(z2), consts.val[2], 2);
+  /* sqrt(2) * (-c3-c5) */
+  z3_l = vmull_lane_s16(vget_low_s16(z3), consts.val[2], 0);
+  z3_h = vmull_lane_s16(vget_high_s16(z3), consts.val[2], 0);
+  /* sqrt(2) * (c5-c3) */
+  z4_l = vmull_lane_s16(vget_low_s16(z4), consts.val[0], 1);
+  z4_h = vmull_lane_s16(vget_high_s16(z4), consts.val[0], 1);
+
+  z3_l = vaddq_s32(z3_l, z5_l);
+  z3_h = vaddq_s32(z3_h, z5_h);
+  z4_l = vaddq_s32(z4_l, z5_l);
+  z4_h = vaddq_s32(z4_h, z5_h);
+
+  tmp4_l = vaddq_s32(tmp4_l, z1_l);
+  tmp4_h = vaddq_s32(tmp4_h, z1_h);
+  tmp4_l = vaddq_s32(tmp4_l, z3_l);
+  tmp4_h = vaddq_s32(tmp4_h, z3_h);
+  row7 = vcombine_s16(vrshrn_n_s32(tmp4_l, DESCALE_P2),
+                      vrshrn_n_s32(tmp4_h, DESCALE_P2));
+
+  tmp5_l = vaddq_s32(tmp5_l, z2_l);
+  tmp5_h = vaddq_s32(tmp5_h, z2_h);
+  tmp5_l = vaddq_s32(tmp5_l, z4_l);
+  tmp5_h = vaddq_s32(tmp5_h, z4_h);
+  row5 = vcombine_s16(vrshrn_n_s32(tmp5_l, DESCALE_P2),
+                      vrshrn_n_s32(tmp5_h, DESCALE_P2));
+
+  tmp6_l = vaddq_s32(tmp6_l, z2_l);
+  tmp6_h = vaddq_s32(tmp6_h, z2_h);
+  tmp6_l = vaddq_s32(tmp6_l, z3_l);
+  tmp6_h = vaddq_s32(tmp6_h, z3_h);
+  row3 = vcombine_s16(vrshrn_n_s32(tmp6_l, DESCALE_P2),
+                      vrshrn_n_s32(tmp6_h, DESCALE_P2));
+
+  tmp7_l = vaddq_s32(tmp7_l, z1_l);
+  tmp7_h = vaddq_s32(tmp7_h, z1_h);
+  tmp7_l = vaddq_s32(tmp7_l, z4_l);
+  tmp7_h = vaddq_s32(tmp7_h, z4_h);
+  row1 = vcombine_s16(vrshrn_n_s32(tmp7_l, DESCALE_P2),
+                      vrshrn_n_s32(tmp7_h, DESCALE_P2));
+
+  vst1q_s16(data + 0 * DCTSIZE, row0);
+  vst1q_s16(data + 1 * DCTSIZE, row1);
+  vst1q_s16(data + 2 * DCTSIZE, row2);
+  vst1q_s16(data + 3 * DCTSIZE, row3);
+  vst1q_s16(data + 4 * DCTSIZE, row4);
+  vst1q_s16(data + 5 * DCTSIZE, row5);
+  vst1q_s16(data + 6 * DCTSIZE, row6);
+  vst1q_s16(data + 7 * DCTSIZE, row7);
+}
diff --git a/media/libjpeg/simd/arm/jidctfst-neon.c b/media/libjpeg/simd/arm/jidctfst-neon.c
new file mode 100644
index 0000000000..a91be5362e
--- /dev/null
+++ b/media/libjpeg/simd/arm/jidctfst-neon.c
@@ -0,0 +1,472 @@
+/*
+ * jidctfst-neon.c - fast integer IDCT (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "align.h"
+
+#include <arm_neon.h>
+
+
+/* jsimd_idct_ifast_neon() performs dequantization and a fast, not so accurate
+ * inverse DCT (Discrete Cosine Transform) on one block of coefficients.  It
+ * uses the same calculations and produces exactly the same output as IJG's
+ * original jpeg_idct_ifast() function, which can be found in jidctfst.c.
+ *
+ * Scaled integer constants are used to avoid floating-point arithmetic:
+ *    0.082392200 =  2688 * 2^-15
+ *    0.414213562 = 13568 * 2^-15
+ *    0.847759065 = 27776 * 2^-15
+ *    0.613125930 = 20096 * 2^-15
+ *
+ * See jidctfst.c for further details of the IDCT algorithm.  Where possible,
+ * the variable names and comments here in jsimd_idct_ifast_neon() match up
+ * with those in jpeg_idct_ifast().
+ */
+
+#define PASS1_BITS  2
+
+#define F_0_082  2688
+#define F_0_414  13568
+#define F_0_847  27776
+#define F_0_613  20096
+
+
+ALIGN(16) static const int16_t jsimd_idct_ifast_neon_consts[] = {
+  F_0_082, F_0_414, F_0_847, F_0_613
+};
+
+void jsimd_idct_ifast_neon(void *dct_table, JCOEFPTR coef_block,
+                           JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  IFAST_MULT_TYPE *quantptr = dct_table;
+
+  /* Load DCT coefficients. */
+  int16x8_t row0 = vld1q_s16(coef_block + 0 * DCTSIZE);
+  int16x8_t row1 = vld1q_s16(coef_block + 1 * DCTSIZE);
+  int16x8_t row2 = vld1q_s16(coef_block + 2 * DCTSIZE);
+  int16x8_t row3 = vld1q_s16(coef_block + 3 * DCTSIZE);
+  int16x8_t row4 = vld1q_s16(coef_block + 4 * DCTSIZE);
+  int16x8_t row5 = vld1q_s16(coef_block + 5 * DCTSIZE);
+  int16x8_t row6 = vld1q_s16(coef_block + 6 * DCTSIZE);
+  int16x8_t row7 = vld1q_s16(coef_block + 7 * DCTSIZE);
+
+  /* Load quantization table values for DC coefficients. */
+  int16x8_t quant_row0 = vld1q_s16(quantptr + 0 * DCTSIZE);
+  /* Dequantize DC coefficients. */
+  row0 = vmulq_s16(row0, quant_row0);
+
+  /* Construct bitmap to test if all AC coefficients are 0. */
+  int16x8_t bitmap = vorrq_s16(row1, row2);
+  bitmap = vorrq_s16(bitmap, row3);
+  bitmap = vorrq_s16(bitmap, row4);
+  bitmap = vorrq_s16(bitmap, row5);
+  bitmap = vorrq_s16(bitmap, row6);
+  bitmap = vorrq_s16(bitmap, row7);
+
+  int64_t left_ac_bitmap = vgetq_lane_s64(vreinterpretq_s64_s16(bitmap), 0);
+  int64_t right_ac_bitmap = vgetq_lane_s64(vreinterpretq_s64_s16(bitmap), 1);
+
+  /* Load IDCT conversion constants. */
+  const int16x4_t consts = vld1_s16(jsimd_idct_ifast_neon_consts);
+
+  if (left_ac_bitmap == 0 && right_ac_bitmap == 0) {
+    /* All AC coefficients are zero.
+     * Compute DC values and duplicate into vectors.
+     */
+    int16x8_t dcval = row0;
+    row1 = dcval;
+    row2 = dcval;
+    row3 = dcval;
+    row4 = dcval;
+    row5 = dcval;
+    row6 = dcval;
+    row7 = dcval;
+  } else if (left_ac_bitmap == 0) {
+    /* AC coefficients are zero for columns 0, 1, 2, and 3.
+     * Use DC values for these columns.
+     */
+    int16x4_t dcval = vget_low_s16(row0);
+
+    /* Commence regular fast IDCT computation for columns 4, 5, 6, and 7. */
+
+    /* Load quantization table. */
+    int16x4_t quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE + 4);
+    int16x4_t quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE + 4);
+    int16x4_t quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE + 4);
+    int16x4_t quant_row4 = vld1_s16(quantptr + 4 * DCTSIZE + 4);
+    int16x4_t quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE + 4);
+    int16x4_t quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE + 4);
+    int16x4_t quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE + 4);
+
+    /* Even part: dequantize DCT coefficients. */
+    int16x4_t tmp0 = vget_high_s16(row0);
+    int16x4_t tmp1 = vmul_s16(vget_high_s16(row2), quant_row2);
+    int16x4_t tmp2 = vmul_s16(vget_high_s16(row4), quant_row4);
+    int16x4_t tmp3 = vmul_s16(vget_high_s16(row6), quant_row6);
+
+    int16x4_t tmp10 = vadd_s16(tmp0, tmp2);   /* phase 3 */
+    int16x4_t tmp11 = vsub_s16(tmp0, tmp2);
+
+    int16x4_t tmp13 = vadd_s16(tmp1, tmp3);   /* phases 5-3 */
+    int16x4_t tmp1_sub_tmp3 = vsub_s16(tmp1, tmp3);
+    int16x4_t tmp12 = vqdmulh_lane_s16(tmp1_sub_tmp3, consts, 1);
+    tmp12 = vadd_s16(tmp12, tmp1_sub_tmp3);
+    tmp12 = vsub_s16(tmp12, tmp13);
+
+    tmp0 = vadd_s16(tmp10, tmp13);            /* phase 2 */
+    tmp3 = vsub_s16(tmp10, tmp13);
+    tmp1 = vadd_s16(tmp11, tmp12);
+    tmp2 = vsub_s16(tmp11, tmp12);
+
+    /* Odd part: dequantize DCT coefficients. */
+    int16x4_t tmp4 = vmul_s16(vget_high_s16(row1), quant_row1);
+    int16x4_t tmp5 = vmul_s16(vget_high_s16(row3), quant_row3);
+    int16x4_t tmp6 = vmul_s16(vget_high_s16(row5), quant_row5);
+    int16x4_t tmp7 = vmul_s16(vget_high_s16(row7), quant_row7);
+
+    int16x4_t z13 = vadd_s16(tmp6, tmp5);     /* phase 6 */
+    int16x4_t neg_z10 = vsub_s16(tmp5, tmp6);
+    int16x4_t z11 = vadd_s16(tmp4, tmp7);
+    int16x4_t z12 = vsub_s16(tmp4, tmp7);
+
+    tmp7 = vadd_s16(z11, z13);                /* phase 5 */
+    int16x4_t z11_sub_z13 = vsub_s16(z11, z13);
+    tmp11 = vqdmulh_lane_s16(z11_sub_z13, consts, 1);
+    tmp11 = vadd_s16(tmp11, z11_sub_z13);
+
+    int16x4_t z10_add_z12 = vsub_s16(z12, neg_z10);
+    int16x4_t z5 = vqdmulh_lane_s16(z10_add_z12, consts, 2);
+    z5 = vadd_s16(z5, z10_add_z12);
+    tmp10 = vqdmulh_lane_s16(z12, consts, 0);
+    tmp10 = vadd_s16(tmp10, z12);
+    tmp10 = vsub_s16(tmp10, z5);
+    tmp12 = vqdmulh_lane_s16(neg_z10, consts, 3);
+    tmp12 = vadd_s16(tmp12, vadd_s16(neg_z10, neg_z10));
+    tmp12 = vadd_s16(tmp12, z5);
+
+    tmp6 = vsub_s16(tmp12, tmp7);             /* phase 2 */
+    tmp5 = vsub_s16(tmp11, tmp6);
+    tmp4 = vadd_s16(tmp10, tmp5);
+
+    row0 = vcombine_s16(dcval, vadd_s16(tmp0, tmp7));
+    row7 = vcombine_s16(dcval, vsub_s16(tmp0, tmp7));
+    row1 = vcombine_s16(dcval, vadd_s16(tmp1, tmp6));
+    row6 = vcombine_s16(dcval, vsub_s16(tmp1, tmp6));
+    row2 = vcombine_s16(dcval, vadd_s16(tmp2, tmp5));
+    row5 = vcombine_s16(dcval, vsub_s16(tmp2, tmp5));
+    row4 = vcombine_s16(dcval, vadd_s16(tmp3, tmp4));
+    row3 = vcombine_s16(dcval, vsub_s16(tmp3, tmp4));
+  } else if (right_ac_bitmap == 0) {
+    /* AC coefficients are zero for columns 4, 5, 6, and 7.
+     * Use DC values for these columns.
+     */
+    int16x4_t dcval = vget_high_s16(row0);
+
+    /* Commence regular fast IDCT computation for columns 0, 1, 2, and 3. */
+
+    /* Load quantization table. */
+    int16x4_t quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE);
+    int16x4_t quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE);
+    int16x4_t quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE);
+    int16x4_t quant_row4 = vld1_s16(quantptr + 4 * DCTSIZE);
+    int16x4_t quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE);
+    int16x4_t quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE);
+    int16x4_t quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE);
+
+    /* Even part: dequantize DCT coefficients. */
+    int16x4_t tmp0 = vget_low_s16(row0);
+    int16x4_t tmp1 = vmul_s16(vget_low_s16(row2), quant_row2);
+    int16x4_t tmp2 = vmul_s16(vget_low_s16(row4), quant_row4);
+    int16x4_t tmp3 = vmul_s16(vget_low_s16(row6), quant_row6);
+
+    int16x4_t tmp10 = vadd_s16(tmp0, tmp2);   /* phase 3 */
+    int16x4_t tmp11 = vsub_s16(tmp0, tmp2);
+
+    int16x4_t tmp13 = vadd_s16(tmp1, tmp3);   /* phases 5-3 */
+    int16x4_t tmp1_sub_tmp3 = vsub_s16(tmp1, tmp3);
+    int16x4_t tmp12 = vqdmulh_lane_s16(tmp1_sub_tmp3, consts, 1);
+    tmp12 = vadd_s16(tmp12, tmp1_sub_tmp3);
+    tmp12 = vsub_s16(tmp12, tmp13);
+
+    tmp0 = vadd_s16(tmp10, tmp13);            /* phase 2 */
+    tmp3 = vsub_s16(tmp10, tmp13);
+    tmp1 = vadd_s16(tmp11, tmp12);
+    tmp2 = vsub_s16(tmp11, tmp12);
+
+    /* Odd part: dequantize DCT coefficients. */
+    int16x4_t tmp4 = vmul_s16(vget_low_s16(row1), quant_row1);
+    int16x4_t tmp5 = vmul_s16(vget_low_s16(row3), quant_row3);
+    int16x4_t tmp6 = vmul_s16(vget_low_s16(row5), quant_row5);
+    int16x4_t tmp7 = vmul_s16(vget_low_s16(row7), quant_row7);
+
+    int16x4_t z13 = vadd_s16(tmp6, tmp5);     /* phase 6 */
+    int16x4_t neg_z10 = vsub_s16(tmp5, tmp6);
+    int16x4_t z11 = vadd_s16(tmp4, tmp7);
+    int16x4_t z12 = vsub_s16(tmp4, tmp7);
+
+    tmp7 = vadd_s16(z11, z13);                /* phase 5 */
+    int16x4_t z11_sub_z13 = vsub_s16(z11, z13);
+    tmp11 = vqdmulh_lane_s16(z11_sub_z13, consts, 1);
+    tmp11 = vadd_s16(tmp11, z11_sub_z13);
+
+    int16x4_t z10_add_z12 = vsub_s16(z12, neg_z10);
+    int16x4_t z5 = vqdmulh_lane_s16(z10_add_z12, consts, 2);
+    z5 = vadd_s16(z5, z10_add_z12);
+    tmp10 = vqdmulh_lane_s16(z12, consts, 0);
+    tmp10 = vadd_s16(tmp10, z12);
+    tmp10 = vsub_s16(tmp10, z5);
+    tmp12 = vqdmulh_lane_s16(neg_z10, consts, 3);
+    tmp12 = vadd_s16(tmp12, vadd_s16(neg_z10, neg_z10));
+    tmp12 = vadd_s16(tmp12, z5);
+
+    tmp6 = vsub_s16(tmp12, tmp7);             /* phase 2 */
+    tmp5 = vsub_s16(tmp11, tmp6);
+    tmp4 = vadd_s16(tmp10, tmp5);
+
+    row0 = vcombine_s16(vadd_s16(tmp0, tmp7), dcval);
+    row7 = vcombine_s16(vsub_s16(tmp0, tmp7), dcval);
+    row1 = vcombine_s16(vadd_s16(tmp1, tmp6), dcval);
+    row6 = vcombine_s16(vsub_s16(tmp1, tmp6), dcval);
+    row2 = vcombine_s16(vadd_s16(tmp2, tmp5), dcval);
+    row5 = vcombine_s16(vsub_s16(tmp2, tmp5), dcval);
+    row4 = vcombine_s16(vadd_s16(tmp3, tmp4), dcval);
+    row3 = vcombine_s16(vsub_s16(tmp3, tmp4), dcval);
+  } else {
+    /* Some AC coefficients are non-zero; full IDCT calculation required. */
+
+    /* Load quantization table. */
+    int16x8_t quant_row1 = vld1q_s16(quantptr + 1 * DCTSIZE);
+    int16x8_t quant_row2 = vld1q_s16(quantptr + 2 * DCTSIZE);
+    int16x8_t quant_row3 = vld1q_s16(quantptr + 3 * DCTSIZE);
+    int16x8_t quant_row4 = vld1q_s16(quantptr + 4 * DCTSIZE);
+    int16x8_t quant_row5 = vld1q_s16(quantptr + 5 * DCTSIZE);
+    int16x8_t quant_row6 = vld1q_s16(quantptr + 6 * DCTSIZE);
+    int16x8_t quant_row7 = vld1q_s16(quantptr + 7 * DCTSIZE);
+
+    /* Even part: dequantize DCT coefficients. */
+    int16x8_t tmp0 = row0;
+    int16x8_t tmp1 = vmulq_s16(row2, quant_row2);
+    int16x8_t tmp2 = vmulq_s16(row4, quant_row4);
+    int16x8_t tmp3 = vmulq_s16(row6, quant_row6);
+
+    int16x8_t tmp10 = vaddq_s16(tmp0, tmp2);   /* phase 3 */
+    int16x8_t tmp11 = vsubq_s16(tmp0, tmp2);
+
+    int16x8_t tmp13 = vaddq_s16(tmp1, tmp3);   /* phases 5-3 */
+    int16x8_t tmp1_sub_tmp3 = vsubq_s16(tmp1, tmp3);
+    int16x8_t tmp12 = vqdmulhq_lane_s16(tmp1_sub_tmp3, consts, 1);
+    tmp12 = vaddq_s16(tmp12, tmp1_sub_tmp3);
+    tmp12 = vsubq_s16(tmp12, tmp13);
+
+    tmp0 = vaddq_s16(tmp10, tmp13);            /* phase 2 */
+    tmp3 = vsubq_s16(tmp10, tmp13);
+    tmp1 = vaddq_s16(tmp11, tmp12);
+    tmp2 = vsubq_s16(tmp11, tmp12);
+
+    /* Odd part: dequantize DCT coefficients. */
+    int16x8_t tmp4 = vmulq_s16(row1, quant_row1);
+    int16x8_t tmp5 = vmulq_s16(row3, quant_row3);
+    int16x8_t tmp6 = vmulq_s16(row5, quant_row5);
+    int16x8_t tmp7 = vmulq_s16(row7, quant_row7);
+
+    int16x8_t z13 = vaddq_s16(tmp6, tmp5);     /* phase 6 */
+    int16x8_t neg_z10 = vsubq_s16(tmp5, tmp6);
+    int16x8_t z11 = vaddq_s16(tmp4, tmp7);
+    int16x8_t z12 = vsubq_s16(tmp4, tmp7);
+
+    tmp7 = vaddq_s16(z11, z13);                /* phase 5 */
+    int16x8_t z11_sub_z13 = vsubq_s16(z11, z13);
+    tmp11 = vqdmulhq_lane_s16(z11_sub_z13, consts, 1);
+    tmp11 = vaddq_s16(tmp11, z11_sub_z13);
+
+    int16x8_t z10_add_z12 = vsubq_s16(z12, neg_z10);
+    int16x8_t z5 = vqdmulhq_lane_s16(z10_add_z12, consts, 2);
+    z5 = vaddq_s16(z5, z10_add_z12);
+    tmp10 = vqdmulhq_lane_s16(z12, consts, 0);
+    tmp10 = vaddq_s16(tmp10, z12);
+    tmp10 = vsubq_s16(tmp10, z5);
+    tmp12 = vqdmulhq_lane_s16(neg_z10, consts, 3);
+    tmp12 = vaddq_s16(tmp12, vaddq_s16(neg_z10, neg_z10));
+    tmp12 = vaddq_s16(tmp12, z5);
+
+    tmp6 = vsubq_s16(tmp12, tmp7);             /* phase 2 */
+    tmp5 = vsubq_s16(tmp11, tmp6);
+    tmp4 = vaddq_s16(tmp10, tmp5);
+
+    row0 = vaddq_s16(tmp0, tmp7);
+    row7 = vsubq_s16(tmp0, tmp7);
+    row1 = vaddq_s16(tmp1, tmp6);
+    row6 = vsubq_s16(tmp1, tmp6);
+    row2 = vaddq_s16(tmp2, tmp5);
+    row5 = vsubq_s16(tmp2, tmp5);
+    row4 = vaddq_s16(tmp3, tmp4);
+    row3 = vsubq_s16(tmp3, tmp4);
+  }
+
+  /* Transpose rows to work on columns in pass 2. */
+  int16x8x2_t rows_01 = vtrnq_s16(row0, row1);
+  int16x8x2_t rows_23 = vtrnq_s16(row2, row3);
+  int16x8x2_t rows_45 = vtrnq_s16(row4, row5);
+  int16x8x2_t rows_67 = vtrnq_s16(row6, row7);
+
+  int32x4x2_t rows_0145_l = vtrnq_s32(vreinterpretq_s32_s16(rows_01.val[0]),
+                                      vreinterpretq_s32_s16(rows_45.val[0]));
+  int32x4x2_t rows_0145_h = vtrnq_s32(vreinterpretq_s32_s16(rows_01.val[1]),
+                                      vreinterpretq_s32_s16(rows_45.val[1]));
+  int32x4x2_t rows_2367_l = vtrnq_s32(vreinterpretq_s32_s16(rows_23.val[0]),
+                                      vreinterpretq_s32_s16(rows_67.val[0]));
+  int32x4x2_t rows_2367_h = vtrnq_s32(vreinterpretq_s32_s16(rows_23.val[1]),
+                                      vreinterpretq_s32_s16(rows_67.val[1]));
+
+  int32x4x2_t cols_04 = vzipq_s32(rows_0145_l.val[0], rows_2367_l.val[0]);
+  int32x4x2_t cols_15 = vzipq_s32(rows_0145_h.val[0], rows_2367_h.val[0]);
+  int32x4x2_t cols_26 = vzipq_s32(rows_0145_l.val[1], rows_2367_l.val[1]);
+  int32x4x2_t cols_37 = vzipq_s32(rows_0145_h.val[1], rows_2367_h.val[1]);
+
+  int16x8_t col0 = vreinterpretq_s16_s32(cols_04.val[0]);
+  int16x8_t col1 = vreinterpretq_s16_s32(cols_15.val[0]);
+  int16x8_t col2 = vreinterpretq_s16_s32(cols_26.val[0]);
+  int16x8_t col3 = vreinterpretq_s16_s32(cols_37.val[0]);
+  int16x8_t col4 = vreinterpretq_s16_s32(cols_04.val[1]);
+  int16x8_t col5 = vreinterpretq_s16_s32(cols_15.val[1]);
+  int16x8_t col6 = vreinterpretq_s16_s32(cols_26.val[1]);
+  int16x8_t col7 = vreinterpretq_s16_s32(cols_37.val[1]);
+
+  /* 1-D IDCT, pass 2 */
+
+  /* Even part */
+  int16x8_t tmp10 = vaddq_s16(col0, col4);
+  int16x8_t tmp11 = vsubq_s16(col0, col4);
+
+  int16x8_t tmp13 = vaddq_s16(col2, col6);
+  int16x8_t col2_sub_col6 = vsubq_s16(col2, col6);
+  int16x8_t tmp12 = vqdmulhq_lane_s16(col2_sub_col6, consts, 1);
+  tmp12 = vaddq_s16(tmp12, col2_sub_col6);
+  tmp12 = vsubq_s16(tmp12, tmp13);
+
+  int16x8_t tmp0 = vaddq_s16(tmp10, tmp13);
+  int16x8_t tmp3 = vsubq_s16(tmp10, tmp13);
+  int16x8_t tmp1 = vaddq_s16(tmp11, tmp12);
+  int16x8_t tmp2 = vsubq_s16(tmp11, tmp12);
+
+  /* Odd part */
+  int16x8_t z13 = vaddq_s16(col5, col3);
+  int16x8_t neg_z10 = vsubq_s16(col3, col5);
+  int16x8_t z11 = vaddq_s16(col1, col7);
+  int16x8_t z12 = vsubq_s16(col1, col7);
+
+  int16x8_t tmp7 = vaddq_s16(z11, z13);      /* phase 5 */
+  int16x8_t z11_sub_z13 = vsubq_s16(z11, z13);
+  tmp11 = vqdmulhq_lane_s16(z11_sub_z13, consts, 1);
+  tmp11 = vaddq_s16(tmp11, z11_sub_z13);
+
+  int16x8_t z10_add_z12 = vsubq_s16(z12, neg_z10);
+  int16x8_t z5 = vqdmulhq_lane_s16(z10_add_z12, consts, 2);
+  z5 = vaddq_s16(z5, z10_add_z12);
+  tmp10 = vqdmulhq_lane_s16(z12, consts, 0);
+  tmp10 = vaddq_s16(tmp10, z12);
+  tmp10 = vsubq_s16(tmp10, z5);
+  tmp12 = vqdmulhq_lane_s16(neg_z10, consts, 3);
+  tmp12 = vaddq_s16(tmp12, vaddq_s16(neg_z10, neg_z10));
+  tmp12 = vaddq_s16(tmp12, z5);
+
+  int16x8_t tmp6 = vsubq_s16(tmp12, tmp7);   /* phase 2 */
+  int16x8_t tmp5 = vsubq_s16(tmp11, tmp6);
+  int16x8_t tmp4 = vaddq_s16(tmp10, tmp5);
+
+  col0 = vaddq_s16(tmp0, tmp7);
+  col7 = vsubq_s16(tmp0, tmp7);
+  col1 = vaddq_s16(tmp1, tmp6);
+  col6 = vsubq_s16(tmp1, tmp6);
+  col2 = vaddq_s16(tmp2, tmp5);
+  col5 = vsubq_s16(tmp2, tmp5);
+  col4 = vaddq_s16(tmp3, tmp4);
+  col3 = vsubq_s16(tmp3, tmp4);
+
+  /* Scale down by a factor of 8, narrowing to 8-bit. */
+  int8x16_t cols_01_s8 = vcombine_s8(vqshrn_n_s16(col0, PASS1_BITS + 3),
+                                     vqshrn_n_s16(col1, PASS1_BITS + 3));
+  int8x16_t cols_45_s8 = vcombine_s8(vqshrn_n_s16(col4, PASS1_BITS + 3),
+                                     vqshrn_n_s16(col5, PASS1_BITS + 3));
+  int8x16_t cols_23_s8 = vcombine_s8(vqshrn_n_s16(col2, PASS1_BITS + 3),
+                                     vqshrn_n_s16(col3, PASS1_BITS + 3));
+  int8x16_t cols_67_s8 = vcombine_s8(vqshrn_n_s16(col6, PASS1_BITS + 3),
+                                     vqshrn_n_s16(col7, PASS1_BITS + 3));
+  /* Clamp to range [0-255]. */
+  uint8x16_t cols_01 =
+    vreinterpretq_u8_s8
+      (vaddq_s8(cols_01_s8, vreinterpretq_s8_u8(vdupq_n_u8(CENTERJSAMPLE))));
+  uint8x16_t cols_45 =
+    vreinterpretq_u8_s8
+      (vaddq_s8(cols_45_s8, vreinterpretq_s8_u8(vdupq_n_u8(CENTERJSAMPLE))));
+  uint8x16_t cols_23 =
+    vreinterpretq_u8_s8
+      (vaddq_s8(cols_23_s8, vreinterpretq_s8_u8(vdupq_n_u8(CENTERJSAMPLE))));
+  uint8x16_t cols_67 =
+    vreinterpretq_u8_s8
+      (vaddq_s8(cols_67_s8, vreinterpretq_s8_u8(vdupq_n_u8(CENTERJSAMPLE))));
+
+  /* Transpose block to prepare for store. */
+  uint32x4x2_t cols_0415 = vzipq_u32(vreinterpretq_u32_u8(cols_01),
+                                     vreinterpretq_u32_u8(cols_45));
+  uint32x4x2_t cols_2637 = vzipq_u32(vreinterpretq_u32_u8(cols_23),
+                                     vreinterpretq_u32_u8(cols_67));
+
+  uint8x16x2_t cols_0145 = vtrnq_u8(vreinterpretq_u8_u32(cols_0415.val[0]),
+                                    vreinterpretq_u8_u32(cols_0415.val[1]));
+  uint8x16x2_t cols_2367 = vtrnq_u8(vreinterpretq_u8_u32(cols_2637.val[0]),
+                                    vreinterpretq_u8_u32(cols_2637.val[1]));
+  uint16x8x2_t rows_0426 = vtrnq_u16(vreinterpretq_u16_u8(cols_0145.val[0]),
+                                     vreinterpretq_u16_u8(cols_2367.val[0]));
+  uint16x8x2_t rows_1537 = vtrnq_u16(vreinterpretq_u16_u8(cols_0145.val[1]),
+                                     vreinterpretq_u16_u8(cols_2367.val[1]));
+
+  uint8x16_t rows_04 = vreinterpretq_u8_u16(rows_0426.val[0]);
+  uint8x16_t rows_15 = vreinterpretq_u8_u16(rows_1537.val[0]);
+  uint8x16_t rows_26 = vreinterpretq_u8_u16(rows_0426.val[1]);
+  uint8x16_t rows_37 = vreinterpretq_u8_u16(rows_1537.val[1]);
+
+  JSAMPROW outptr0 = output_buf[0] + output_col;
+  JSAMPROW outptr1 = output_buf[1] + output_col;
+  JSAMPROW outptr2 = output_buf[2] + output_col;
+  JSAMPROW outptr3 = output_buf[3] + output_col;
+  JSAMPROW outptr4 = output_buf[4] + output_col;
+  JSAMPROW outptr5 = output_buf[5] + output_col;
+  JSAMPROW outptr6 = output_buf[6] + output_col;
+  JSAMPROW outptr7 = output_buf[7] + output_col;
+
+  /* Store DCT block to memory. */
+  vst1q_lane_u64((uint64_t *)outptr0, vreinterpretq_u64_u8(rows_04), 0);
+  vst1q_lane_u64((uint64_t *)outptr1, vreinterpretq_u64_u8(rows_15), 0);
+  vst1q_lane_u64((uint64_t *)outptr2, vreinterpretq_u64_u8(rows_26), 0);
+  vst1q_lane_u64((uint64_t *)outptr3, vreinterpretq_u64_u8(rows_37), 0);
+  vst1q_lane_u64((uint64_t *)outptr4, vreinterpretq_u64_u8(rows_04), 1);
+  vst1q_lane_u64((uint64_t *)outptr5, vreinterpretq_u64_u8(rows_15), 1);
+  vst1q_lane_u64((uint64_t *)outptr6, vreinterpretq_u64_u8(rows_26), 1);
+  vst1q_lane_u64((uint64_t *)outptr7, vreinterpretq_u64_u8(rows_37), 1);
+}
diff --git a/media/libjpeg/simd/arm/jidctint-neon.c b/media/libjpeg/simd/arm/jidctint-neon.c
new file mode 100644
index 0000000000..d25112ef7f
--- /dev/null
+++ b/media/libjpeg/simd/arm/jidctint-neon.c
@@ -0,0 +1,801 @@
+/*
+ * jidctint-neon.c - accurate integer IDCT (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ * Copyright (C) 2020, D. R. Commander.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "align.h"
+#include "neon-compat.h"
+
+#include <arm_neon.h>
+
+
+#define CONST_BITS  13
+#define PASS1_BITS  2
+
+#define DESCALE_P1  (CONST_BITS - PASS1_BITS)
+#define DESCALE_P2  (CONST_BITS + PASS1_BITS + 3)
+
+/* The computation of the inverse DCT requires the use of constants known at
+ * compile time.  Scaled integer constants are used to avoid floating-point
+ * arithmetic:
+ *    0.298631336 =  2446 * 2^-13
+ *    0.390180644 =  3196 * 2^-13
+ *    0.541196100 =  4433 * 2^-13
+ *    0.765366865 =  6270 * 2^-13
+ *    0.899976223 =  7373 * 2^-13
+ *    1.175875602 =  9633 * 2^-13
+ *    1.501321110 = 12299 * 2^-13
+ *    1.847759065 = 15137 * 2^-13
+ *    1.961570560 = 16069 * 2^-13
+ *    2.053119869 = 16819 * 2^-13
+ *    2.562915447 = 20995 * 2^-13
+ *    3.072711026 = 25172 * 2^-13
+ */
+
+#define F_0_298  2446
+#define F_0_390  3196
+#define F_0_541  4433
+#define F_0_765  6270
+#define F_0_899  7373
+#define F_1_175  9633
+#define F_1_501  12299
+#define F_1_847  15137
+#define F_1_961  16069
+#define F_2_053  16819
+#define F_2_562  20995
+#define F_3_072  25172
+
+#define F_1_175_MINUS_1_961  (F_1_175 - F_1_961)
+#define F_1_175_MINUS_0_390  (F_1_175 - F_0_390)
+#define F_0_541_MINUS_1_847  (F_0_541 - F_1_847)
+#define F_3_072_MINUS_2_562  (F_3_072 - F_2_562)
+#define F_0_298_MINUS_0_899  (F_0_298 - F_0_899)
+#define F_1_501_MINUS_0_899  (F_1_501 - F_0_899)
+#define F_2_053_MINUS_2_562  (F_2_053 - F_2_562)
+#define F_0_541_PLUS_0_765   (F_0_541 + F_0_765)
+
+
+ALIGN(16) static const int16_t jsimd_idct_islow_neon_consts[] = {
+  F_0_899,             F_0_541,
+  F_2_562,             F_0_298_MINUS_0_899,
+  F_1_501_MINUS_0_899, F_2_053_MINUS_2_562,
+  F_0_541_PLUS_0_765,  F_1_175,
+  F_1_175_MINUS_0_390, F_0_541_MINUS_1_847,
+  F_3_072_MINUS_2_562, F_1_175_MINUS_1_961,
+  0, 0, 0, 0
+};
+
+
+/* Forward declaration of regular and sparse IDCT helper functions */
+
+static INLINE void jsimd_idct_islow_pass1_regular(int16x4_t row0,
+                                                  int16x4_t row1,
+                                                  int16x4_t row2,
+                                                  int16x4_t row3,
+                                                  int16x4_t row4,
+                                                  int16x4_t row5,
+                                                  int16x4_t row6,
+                                                  int16x4_t row7,
+                                                  int16x4_t quant_row0,
+                                                  int16x4_t quant_row1,
+                                                  int16x4_t quant_row2,
+                                                  int16x4_t quant_row3,
+                                                  int16x4_t quant_row4,
+                                                  int16x4_t quant_row5,
+                                                  int16x4_t quant_row6,
+                                                  int16x4_t quant_row7,
+                                                  int16_t *workspace_1,
+                                                  int16_t *workspace_2);
+
+static INLINE void jsimd_idct_islow_pass1_sparse(int16x4_t row0,
+                                                 int16x4_t row1,
+                                                 int16x4_t row2,
+                                                 int16x4_t row3,
+                                                 int16x4_t quant_row0,
+                                                 int16x4_t quant_row1,
+                                                 int16x4_t quant_row2,
+                                                 int16x4_t quant_row3,
+                                                 int16_t *workspace_1,
+                                                 int16_t *workspace_2);
+
+static INLINE void jsimd_idct_islow_pass2_regular(int16_t *workspace,
+                                                  JSAMPARRAY output_buf,
+                                                  JDIMENSION output_col,
+                                                  unsigned buf_offset);
+
+static INLINE void jsimd_idct_islow_pass2_sparse(int16_t *workspace,
+                                                 JSAMPARRAY output_buf,
+                                                 JDIMENSION output_col,
+                                                 unsigned buf_offset);
+
+
+/* Perform dequantization and inverse DCT on one block of coefficients.  For
+ * reference, the C implementation (jpeg_idct_slow()) can be found in
+ * jidctint.c.
+ *
+ * Optimization techniques used for fast data access:
+ *
+ * In each pass, the inverse DCT is computed for the left and right 4x8 halves
+ * of the DCT block.  This avoids spilling due to register pressure, and the
+ * increased granularity allows for an optimized calculation depending on the
+ * values of the DCT coefficients.  Between passes, intermediate data is stored
+ * in 4x8 workspace buffers.
+ *
+ * Transposing the 8x8 DCT block after each pass can be achieved by transposing
+ * each of the four 4x4 quadrants and swapping quadrants 1 and 2 (refer to the
+ * diagram below.)  Swapping quadrants is cheap, since the second pass can just
+ * swap the workspace buffer pointers.
+ *
+ *      +-------+-------+                   +-------+-------+
+ *      |       |       |                   |       |       |
+ *      |   0   |   1   |                   |   0   |   2   |
+ *      |       |       |    transpose      |       |       |
+ *      +-------+-------+     ------>       +-------+-------+
+ *      |       |       |                   |       |       |
+ *      |   2   |   3   |                   |   1   |   3   |
+ *      |       |       |                   |       |       |
+ *      +-------+-------+                   +-------+-------+
+ *
+ * Optimization techniques used to accelerate the inverse DCT calculation:
+ *
+ * In a DCT coefficient block, the coefficients are increasingly likely to be 0
+ * as you move diagonally from top left to bottom right.  If whole rows of
+ * coefficients are 0, then the inverse DCT calculation can be simplified.  On
+ * the first pass of the inverse DCT, we test for three special cases before
+ * defaulting to a full "regular" inverse DCT:
+ *
+ * 1) Coefficients in rows 4-7 are all zero.  In this case, we perform a
+ *    "sparse" simplified inverse DCT on rows 0-3.
+ * 2) AC coefficients (rows 1-7) are all zero.  In this case, the inverse DCT
+ *    result is equal to the dequantized DC coefficients.
+ * 3) AC and DC coefficients are all zero.  In this case, the inverse DCT
+ *    result is all zero.  For the left 4x8 half, this is handled identically
+ *    to Case 2 above.  For the right 4x8 half, we do no work and signal that
+ *    the "sparse" algorithm is required for the second pass.
+ *
+ * In the second pass, only a single special case is tested: whether the AC and
+ * DC coefficients were all zero in the right 4x8 block during the first pass
+ * (refer to Case 3 above.)  If this is the case, then a "sparse" variant of
+ * the second pass is performed for both the left and right halves of the DCT
+ * block.  (The transposition after the first pass means that the right 4x8
+ * block during the first pass becomes rows 4-7 during the second pass.)
+ */
+
+void jsimd_idct_islow_neon(void *dct_table, JCOEFPTR coef_block,
+                           JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  ISLOW_MULT_TYPE *quantptr = dct_table;
+
+  int16_t workspace_l[8 * DCTSIZE / 2];
+  int16_t workspace_r[8 * DCTSIZE / 2];
+
+  /* Compute IDCT first pass on left 4x8 coefficient block. */
+
+  /* Load DCT coefficients in left 4x8 block. */
+  int16x4_t row0 = vld1_s16(coef_block + 0 * DCTSIZE);
+  int16x4_t row1 = vld1_s16(coef_block + 1 * DCTSIZE);
+  int16x4_t row2 = vld1_s16(coef_block + 2 * DCTSIZE);
+  int16x4_t row3 = vld1_s16(coef_block + 3 * DCTSIZE);
+  int16x4_t row4 = vld1_s16(coef_block + 4 * DCTSIZE);
+  int16x4_t row5 = vld1_s16(coef_block + 5 * DCTSIZE);
+  int16x4_t row6 = vld1_s16(coef_block + 6 * DCTSIZE);
+  int16x4_t row7 = vld1_s16(coef_block + 7 * DCTSIZE);
+
+  /* Load quantization table for left 4x8 block. */
+  int16x4_t quant_row0 = vld1_s16(quantptr + 0 * DCTSIZE);
+  int16x4_t quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE);
+  int16x4_t quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE);
+  int16x4_t quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE);
+  int16x4_t quant_row4 = vld1_s16(quantptr + 4 * DCTSIZE);
+  int16x4_t quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE);
+  int16x4_t quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE);
+  int16x4_t quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE);
+
+  /* Construct bitmap to test if DCT coefficients in left 4x8 block are 0. */
+  int16x4_t bitmap = vorr_s16(row7, row6);
+  bitmap = vorr_s16(bitmap, row5);
+  bitmap = vorr_s16(bitmap, row4);
+  int64_t bitmap_rows_4567 = vget_lane_s64(vreinterpret_s64_s16(bitmap), 0);
+
+  if (bitmap_rows_4567 == 0) {
+    bitmap = vorr_s16(bitmap, row3);
+    bitmap = vorr_s16(bitmap, row2);
+    bitmap = vorr_s16(bitmap, row1);
+    int64_t left_ac_bitmap = vget_lane_s64(vreinterpret_s64_s16(bitmap), 0);
+
+    if (left_ac_bitmap == 0) {
+      int16x4_t dcval = vshl_n_s16(vmul_s16(row0, quant_row0), PASS1_BITS);
+      int16x4x4_t quadrant = { { dcval, dcval, dcval, dcval } };
+      /* Store 4x4 blocks to workspace, transposing in the process. */
+      vst4_s16(workspace_l, quadrant);
+      vst4_s16(workspace_r, quadrant);
+    } else {
+      jsimd_idct_islow_pass1_sparse(row0, row1, row2, row3, quant_row0,
+                                    quant_row1, quant_row2, quant_row3,
+                                    workspace_l, workspace_r);
+    }
+  } else {
+    jsimd_idct_islow_pass1_regular(row0, row1, row2, row3, row4, row5,
+                                   row6, row7, quant_row0, quant_row1,
+                                   quant_row2, quant_row3, quant_row4,
+                                   quant_row5, quant_row6, quant_row7,
+                                   workspace_l, workspace_r);
+  }
+
+  /* Compute IDCT first pass on right 4x8 coefficient block. */
+
+  /* Load DCT coefficients in right 4x8 block. */
+  row0 = vld1_s16(coef_block + 0 * DCTSIZE + 4);
+  row1 = vld1_s16(coef_block + 1 * DCTSIZE + 4);
+  row2 = vld1_s16(coef_block + 2 * DCTSIZE + 4);
+  row3 = vld1_s16(coef_block + 3 * DCTSIZE + 4);
+  row4 = vld1_s16(coef_block + 4 * DCTSIZE + 4);
+  row5 = vld1_s16(coef_block + 5 * DCTSIZE + 4);
+  row6 = vld1_s16(coef_block + 6 * DCTSIZE + 4);
+  row7 = vld1_s16(coef_block + 7 * DCTSIZE + 4);
+
+  /* Load quantization table for right 4x8 block. */
+  quant_row0 = vld1_s16(quantptr + 0 * DCTSIZE + 4);
+  quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE + 4);
+  quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE + 4);
+  quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE + 4);
+  quant_row4 = vld1_s16(quantptr + 4 * DCTSIZE + 4);
+  quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE + 4);
+  quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE + 4);
+  quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE + 4);
+
+  /* Construct bitmap to test if DCT coefficients in right 4x8 block are 0. */
+  bitmap = vorr_s16(row7, row6);
+  bitmap = vorr_s16(bitmap, row5);
+  bitmap = vorr_s16(bitmap, row4);
+  bitmap_rows_4567 = vget_lane_s64(vreinterpret_s64_s16(bitmap), 0);
+  bitmap = vorr_s16(bitmap, row3);
+  bitmap = vorr_s16(bitmap, row2);
+  bitmap = vorr_s16(bitmap, row1);
+  int64_t right_ac_bitmap = vget_lane_s64(vreinterpret_s64_s16(bitmap), 0);
+
+  /* If this remains non-zero, a "regular" second pass will be performed. */
+  int64_t right_ac_dc_bitmap = 1;
+
+  if (right_ac_bitmap == 0) {
+    bitmap = vorr_s16(bitmap, row0);
+    right_ac_dc_bitmap = vget_lane_s64(vreinterpret_s64_s16(bitmap), 0);
+
+    if (right_ac_dc_bitmap != 0) {
+      int16x4_t dcval = vshl_n_s16(vmul_s16(row0, quant_row0), PASS1_BITS);
+      int16x4x4_t quadrant = { { dcval, dcval, dcval, dcval } };
+      /* Store 4x4 blocks to workspace, transposing in the process. */
+      vst4_s16(workspace_l + 4 * DCTSIZE / 2, quadrant);
+      vst4_s16(workspace_r + 4 * DCTSIZE / 2, quadrant);
+    }
+  } else {
+    if (bitmap_rows_4567 == 0) {
+      jsimd_idct_islow_pass1_sparse(row0, row1, row2, row3, quant_row0,
+                                    quant_row1, quant_row2, quant_row3,
+                                    workspace_l + 4 * DCTSIZE / 2,
+                                    workspace_r + 4 * DCTSIZE / 2);
+    } else {
+      jsimd_idct_islow_pass1_regular(row0, row1, row2, row3, row4, row5,
+                                     row6, row7, quant_row0, quant_row1,
+                                     quant_row2, quant_row3, quant_row4,
+                                     quant_row5, quant_row6, quant_row7,
+                                     workspace_l + 4 * DCTSIZE / 2,
+                                     workspace_r + 4 * DCTSIZE / 2);
+    }
+  }
+
+  /* Second pass: compute IDCT on rows in workspace. */
+
+  /* If all coefficients in right 4x8 block are 0, use "sparse" second pass. */
+  if (right_ac_dc_bitmap == 0) {
+    jsimd_idct_islow_pass2_sparse(workspace_l, output_buf, output_col, 0);
+    jsimd_idct_islow_pass2_sparse(workspace_r, output_buf, output_col, 4);
+  } else {
+    jsimd_idct_islow_pass2_regular(workspace_l, output_buf, output_col, 0);
+    jsimd_idct_islow_pass2_regular(workspace_r, output_buf, output_col, 4);
+  }
+}
+
+
+/* Perform dequantization and the first pass of the accurate inverse DCT on a
+ * 4x8 block of coefficients.  (To process the full 8x8 DCT block, this
+ * function-- or some other optimized variant-- needs to be called for both the
+ * left and right 4x8 blocks.)
+ *
+ * This "regular" version assumes that no optimization can be made to the IDCT
+ * calculation, since no useful set of AC coefficients is all 0.
+ *
+ * The original C implementation of the accurate IDCT (jpeg_idct_slow()) can be
+ * found in jidctint.c.  Algorithmic changes made here are documented inline.
+ */
+
+static INLINE void jsimd_idct_islow_pass1_regular(int16x4_t row0,
+                                                  int16x4_t row1,
+                                                  int16x4_t row2,
+                                                  int16x4_t row3,
+                                                  int16x4_t row4,
+                                                  int16x4_t row5,
+                                                  int16x4_t row6,
+                                                  int16x4_t row7,
+                                                  int16x4_t quant_row0,
+                                                  int16x4_t quant_row1,
+                                                  int16x4_t quant_row2,
+                                                  int16x4_t quant_row3,
+                                                  int16x4_t quant_row4,
+                                                  int16x4_t quant_row5,
+                                                  int16x4_t quant_row6,
+                                                  int16x4_t quant_row7,
+                                                  int16_t *workspace_1,
+                                                  int16_t *workspace_2)
+{
+  /* Load constants for IDCT computation. */
+#ifdef HAVE_VLD1_S16_X3
+  const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_islow_neon_consts);
+#else
+  const int16x4_t consts1 = vld1_s16(jsimd_idct_islow_neon_consts);
+  const int16x4_t consts2 = vld1_s16(jsimd_idct_islow_neon_consts + 4);
+  const int16x4_t consts3 = vld1_s16(jsimd_idct_islow_neon_consts + 8);
+  const int16x4x3_t consts = { { consts1, consts2, consts3 } };
+#endif
+
+  /* Even part */
+  int16x4_t z2_s16 = vmul_s16(row2, quant_row2);
+  int16x4_t z3_s16 = vmul_s16(row6, quant_row6);
+
+  int32x4_t tmp2 = vmull_lane_s16(z2_s16, consts.val[0], 1);
+  int32x4_t tmp3 = vmull_lane_s16(z2_s16, consts.val[1], 2);
+  tmp2 = vmlal_lane_s16(tmp2, z3_s16, consts.val[2], 1);
+  tmp3 = vmlal_lane_s16(tmp3, z3_s16, consts.val[0], 1);
+
+  z2_s16 = vmul_s16(row0, quant_row0);
+  z3_s16 = vmul_s16(row4, quant_row4);
+
+  int32x4_t tmp0 = vshll_n_s16(vadd_s16(z2_s16, z3_s16), CONST_BITS);
+  int32x4_t tmp1 = vshll_n_s16(vsub_s16(z2_s16, z3_s16), CONST_BITS);
+
+  int32x4_t tmp10 = vaddq_s32(tmp0, tmp3);
+  int32x4_t tmp13 = vsubq_s32(tmp0, tmp3);
+  int32x4_t tmp11 = vaddq_s32(tmp1, tmp2);
+  int32x4_t tmp12 = vsubq_s32(tmp1, tmp2);
+
+  /* Odd part */
+  int16x4_t tmp0_s16 = vmul_s16(row7, quant_row7);
+  int16x4_t tmp1_s16 = vmul_s16(row5, quant_row5);
+  int16x4_t tmp2_s16 = vmul_s16(row3, quant_row3);
+  int16x4_t tmp3_s16 = vmul_s16(row1, quant_row1);
+
+  z3_s16 = vadd_s16(tmp0_s16, tmp2_s16);
+  int16x4_t z4_s16 = vadd_s16(tmp1_s16, tmp3_s16);
+
+  /* Implementation as per jpeg_idct_islow() in jidctint.c:
+   *   z5 = (z3 + z4) * 1.175875602;
+   *   z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+   *   z3 += z5;  z4 += z5;
+   *
+   * This implementation:
+   *   z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+   *   z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+   */
+
+  int32x4_t z3 = vmull_lane_s16(z3_s16, consts.val[2], 3);
+  int32x4_t z4 = vmull_lane_s16(z3_s16, consts.val[1], 3);
+  z3 = vmlal_lane_s16(z3, z4_s16, consts.val[1], 3);
+  z4 = vmlal_lane_s16(z4, z4_s16, consts.val[2], 0);
+
+  /* Implementation as per jpeg_idct_islow() in jidctint.c:
+   *   z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;
+   *   tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;
+   *   tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;
+   *   z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+   *   tmp0 += z1 + z3;  tmp1 += z2 + z4;
+   *   tmp2 += z2 + z3;  tmp3 += z1 + z4;
+   *
+   * This implementation:
+   *   tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+   *   tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+   *   tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+   *   tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+   *   tmp0 += z3;  tmp1 += z4;
+   *   tmp2 += z3;  tmp3 += z4;
+   */
+
+  tmp0 = vmull_lane_s16(tmp0_s16, consts.val[0], 3);
+  tmp1 = vmull_lane_s16(tmp1_s16, consts.val[1], 1);
+  tmp2 = vmull_lane_s16(tmp2_s16, consts.val[2], 2);
+  tmp3 = vmull_lane_s16(tmp3_s16, consts.val[1], 0);
+
+  tmp0 = vmlsl_lane_s16(tmp0, tmp3_s16, consts.val[0], 0);
+  tmp1 = vmlsl_lane_s16(tmp1, tmp2_s16, consts.val[0], 2);
+  tmp2 = vmlsl_lane_s16(tmp2, tmp1_s16, consts.val[0], 2);
+  tmp3 = vmlsl_lane_s16(tmp3, tmp0_s16, consts.val[0], 0);
+
+  tmp0 = vaddq_s32(tmp0, z3);
+  tmp1 = vaddq_s32(tmp1, z4);
+  tmp2 = vaddq_s32(tmp2, z3);
+  tmp3 = vaddq_s32(tmp3, z4);
+
+  /* Final output stage: descale and narrow to 16-bit. */
+  int16x4x4_t rows_0123 = { {
+    vrshrn_n_s32(vaddq_s32(tmp10, tmp3), DESCALE_P1),
+    vrshrn_n_s32(vaddq_s32(tmp11, tmp2), DESCALE_P1),
+    vrshrn_n_s32(vaddq_s32(tmp12, tmp1), DESCALE_P1),
+    vrshrn_n_s32(vaddq_s32(tmp13, tmp0), DESCALE_P1)
+  } };
+  int16x4x4_t rows_4567 = { {
+    vrshrn_n_s32(vsubq_s32(tmp13, tmp0), DESCALE_P1),
+    vrshrn_n_s32(vsubq_s32(tmp12, tmp1), DESCALE_P1),
+    vrshrn_n_s32(vsubq_s32(tmp11, tmp2), DESCALE_P1),
+    vrshrn_n_s32(vsubq_s32(tmp10, tmp3), DESCALE_P1)
+  } };
+
+  /* Store 4x4 blocks to the intermediate workspace, ready for the second pass.
+   * (VST4 transposes the blocks.  We need to operate on rows in the next
+   * pass.)
+   */
+  vst4_s16(workspace_1, rows_0123);
+  vst4_s16(workspace_2, rows_4567);
+}
+
+
+/* Perform dequantization and the first pass of the accurate inverse DCT on a
+ * 4x8 block of coefficients.
+ *
+ * This "sparse" version assumes that the AC coefficients in rows 4-7 are all
+ * 0.  This simplifies the IDCT calculation, accelerating overall performance.
+ */
+
+static INLINE void jsimd_idct_islow_pass1_sparse(int16x4_t row0,
+                                                 int16x4_t row1,
+                                                 int16x4_t row2,
+                                                 int16x4_t row3,
+                                                 int16x4_t quant_row0,
+                                                 int16x4_t quant_row1,
+                                                 int16x4_t quant_row2,
+                                                 int16x4_t quant_row3,
+                                                 int16_t *workspace_1,
+                                                 int16_t *workspace_2)
+{
+  /* Load constants for IDCT computation. */
+#ifdef HAVE_VLD1_S16_X3
+  const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_islow_neon_consts);
+#else
+  const int16x4_t consts1 = vld1_s16(jsimd_idct_islow_neon_consts);
+  const int16x4_t consts2 = vld1_s16(jsimd_idct_islow_neon_consts + 4);
+  const int16x4_t consts3 = vld1_s16(jsimd_idct_islow_neon_consts + 8);
+  const int16x4x3_t consts = { { consts1, consts2, consts3 } };
+#endif
+
+  /* Even part (z3 is all 0) */
+  int16x4_t z2_s16 = vmul_s16(row2, quant_row2);
+
+  int32x4_t tmp2 = vmull_lane_s16(z2_s16, consts.val[0], 1);
+  int32x4_t tmp3 = vmull_lane_s16(z2_s16, consts.val[1], 2);
+
+  z2_s16 = vmul_s16(row0, quant_row0);
+  int32x4_t tmp0 = vshll_n_s16(z2_s16, CONST_BITS);
+  int32x4_t tmp1 = vshll_n_s16(z2_s16, CONST_BITS);
+
+  int32x4_t tmp10 = vaddq_s32(tmp0, tmp3);
+  int32x4_t tmp13 = vsubq_s32(tmp0, tmp3);
+  int32x4_t tmp11 = vaddq_s32(tmp1, tmp2);
+  int32x4_t tmp12 = vsubq_s32(tmp1, tmp2);
+
+  /* Odd part (tmp0 and tmp1 are both all 0) */
+  int16x4_t tmp2_s16 = vmul_s16(row3, quant_row3);
+  int16x4_t tmp3_s16 = vmul_s16(row1, quant_row1);
+
+  int16x4_t z3_s16 = tmp2_s16;
+  int16x4_t z4_s16 = tmp3_s16;
+
+  int32x4_t z3 = vmull_lane_s16(z3_s16, consts.val[2], 3);
+  int32x4_t z4 = vmull_lane_s16(z3_s16, consts.val[1], 3);
+  z3 = vmlal_lane_s16(z3, z4_s16, consts.val[1], 3);
+  z4 = vmlal_lane_s16(z4, z4_s16, consts.val[2], 0);
+
+  tmp0 = vmlsl_lane_s16(z3, tmp3_s16, consts.val[0], 0);
+  tmp1 = vmlsl_lane_s16(z4, tmp2_s16, consts.val[0], 2);
+  tmp2 = vmlal_lane_s16(z3, tmp2_s16, consts.val[2], 2);
+  tmp3 = vmlal_lane_s16(z4, tmp3_s16, consts.val[1], 0);
+
+  /* Final output stage: descale and narrow to 16-bit. */
+  int16x4x4_t rows_0123 = { {
+    vrshrn_n_s32(vaddq_s32(tmp10, tmp3), DESCALE_P1),
+    vrshrn_n_s32(vaddq_s32(tmp11, tmp2), DESCALE_P1),
+    vrshrn_n_s32(vaddq_s32(tmp12, tmp1), DESCALE_P1),
+    vrshrn_n_s32(vaddq_s32(tmp13, tmp0), DESCALE_P1)
+  } };
+  int16x4x4_t rows_4567 = { {
+    vrshrn_n_s32(vsubq_s32(tmp13, tmp0), DESCALE_P1),
+    vrshrn_n_s32(vsubq_s32(tmp12, tmp1), DESCALE_P1),
+    vrshrn_n_s32(vsubq_s32(tmp11, tmp2), DESCALE_P1),
+    vrshrn_n_s32(vsubq_s32(tmp10, tmp3), DESCALE_P1)
+  } };
+
+  /* Store 4x4 blocks to the intermediate workspace, ready for the second pass.
+   * (VST4 transposes the blocks.  We need to operate on rows in the next
+   * pass.)
+   */
+  vst4_s16(workspace_1, rows_0123);
+  vst4_s16(workspace_2, rows_4567);
+}
+
+
+/* Perform the second pass of the accurate inverse DCT on a 4x8 block of
+ * coefficients.  (To process the full 8x8 DCT block, this function-- or some
+ * other optimized variant-- needs to be called for both the right and left 4x8
+ * blocks.)
+ *
+ * This "regular" version assumes that no optimization can be made to the IDCT
+ * calculation, since no useful set of coefficient values are all 0 after the
+ * first pass.
+ *
+ * Again, the original C implementation of the accurate IDCT (jpeg_idct_slow())
+ * can be found in jidctint.c.  Algorithmic changes made here are documented
+ * inline.
+ */
+
+static INLINE void jsimd_idct_islow_pass2_regular(int16_t *workspace,
+                                                  JSAMPARRAY output_buf,
+                                                  JDIMENSION output_col,
+                                                  unsigned buf_offset)
+{
+  /* Load constants for IDCT computation. */
+#ifdef HAVE_VLD1_S16_X3
+  const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_islow_neon_consts);
+#else
+  const int16x4_t consts1 = vld1_s16(jsimd_idct_islow_neon_consts);
+  const int16x4_t consts2 = vld1_s16(jsimd_idct_islow_neon_consts + 4);
+  const int16x4_t consts3 = vld1_s16(jsimd_idct_islow_neon_consts + 8);
+  const int16x4x3_t consts = { { consts1, consts2, consts3 } };
+#endif
+
+  /* Even part */
+  int16x4_t z2_s16 = vld1_s16(workspace + 2 * DCTSIZE / 2);
+  int16x4_t z3_s16 = vld1_s16(workspace + 6 * DCTSIZE / 2);
+
+  int32x4_t tmp2 = vmull_lane_s16(z2_s16, consts.val[0], 1);
+  int32x4_t tmp3 = vmull_lane_s16(z2_s16, consts.val[1], 2);
+  tmp2 = vmlal_lane_s16(tmp2, z3_s16, consts.val[2], 1);
+  tmp3 = vmlal_lane_s16(tmp3, z3_s16, consts.val[0], 1);
+
+  z2_s16 = vld1_s16(workspace + 0 * DCTSIZE / 2);
+  z3_s16 = vld1_s16(workspace + 4 * DCTSIZE / 2);
+
+  int32x4_t tmp0 = vshll_n_s16(vadd_s16(z2_s16, z3_s16), CONST_BITS);
+  int32x4_t tmp1 = vshll_n_s16(vsub_s16(z2_s16, z3_s16), CONST_BITS);
+
+  int32x4_t tmp10 = vaddq_s32(tmp0, tmp3);
+  int32x4_t tmp13 = vsubq_s32(tmp0, tmp3);
+  int32x4_t tmp11 = vaddq_s32(tmp1, tmp2);
+  int32x4_t tmp12 = vsubq_s32(tmp1, tmp2);
+
+  /* Odd part */
+  int16x4_t tmp0_s16 = vld1_s16(workspace + 7 * DCTSIZE / 2);
+  int16x4_t tmp1_s16 = vld1_s16(workspace + 5 * DCTSIZE / 2);
+  int16x4_t tmp2_s16 = vld1_s16(workspace + 3 * DCTSIZE / 2);
+  int16x4_t tmp3_s16 = vld1_s16(workspace + 1 * DCTSIZE / 2);
+
+  z3_s16 = vadd_s16(tmp0_s16, tmp2_s16);
+  int16x4_t z4_s16 = vadd_s16(tmp1_s16, tmp3_s16);
+
+  /* Implementation as per jpeg_idct_islow() in jidctint.c:
+   *   z5 = (z3 + z4) * 1.175875602;
+   *   z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+   *   z3 += z5;  z4 += z5;
+   *
+   * This implementation:
+   *   z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+   *   z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+   */
+
+  int32x4_t z3 = vmull_lane_s16(z3_s16, consts.val[2], 3);
+  int32x4_t z4 = vmull_lane_s16(z3_s16, consts.val[1], 3);
+  z3 = vmlal_lane_s16(z3, z4_s16, consts.val[1], 3);
+  z4 = vmlal_lane_s16(z4, z4_s16, consts.val[2], 0);
+
+  /* Implementation as per jpeg_idct_islow() in jidctint.c:
+   *   z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;
+   *   tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;
+   *   tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;
+   *   z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+   *   tmp0 += z1 + z3;  tmp1 += z2 + z4;
+   *   tmp2 += z2 + z3;  tmp3 += z1 + z4;
+   *
+   * This implementation:
+   *   tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+   *   tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+   *   tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+   *   tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+   *   tmp0 += z3;  tmp1 += z4;
+   *   tmp2 += z3;  tmp3 += z4;
+   */
+
+  tmp0 = vmull_lane_s16(tmp0_s16, consts.val[0], 3);
+  tmp1 = vmull_lane_s16(tmp1_s16, consts.val[1], 1);
+  tmp2 = vmull_lane_s16(tmp2_s16, consts.val[2], 2);
+  tmp3 = vmull_lane_s16(tmp3_s16, consts.val[1], 0);
+
+  tmp0 = vmlsl_lane_s16(tmp0, tmp3_s16, consts.val[0], 0);
+  tmp1 = vmlsl_lane_s16(tmp1, tmp2_s16, consts.val[0], 2);
+  tmp2 = vmlsl_lane_s16(tmp2, tmp1_s16, consts.val[0], 2);
+  tmp3 = vmlsl_lane_s16(tmp3, tmp0_s16, consts.val[0], 0);
+
+  tmp0 = vaddq_s32(tmp0, z3);
+  tmp1 = vaddq_s32(tmp1, z4);
+  tmp2 = vaddq_s32(tmp2, z3);
+  tmp3 = vaddq_s32(tmp3, z4);
+
+  /* Final output stage: descale and narrow to 16-bit. */
+  int16x8_t cols_02_s16 = vcombine_s16(vaddhn_s32(tmp10, tmp3),
+                                       vaddhn_s32(tmp12, tmp1));
+  int16x8_t cols_13_s16 = vcombine_s16(vaddhn_s32(tmp11, tmp2),
+                                       vaddhn_s32(tmp13, tmp0));
+  int16x8_t cols_46_s16 = vcombine_s16(vsubhn_s32(tmp13, tmp0),
+                                       vsubhn_s32(tmp11, tmp2));
+  int16x8_t cols_57_s16 = vcombine_s16(vsubhn_s32(tmp12, tmp1),
+                                       vsubhn_s32(tmp10, tmp3));
+  /* Descale and narrow to 8-bit. */
+  int8x8_t cols_02_s8 = vqrshrn_n_s16(cols_02_s16, DESCALE_P2 - 16);
+  int8x8_t cols_13_s8 = vqrshrn_n_s16(cols_13_s16, DESCALE_P2 - 16);
+  int8x8_t cols_46_s8 = vqrshrn_n_s16(cols_46_s16, DESCALE_P2 - 16);
+  int8x8_t cols_57_s8 = vqrshrn_n_s16(cols_57_s16, DESCALE_P2 - 16);
+  /* Clamp to range [0-255]. */
+  uint8x8_t cols_02_u8 = vadd_u8(vreinterpret_u8_s8(cols_02_s8),
+                                 vdup_n_u8(CENTERJSAMPLE));
+  uint8x8_t cols_13_u8 = vadd_u8(vreinterpret_u8_s8(cols_13_s8),
+                                 vdup_n_u8(CENTERJSAMPLE));
+  uint8x8_t cols_46_u8 = vadd_u8(vreinterpret_u8_s8(cols_46_s8),
+                                 vdup_n_u8(CENTERJSAMPLE));
+  uint8x8_t cols_57_u8 = vadd_u8(vreinterpret_u8_s8(cols_57_s8),
+                                 vdup_n_u8(CENTERJSAMPLE));
+
+  /* Transpose 4x8 block and store to memory.  (Zipping adjacent columns
+   * together allows us to store 16-bit elements.)
+   */
+  uint8x8x2_t cols_01_23 = vzip_u8(cols_02_u8, cols_13_u8);
+  uint8x8x2_t cols_45_67 = vzip_u8(cols_46_u8, cols_57_u8);
+  uint16x4x4_t cols_01_23_45_67 = { {
+    vreinterpret_u16_u8(cols_01_23.val[0]),
+    vreinterpret_u16_u8(cols_01_23.val[1]),
+    vreinterpret_u16_u8(cols_45_67.val[0]),
+    vreinterpret_u16_u8(cols_45_67.val[1])
+  } };
+
+  JSAMPROW outptr0 = output_buf[buf_offset + 0] + output_col;
+  JSAMPROW outptr1 = output_buf[buf_offset + 1] + output_col;
+  JSAMPROW outptr2 = output_buf[buf_offset + 2] + output_col;
+  JSAMPROW outptr3 = output_buf[buf_offset + 3] + output_col;
+  /* VST4 of 16-bit elements completes the transpose. */
+  vst4_lane_u16((uint16_t *)outptr0, cols_01_23_45_67, 0);
+  vst4_lane_u16((uint16_t *)outptr1, cols_01_23_45_67, 1);
+  vst4_lane_u16((uint16_t *)outptr2, cols_01_23_45_67, 2);
+  vst4_lane_u16((uint16_t *)outptr3, cols_01_23_45_67, 3);
+}
+
+
+/* Performs the second pass of the accurate inverse DCT on a 4x8 block
+ * of coefficients.
+ *
+ * This "sparse" version assumes that the coefficient values (after the first
+ * pass) in rows 4-7 are all 0.  This simplifies the IDCT calculation,
+ * accelerating overall performance.
+ */
+
+static INLINE void jsimd_idct_islow_pass2_sparse(int16_t *workspace,
+                                                 JSAMPARRAY output_buf,
+                                                 JDIMENSION output_col,
+                                                 unsigned buf_offset)
+{
+  /* Load constants for IDCT computation. */
+#ifdef HAVE_VLD1_S16_X3
+  const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_islow_neon_consts);
+#else
+  const int16x4_t consts1 = vld1_s16(jsimd_idct_islow_neon_consts);
+  const int16x4_t consts2 = vld1_s16(jsimd_idct_islow_neon_consts + 4);
+  const int16x4_t consts3 = vld1_s16(jsimd_idct_islow_neon_consts + 8);
+  const int16x4x3_t consts = { { consts1, consts2, consts3 } };
+#endif
+
+  /* Even part (z3 is all 0) */
+  int16x4_t z2_s16 = vld1_s16(workspace + 2 * DCTSIZE / 2);
+
+  int32x4_t tmp2 = vmull_lane_s16(z2_s16, consts.val[0], 1);
+  int32x4_t tmp3 = vmull_lane_s16(z2_s16, consts.val[1], 2);
+
+  z2_s16 = vld1_s16(workspace + 0 * DCTSIZE / 2);
+  int32x4_t tmp0 = vshll_n_s16(z2_s16, CONST_BITS);
+  int32x4_t tmp1 = vshll_n_s16(z2_s16, CONST_BITS);
+
+  int32x4_t tmp10 = vaddq_s32(tmp0, tmp3);
+  int32x4_t tmp13 = vsubq_s32(tmp0, tmp3);
+  int32x4_t tmp11 = vaddq_s32(tmp1, tmp2);
+  int32x4_t tmp12 = vsubq_s32(tmp1, tmp2);
+
+  /* Odd part (tmp0 and tmp1 are both all 0) */
+  int16x4_t tmp2_s16 = vld1_s16(workspace + 3 * DCTSIZE / 2);
+  int16x4_t tmp3_s16 = vld1_s16(workspace + 1 * DCTSIZE / 2);
+
+  int16x4_t z3_s16 = tmp2_s16;
+  int16x4_t z4_s16 = tmp3_s16;
+
+  int32x4_t z3 = vmull_lane_s16(z3_s16, consts.val[2], 3);
+  z3 = vmlal_lane_s16(z3, z4_s16, consts.val[1], 3);
+  int32x4_t z4 = vmull_lane_s16(z3_s16, consts.val[1], 3);
+  z4 = vmlal_lane_s16(z4, z4_s16, consts.val[2], 0);
+
+  tmp0 = vmlsl_lane_s16(z3, tmp3_s16, consts.val[0], 0);
+  tmp1 = vmlsl_lane_s16(z4, tmp2_s16, consts.val[0], 2);
+  tmp2 = vmlal_lane_s16(z3, tmp2_s16, consts.val[2], 2);
+  tmp3 = vmlal_lane_s16(z4, tmp3_s16, consts.val[1], 0);
+
+  /* Final output stage: descale and narrow to 16-bit. */
+  int16x8_t cols_02_s16 = vcombine_s16(vaddhn_s32(tmp10, tmp3),
+                                       vaddhn_s32(tmp12, tmp1));
+  int16x8_t cols_13_s16 = vcombine_s16(vaddhn_s32(tmp11, tmp2),
+                                       vaddhn_s32(tmp13, tmp0));
+  int16x8_t cols_46_s16 = vcombine_s16(vsubhn_s32(tmp13, tmp0),
+                                       vsubhn_s32(tmp11, tmp2));
+  int16x8_t cols_57_s16 = vcombine_s16(vsubhn_s32(tmp12, tmp1),
+                                       vsubhn_s32(tmp10, tmp3));
+  /* Descale and narrow to 8-bit. */
+  int8x8_t cols_02_s8 = vqrshrn_n_s16(cols_02_s16, DESCALE_P2 - 16);
+  int8x8_t cols_13_s8 = vqrshrn_n_s16(cols_13_s16, DESCALE_P2 - 16);
+  int8x8_t cols_46_s8 = vqrshrn_n_s16(cols_46_s16, DESCALE_P2 - 16);
+  int8x8_t cols_57_s8 = vqrshrn_n_s16(cols_57_s16, DESCALE_P2 - 16);
+  /* Clamp to range [0-255]. */
+  uint8x8_t cols_02_u8 = vadd_u8(vreinterpret_u8_s8(cols_02_s8),
+                                 vdup_n_u8(CENTERJSAMPLE));
+  uint8x8_t cols_13_u8 = vadd_u8(vreinterpret_u8_s8(cols_13_s8),
+                                 vdup_n_u8(CENTERJSAMPLE));
+  uint8x8_t cols_46_u8 = vadd_u8(vreinterpret_u8_s8(cols_46_s8),
+                                 vdup_n_u8(CENTERJSAMPLE));
+  uint8x8_t cols_57_u8 = vadd_u8(vreinterpret_u8_s8(cols_57_s8),
+                                 vdup_n_u8(CENTERJSAMPLE));
+
+  /* Transpose 4x8 block and store to memory.  (Zipping adjacent columns
+   * together allows us to store 16-bit elements.)
+   */
+  uint8x8x2_t cols_01_23 = vzip_u8(cols_02_u8, cols_13_u8);
+  uint8x8x2_t cols_45_67 = vzip_u8(cols_46_u8, cols_57_u8);
+  uint16x4x4_t cols_01_23_45_67 = { {
+    vreinterpret_u16_u8(cols_01_23.val[0]),
+    vreinterpret_u16_u8(cols_01_23.val[1]),
+    vreinterpret_u16_u8(cols_45_67.val[0]),
+    vreinterpret_u16_u8(cols_45_67.val[1])
+  } };
+
+  JSAMPROW outptr0 = output_buf[buf_offset + 0] + output_col;
+  JSAMPROW outptr1 = output_buf[buf_offset + 1] + output_col;
+  JSAMPROW outptr2 = output_buf[buf_offset + 2] + output_col;
+  JSAMPROW outptr3 = output_buf[buf_offset + 3] + output_col;
+  /* VST4 of 16-bit elements completes the transpose. */
+  vst4_lane_u16((uint16_t *)outptr0, cols_01_23_45_67, 0);
+  vst4_lane_u16((uint16_t *)outptr1, cols_01_23_45_67, 1);
+  vst4_lane_u16((uint16_t *)outptr2, cols_01_23_45_67, 2);
+  vst4_lane_u16((uint16_t *)outptr3, cols_01_23_45_67, 3);
+}
diff --git a/media/libjpeg/simd/arm/jidctred-neon.c b/media/libjpeg/simd/arm/jidctred-neon.c
new file mode 100644
index 0000000000..be9627e61d
--- /dev/null
+++ b/media/libjpeg/simd/arm/jidctred-neon.c
@@ -0,0 +1,486 @@
+/*
+ * jidctred-neon.c - reduced-size IDCT (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ * Copyright (C) 2020, D. R. Commander.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "align.h"
+#include "neon-compat.h"
+
+#include <arm_neon.h>
+
+
+#define CONST_BITS  13
+#define PASS1_BITS  2
+
+#define F_0_211  1730
+#define F_0_509  4176
+#define F_0_601  4926
+#define F_0_720  5906
+#define F_0_765  6270
+#define F_0_850  6967
+#define F_0_899  7373
+#define F_1_061  8697
+#define F_1_272  10426
+#define F_1_451  11893
+#define F_1_847  15137
+#define F_2_172  17799
+#define F_2_562  20995
+#define F_3_624  29692
+
+
+/* jsimd_idct_2x2_neon() is an inverse DCT function that produces reduced-size
+ * 2x2 output from an 8x8 DCT block.  It uses the same calculations and
+ * produces exactly the same output as IJG's original jpeg_idct_2x2() function
+ * from jpeg-6b, which can be found in jidctred.c.
+ *
+ * Scaled integer constants are used to avoid floating-point arithmetic:
+ *    0.720959822 =  5906 * 2^-13
+ *    0.850430095 =  6967 * 2^-13
+ *    1.272758580 = 10426 * 2^-13
+ *    3.624509785 = 29692 * 2^-13
+ *
+ * See jidctred.c for further details of the 2x2 IDCT algorithm.  Where
+ * possible, the variable names and comments here in jsimd_idct_2x2_neon()
+ * match up with those in jpeg_idct_2x2().
+ */
+
+ALIGN(16) static const int16_t jsimd_idct_2x2_neon_consts[] = {
+  -F_0_720, F_0_850, -F_1_272, F_3_624
+};
+
+void jsimd_idct_2x2_neon(void *dct_table, JCOEFPTR coef_block,
+                         JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  ISLOW_MULT_TYPE *quantptr = dct_table;
+
+  /* Load DCT coefficients. */
+  int16x8_t row0 = vld1q_s16(coef_block + 0 * DCTSIZE);
+  int16x8_t row1 = vld1q_s16(coef_block + 1 * DCTSIZE);
+  int16x8_t row3 = vld1q_s16(coef_block + 3 * DCTSIZE);
+  int16x8_t row5 = vld1q_s16(coef_block + 5 * DCTSIZE);
+  int16x8_t row7 = vld1q_s16(coef_block + 7 * DCTSIZE);
+
+  /* Load quantization table values. */
+  int16x8_t quant_row0 = vld1q_s16(quantptr + 0 * DCTSIZE);
+  int16x8_t quant_row1 = vld1q_s16(quantptr + 1 * DCTSIZE);
+  int16x8_t quant_row3 = vld1q_s16(quantptr + 3 * DCTSIZE);
+  int16x8_t quant_row5 = vld1q_s16(quantptr + 5 * DCTSIZE);
+  int16x8_t quant_row7 = vld1q_s16(quantptr + 7 * DCTSIZE);
+
+  /* Dequantize DCT coefficients. */
+  row0 = vmulq_s16(row0, quant_row0);
+  row1 = vmulq_s16(row1, quant_row1);
+  row3 = vmulq_s16(row3, quant_row3);
+  row5 = vmulq_s16(row5, quant_row5);
+  row7 = vmulq_s16(row7, quant_row7);
+
+  /* Load IDCT conversion constants. */
+  const int16x4_t consts = vld1_s16(jsimd_idct_2x2_neon_consts);
+
+  /* Pass 1: process columns from input, put results in vectors row0 and
+   * row1.
+   */
+
+  /* Even part */
+  int32x4_t tmp10_l = vshll_n_s16(vget_low_s16(row0), CONST_BITS + 2);
+  int32x4_t tmp10_h = vshll_n_s16(vget_high_s16(row0), CONST_BITS + 2);
+
+  /* Odd part */
+  int32x4_t tmp0_l = vmull_lane_s16(vget_low_s16(row1), consts, 3);
+  tmp0_l = vmlal_lane_s16(tmp0_l, vget_low_s16(row3), consts, 2);
+  tmp0_l = vmlal_lane_s16(tmp0_l, vget_low_s16(row5), consts, 1);
+  tmp0_l = vmlal_lane_s16(tmp0_l, vget_low_s16(row7), consts, 0);
+  int32x4_t tmp0_h = vmull_lane_s16(vget_high_s16(row1), consts, 3);
+  tmp0_h = vmlal_lane_s16(tmp0_h, vget_high_s16(row3), consts, 2);
+  tmp0_h = vmlal_lane_s16(tmp0_h, vget_high_s16(row5), consts, 1);
+  tmp0_h = vmlal_lane_s16(tmp0_h, vget_high_s16(row7), consts, 0);
+
+  /* Final output stage: descale and narrow to 16-bit. */
+  row0 = vcombine_s16(vrshrn_n_s32(vaddq_s32(tmp10_l, tmp0_l), CONST_BITS),
+                      vrshrn_n_s32(vaddq_s32(tmp10_h, tmp0_h), CONST_BITS));
+  row1 = vcombine_s16(vrshrn_n_s32(vsubq_s32(tmp10_l, tmp0_l), CONST_BITS),
+                      vrshrn_n_s32(vsubq_s32(tmp10_h, tmp0_h), CONST_BITS));
+
+  /* Transpose two rows, ready for second pass. */
+  int16x8x2_t cols_0246_1357 = vtrnq_s16(row0, row1);
+  int16x8_t cols_0246 = cols_0246_1357.val[0];
+  int16x8_t cols_1357 = cols_0246_1357.val[1];
+  /* Duplicate columns such that each is accessible in its own vector. */
+  int32x4x2_t cols_1155_3377 = vtrnq_s32(vreinterpretq_s32_s16(cols_1357),
+                                         vreinterpretq_s32_s16(cols_1357));
+  int16x8_t cols_1155 = vreinterpretq_s16_s32(cols_1155_3377.val[0]);
+  int16x8_t cols_3377 = vreinterpretq_s16_s32(cols_1155_3377.val[1]);
+
+  /* Pass 2: process two rows, store to output array. */
+
+  /* Even part: we're only interested in col0; the top half of tmp10 is "don't
+   * care."
+   */
+  int32x4_t tmp10 = vshll_n_s16(vget_low_s16(cols_0246), CONST_BITS + 2);
+
+  /* Odd part: we're only interested in the bottom half of tmp0. */
+  int32x4_t tmp0 = vmull_lane_s16(vget_low_s16(cols_1155), consts, 3);
+  tmp0 = vmlal_lane_s16(tmp0, vget_low_s16(cols_3377), consts, 2);
+  tmp0 = vmlal_lane_s16(tmp0, vget_high_s16(cols_1155), consts, 1);
+  tmp0 = vmlal_lane_s16(tmp0, vget_high_s16(cols_3377), consts, 0);
+
+  /* Final output stage: descale and clamp to range [0-255]. */
+  int16x8_t output_s16 = vcombine_s16(vaddhn_s32(tmp10, tmp0),
+                                      vsubhn_s32(tmp10, tmp0));
+  output_s16 = vrsraq_n_s16(vdupq_n_s16(CENTERJSAMPLE), output_s16,
+                            CONST_BITS + PASS1_BITS + 3 + 2 - 16);
+  /* Narrow to 8-bit and convert to unsigned. */
+  uint8x8_t output_u8 = vqmovun_s16(output_s16);
+
+  /* Store 2x2 block to memory. */
+  vst1_lane_u8(output_buf[0] + output_col, output_u8, 0);
+  vst1_lane_u8(output_buf[1] + output_col, output_u8, 1);
+  vst1_lane_u8(output_buf[0] + output_col + 1, output_u8, 4);
+  vst1_lane_u8(output_buf[1] + output_col + 1, output_u8, 5);
+}
+
+
+/* jsimd_idct_4x4_neon() is an inverse DCT function that produces reduced-size
+ * 4x4 output from an 8x8 DCT block.  It uses the same calculations and
+ * produces exactly the same output as IJG's original jpeg_idct_4x4() function
+ * from jpeg-6b, which can be found in jidctred.c.
+ *
+ * Scaled integer constants are used to avoid floating-point arithmetic:
+ *    0.211164243 =  1730 * 2^-13
+ *    0.509795579 =  4176 * 2^-13
+ *    0.601344887 =  4926 * 2^-13
+ *    0.765366865 =  6270 * 2^-13
+ *    0.899976223 =  7373 * 2^-13
+ *    1.061594337 =  8697 * 2^-13
+ *    1.451774981 = 11893 * 2^-13
+ *    1.847759065 = 15137 * 2^-13
+ *    2.172734803 = 17799 * 2^-13
+ *    2.562915447 = 20995 * 2^-13
+ *
+ * See jidctred.c for further details of the 4x4 IDCT algorithm.  Where
+ * possible, the variable names and comments here in jsimd_idct_4x4_neon()
+ * match up with those in jpeg_idct_4x4().
+ */
+
+ALIGN(16) static const int16_t jsimd_idct_4x4_neon_consts[] = {
+  F_1_847, -F_0_765, -F_0_211,  F_1_451,
+ -F_2_172,  F_1_061, -F_0_509, -F_0_601,
+  F_0_899,  F_2_562,        0,        0
+};
+
+void jsimd_idct_4x4_neon(void *dct_table, JCOEFPTR coef_block,
+                         JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  ISLOW_MULT_TYPE *quantptr = dct_table;
+
+  /* Load DCT coefficients. */
+  int16x8_t row0  = vld1q_s16(coef_block + 0 * DCTSIZE);
+  int16x8_t row1  = vld1q_s16(coef_block + 1 * DCTSIZE);
+  int16x8_t row2  = vld1q_s16(coef_block + 2 * DCTSIZE);
+  int16x8_t row3  = vld1q_s16(coef_block + 3 * DCTSIZE);
+  int16x8_t row5  = vld1q_s16(coef_block + 5 * DCTSIZE);
+  int16x8_t row6  = vld1q_s16(coef_block + 6 * DCTSIZE);
+  int16x8_t row7  = vld1q_s16(coef_block + 7 * DCTSIZE);
+
+  /* Load quantization table values for DC coefficients. */
+  int16x8_t quant_row0 = vld1q_s16(quantptr + 0 * DCTSIZE);
+  /* Dequantize DC coefficients. */
+  row0 = vmulq_s16(row0, quant_row0);
+
+  /* Construct bitmap to test if all AC coefficients are 0. */
+  int16x8_t bitmap = vorrq_s16(row1, row2);
+  bitmap = vorrq_s16(bitmap, row3);
+  bitmap = vorrq_s16(bitmap, row5);
+  bitmap = vorrq_s16(bitmap, row6);
+  bitmap = vorrq_s16(bitmap, row7);
+
+  int64_t left_ac_bitmap = vgetq_lane_s64(vreinterpretq_s64_s16(bitmap), 0);
+  int64_t right_ac_bitmap = vgetq_lane_s64(vreinterpretq_s64_s16(bitmap), 1);
+
+  /* Load constants for IDCT computation. */
+#ifdef HAVE_VLD1_S16_X3
+  const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_4x4_neon_consts);
+#else
+  /* GCC does not currently support the intrinsic vld1_<type>_x3(). */
+  const int16x4_t consts1 = vld1_s16(jsimd_idct_4x4_neon_consts);
+  const int16x4_t consts2 = vld1_s16(jsimd_idct_4x4_neon_consts + 4);
+  const int16x4_t consts3 = vld1_s16(jsimd_idct_4x4_neon_consts + 8);
+  const int16x4x3_t consts = { { consts1, consts2, consts3 } };
+#endif
+
+  if (left_ac_bitmap == 0 && right_ac_bitmap == 0) {
+    /* All AC coefficients are zero.
+     * Compute DC values and duplicate into row vectors 0, 1, 2, and 3.
+     */
+    int16x8_t dcval = vshlq_n_s16(row0, PASS1_BITS);
+    row0 = dcval;
+    row1 = dcval;
+    row2 = dcval;
+    row3 = dcval;
+  } else if (left_ac_bitmap == 0) {
+    /* AC coefficients are zero for columns 0, 1, 2, and 3.
+     * Compute DC values for these columns.
+     */
+    int16x4_t dcval = vshl_n_s16(vget_low_s16(row0), PASS1_BITS);
+
+    /* Commence regular IDCT computation for columns 4, 5, 6, and 7. */
+
+    /* Load quantization table. */
+    int16x4_t quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE + 4);
+    int16x4_t quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE + 4);
+    int16x4_t quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE + 4);
+    int16x4_t quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE + 4);
+    int16x4_t quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE + 4);
+    int16x4_t quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE + 4);
+
+    /* Even part */
+    int32x4_t tmp0 = vshll_n_s16(vget_high_s16(row0), CONST_BITS + 1);
+
+    int16x4_t z2 = vmul_s16(vget_high_s16(row2), quant_row2);
+    int16x4_t z3 = vmul_s16(vget_high_s16(row6), quant_row6);
+
+    int32x4_t tmp2 = vmull_lane_s16(z2, consts.val[0], 0);
+    tmp2 = vmlal_lane_s16(tmp2, z3, consts.val[0], 1);
+
+    int32x4_t tmp10 = vaddq_s32(tmp0, tmp2);
+    int32x4_t tmp12 = vsubq_s32(tmp0, tmp2);
+
+    /* Odd part */
+    int16x4_t z1 = vmul_s16(vget_high_s16(row7), quant_row7);
+    z2 = vmul_s16(vget_high_s16(row5), quant_row5);
+    z3 = vmul_s16(vget_high_s16(row3), quant_row3);
+    int16x4_t z4 = vmul_s16(vget_high_s16(row1), quant_row1);
+
+    tmp0 = vmull_lane_s16(z1, consts.val[0], 2);
+    tmp0 = vmlal_lane_s16(tmp0, z2, consts.val[0], 3);
+    tmp0 = vmlal_lane_s16(tmp0, z3, consts.val[1], 0);
+    tmp0 = vmlal_lane_s16(tmp0, z4, consts.val[1], 1);
+
+    tmp2 = vmull_lane_s16(z1, consts.val[1], 2);
+    tmp2 = vmlal_lane_s16(tmp2, z2, consts.val[1], 3);
+    tmp2 = vmlal_lane_s16(tmp2, z3, consts.val[2], 0);
+    tmp2 = vmlal_lane_s16(tmp2, z4, consts.val[2], 1);
+
+    /* Final output stage: descale and narrow to 16-bit. */
+    row0 = vcombine_s16(dcval, vrshrn_n_s32(vaddq_s32(tmp10, tmp2),
+                                            CONST_BITS - PASS1_BITS + 1));
+    row3 = vcombine_s16(dcval, vrshrn_n_s32(vsubq_s32(tmp10, tmp2),
+                                            CONST_BITS - PASS1_BITS + 1));
+    row1 = vcombine_s16(dcval, vrshrn_n_s32(vaddq_s32(tmp12, tmp0),
+                                            CONST_BITS - PASS1_BITS + 1));
+    row2 = vcombine_s16(dcval, vrshrn_n_s32(vsubq_s32(tmp12, tmp0),
+                                            CONST_BITS - PASS1_BITS + 1));
+  } else if (right_ac_bitmap == 0) {
+    /* AC coefficients are zero for columns 4, 5, 6, and 7.
+     * Compute DC values for these columns.
+     */
+    int16x4_t dcval = vshl_n_s16(vget_high_s16(row0), PASS1_BITS);
+
+    /* Commence regular IDCT computation for columns 0, 1, 2, and 3. */
+
+    /* Load quantization table. */
+    int16x4_t quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE);
+    int16x4_t quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE);
+    int16x4_t quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE);
+    int16x4_t quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE);
+    int16x4_t quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE);
+    int16x4_t quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE);
+
+    /* Even part */
+    int32x4_t tmp0 = vshll_n_s16(vget_low_s16(row0), CONST_BITS + 1);
+
+    int16x4_t z2 = vmul_s16(vget_low_s16(row2), quant_row2);
+    int16x4_t z3 = vmul_s16(vget_low_s16(row6), quant_row6);
+
+    int32x4_t tmp2 = vmull_lane_s16(z2, consts.val[0], 0);
+    tmp2 = vmlal_lane_s16(tmp2, z3, consts.val[0], 1);
+
+    int32x4_t tmp10 = vaddq_s32(tmp0, tmp2);
+    int32x4_t tmp12 = vsubq_s32(tmp0, tmp2);
+
+    /* Odd part */
+    int16x4_t z1 = vmul_s16(vget_low_s16(row7), quant_row7);
+    z2 = vmul_s16(vget_low_s16(row5), quant_row5);
+    z3 = vmul_s16(vget_low_s16(row3), quant_row3);
+    int16x4_t z4 = vmul_s16(vget_low_s16(row1), quant_row1);
+
+    tmp0 = vmull_lane_s16(z1, consts.val[0], 2);
+    tmp0 = vmlal_lane_s16(tmp0, z2, consts.val[0], 3);
+    tmp0 = vmlal_lane_s16(tmp0, z3, consts.val[1], 0);
+    tmp0 = vmlal_lane_s16(tmp0, z4, consts.val[1], 1);
+
+    tmp2 = vmull_lane_s16(z1, consts.val[1], 2);
+    tmp2 = vmlal_lane_s16(tmp2, z2, consts.val[1], 3);
+    tmp2 = vmlal_lane_s16(tmp2, z3, consts.val[2], 0);
+    tmp2 = vmlal_lane_s16(tmp2, z4, consts.val[2], 1);
+
+    /* Final output stage: descale and narrow to 16-bit. */
+    row0 = vcombine_s16(vrshrn_n_s32(vaddq_s32(tmp10, tmp2),
+                                     CONST_BITS - PASS1_BITS + 1), dcval);
+    row3 = vcombine_s16(vrshrn_n_s32(vsubq_s32(tmp10, tmp2),
+                                     CONST_BITS - PASS1_BITS + 1), dcval);
+    row1 = vcombine_s16(vrshrn_n_s32(vaddq_s32(tmp12, tmp0),
+                                     CONST_BITS - PASS1_BITS + 1), dcval);
+    row2 = vcombine_s16(vrshrn_n_s32(vsubq_s32(tmp12, tmp0),
+                                     CONST_BITS - PASS1_BITS + 1), dcval);
+  } else {
+    /* All AC coefficients are non-zero; full IDCT calculation required. */
+    int16x8_t quant_row1 = vld1q_s16(quantptr + 1 * DCTSIZE);
+    int16x8_t quant_row2 = vld1q_s16(quantptr + 2 * DCTSIZE);
+    int16x8_t quant_row3 = vld1q_s16(quantptr + 3 * DCTSIZE);
+    int16x8_t quant_row5 = vld1q_s16(quantptr + 5 * DCTSIZE);
+    int16x8_t quant_row6 = vld1q_s16(quantptr + 6 * DCTSIZE);
+    int16x8_t quant_row7 = vld1q_s16(quantptr + 7 * DCTSIZE);
+
+    /* Even part */
+    int32x4_t tmp0_l = vshll_n_s16(vget_low_s16(row0), CONST_BITS + 1);
+    int32x4_t tmp0_h = vshll_n_s16(vget_high_s16(row0), CONST_BITS + 1);
+
+    int16x8_t z2 = vmulq_s16(row2, quant_row2);
+    int16x8_t z3 = vmulq_s16(row6, quant_row6);
+
+    int32x4_t tmp2_l = vmull_lane_s16(vget_low_s16(z2), consts.val[0], 0);
+    int32x4_t tmp2_h = vmull_lane_s16(vget_high_s16(z2), consts.val[0], 0);
+    tmp2_l = vmlal_lane_s16(tmp2_l, vget_low_s16(z3), consts.val[0], 1);
+    tmp2_h = vmlal_lane_s16(tmp2_h, vget_high_s16(z3), consts.val[0], 1);
+
+    int32x4_t tmp10_l = vaddq_s32(tmp0_l, tmp2_l);
+    int32x4_t tmp10_h = vaddq_s32(tmp0_h, tmp2_h);
+    int32x4_t tmp12_l = vsubq_s32(tmp0_l, tmp2_l);
+    int32x4_t tmp12_h = vsubq_s32(tmp0_h, tmp2_h);
+
+    /* Odd part */
+    int16x8_t z1 = vmulq_s16(row7, quant_row7);
+    z2 = vmulq_s16(row5, quant_row5);
+    z3 = vmulq_s16(row3, quant_row3);
+    int16x8_t z4 = vmulq_s16(row1, quant_row1);
+
+    tmp0_l = vmull_lane_s16(vget_low_s16(z1), consts.val[0], 2);
+    tmp0_l = vmlal_lane_s16(tmp0_l, vget_low_s16(z2), consts.val[0], 3);
+    tmp0_l = vmlal_lane_s16(tmp0_l, vget_low_s16(z3), consts.val[1], 0);
+    tmp0_l = vmlal_lane_s16(tmp0_l, vget_low_s16(z4), consts.val[1], 1);
+    tmp0_h = vmull_lane_s16(vget_high_s16(z1), consts.val[0], 2);
+    tmp0_h = vmlal_lane_s16(tmp0_h, vget_high_s16(z2), consts.val[0], 3);
+    tmp0_h = vmlal_lane_s16(tmp0_h, vget_high_s16(z3), consts.val[1], 0);
+    tmp0_h = vmlal_lane_s16(tmp0_h, vget_high_s16(z4), consts.val[1], 1);
+
+    tmp2_l = vmull_lane_s16(vget_low_s16(z1), consts.val[1], 2);
+    tmp2_l = vmlal_lane_s16(tmp2_l, vget_low_s16(z2), consts.val[1], 3);
+    tmp2_l = vmlal_lane_s16(tmp2_l, vget_low_s16(z3), consts.val[2], 0);
+    tmp2_l = vmlal_lane_s16(tmp2_l, vget_low_s16(z4), consts.val[2], 1);
+    tmp2_h = vmull_lane_s16(vget_high_s16(z1), consts.val[1], 2);
+    tmp2_h = vmlal_lane_s16(tmp2_h, vget_high_s16(z2), consts.val[1], 3);
+    tmp2_h = vmlal_lane_s16(tmp2_h, vget_high_s16(z3), consts.val[2], 0);
+    tmp2_h = vmlal_lane_s16(tmp2_h, vget_high_s16(z4), consts.val[2], 1);
+
+    /* Final output stage: descale and narrow to 16-bit. */
+    row0 = vcombine_s16(vrshrn_n_s32(vaddq_s32(tmp10_l, tmp2_l),
+                                     CONST_BITS - PASS1_BITS + 1),
+                        vrshrn_n_s32(vaddq_s32(tmp10_h, tmp2_h),
+                                     CONST_BITS - PASS1_BITS + 1));
+    row3 = vcombine_s16(vrshrn_n_s32(vsubq_s32(tmp10_l, tmp2_l),
+                                     CONST_BITS - PASS1_BITS + 1),
+                        vrshrn_n_s32(vsubq_s32(tmp10_h, tmp2_h),
+                                     CONST_BITS - PASS1_BITS + 1));
+    row1 = vcombine_s16(vrshrn_n_s32(vaddq_s32(tmp12_l, tmp0_l),
+                                     CONST_BITS - PASS1_BITS + 1),
+                        vrshrn_n_s32(vaddq_s32(tmp12_h, tmp0_h),
+                                     CONST_BITS - PASS1_BITS + 1));
+    row2 = vcombine_s16(vrshrn_n_s32(vsubq_s32(tmp12_l, tmp0_l),
+                                     CONST_BITS - PASS1_BITS + 1),
+                        vrshrn_n_s32(vsubq_s32(tmp12_h, tmp0_h),
+                                     CONST_BITS - PASS1_BITS + 1));
+  }
+
+  /* Transpose 8x4 block to perform IDCT on rows in second pass. */
+  int16x8x2_t row_01 = vtrnq_s16(row0, row1);
+  int16x8x2_t row_23 = vtrnq_s16(row2, row3);
+
+  int32x4x2_t cols_0426 = vtrnq_s32(vreinterpretq_s32_s16(row_01.val[0]),
+                                    vreinterpretq_s32_s16(row_23.val[0]));
+  int32x4x2_t cols_1537 = vtrnq_s32(vreinterpretq_s32_s16(row_01.val[1]),
+                                    vreinterpretq_s32_s16(row_23.val[1]));
+
+  int16x4_t col0 = vreinterpret_s16_s32(vget_low_s32(cols_0426.val[0]));
+  int16x4_t col1 = vreinterpret_s16_s32(vget_low_s32(cols_1537.val[0]));
+  int16x4_t col2 = vreinterpret_s16_s32(vget_low_s32(cols_0426.val[1]));
+  int16x4_t col3 = vreinterpret_s16_s32(vget_low_s32(cols_1537.val[1]));
+  int16x4_t col5 = vreinterpret_s16_s32(vget_high_s32(cols_1537.val[0]));
+  int16x4_t col6 = vreinterpret_s16_s32(vget_high_s32(cols_0426.val[1]));
+  int16x4_t col7 = vreinterpret_s16_s32(vget_high_s32(cols_1537.val[1]));
+
+  /* Commence second pass of IDCT. */
+
+  /* Even part */
+  int32x4_t tmp0 = vshll_n_s16(col0, CONST_BITS + 1);
+  int32x4_t tmp2 = vmull_lane_s16(col2, consts.val[0], 0);
+  tmp2 = vmlal_lane_s16(tmp2, col6, consts.val[0], 1);
+
+  int32x4_t tmp10 = vaddq_s32(tmp0, tmp2);
+  int32x4_t tmp12 = vsubq_s32(tmp0, tmp2);
+
+  /* Odd part */
+  tmp0 = vmull_lane_s16(col7, consts.val[0], 2);
+  tmp0 = vmlal_lane_s16(tmp0, col5, consts.val[0], 3);
+  tmp0 = vmlal_lane_s16(tmp0, col3, consts.val[1], 0);
+  tmp0 = vmlal_lane_s16(tmp0, col1, consts.val[1], 1);
+
+  tmp2 = vmull_lane_s16(col7, consts.val[1], 2);
+  tmp2 = vmlal_lane_s16(tmp2, col5, consts.val[1], 3);
+  tmp2 = vmlal_lane_s16(tmp2, col3, consts.val[2], 0);
+  tmp2 = vmlal_lane_s16(tmp2, col1, consts.val[2], 1);
+
+  /* Final output stage: descale and clamp to range [0-255]. */
+  int16x8_t output_cols_02 = vcombine_s16(vaddhn_s32(tmp10, tmp2),
+                                          vsubhn_s32(tmp12, tmp0));
+  int16x8_t output_cols_13 = vcombine_s16(vaddhn_s32(tmp12, tmp0),
+                                          vsubhn_s32(tmp10, tmp2));
+  output_cols_02 = vrsraq_n_s16(vdupq_n_s16(CENTERJSAMPLE), output_cols_02,
+                                CONST_BITS + PASS1_BITS + 3 + 1 - 16);
+  output_cols_13 = vrsraq_n_s16(vdupq_n_s16(CENTERJSAMPLE), output_cols_13,
+                                CONST_BITS + PASS1_BITS + 3 + 1 - 16);
+  /* Narrow to 8-bit and convert to unsigned while zipping 8-bit elements.
+   * An interleaving store completes the transpose.
+   */
+  uint8x8x2_t output_0123 = vzip_u8(vqmovun_s16(output_cols_02),
+                                    vqmovun_s16(output_cols_13));
+  uint16x4x2_t output_01_23 = { {
+    vreinterpret_u16_u8(output_0123.val[0]),
+    vreinterpret_u16_u8(output_0123.val[1])
+  } };
+
+  /* Store 4x4 block to memory. */
+  JSAMPROW outptr0 = output_buf[0] + output_col;
+  JSAMPROW outptr1 = output_buf[1] + output_col;
+  JSAMPROW outptr2 = output_buf[2] + output_col;
+  JSAMPROW outptr3 = output_buf[3] + output_col;
+  vst2_lane_u16((uint16_t *)outptr0, output_01_23, 0);
+  vst2_lane_u16((uint16_t *)outptr1, output_01_23, 1);
+  vst2_lane_u16((uint16_t *)outptr2, output_01_23, 2);
+  vst2_lane_u16((uint16_t *)outptr3, output_01_23, 3);
+}
diff --git a/media/libjpeg/simd/arm/jquanti-neon.c b/media/libjpeg/simd/arm/jquanti-neon.c
new file mode 100644
index 0000000000..d5d95d89f6
--- /dev/null
+++ b/media/libjpeg/simd/arm/jquanti-neon.c
@@ -0,0 +1,193 @@
+/*
+ * jquanti-neon.c - sample data conversion and quantization (Arm Neon)
+ *
+ * Copyright (C) 2020-2021, Arm Limited.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+
+#include <arm_neon.h>
+
+
+/* After downsampling, the resulting sample values are in the range [0, 255],
+ * but the Discrete Cosine Transform (DCT) operates on values centered around
+ * 0.
+ *
+ * To prepare sample values for the DCT, load samples into a DCT workspace,
+ * subtracting CENTERJSAMPLE (128).  The samples, now in the range [-128, 127],
+ * are also widened from 8- to 16-bit.
+ *
+ * The equivalent scalar C function convsamp() can be found in jcdctmgr.c.
+ */
+
+void jsimd_convsamp_neon(JSAMPARRAY sample_data, JDIMENSION start_col,
+                         DCTELEM *workspace)
+{
+  uint8x8_t samp_row0 = vld1_u8(sample_data[0] + start_col);
+  uint8x8_t samp_row1 = vld1_u8(sample_data[1] + start_col);
+  uint8x8_t samp_row2 = vld1_u8(sample_data[2] + start_col);
+  uint8x8_t samp_row3 = vld1_u8(sample_data[3] + start_col);
+  uint8x8_t samp_row4 = vld1_u8(sample_data[4] + start_col);
+  uint8x8_t samp_row5 = vld1_u8(sample_data[5] + start_col);
+  uint8x8_t samp_row6 = vld1_u8(sample_data[6] + start_col);
+  uint8x8_t samp_row7 = vld1_u8(sample_data[7] + start_col);
+
+  int16x8_t row0 =
+    vreinterpretq_s16_u16(vsubl_u8(samp_row0, vdup_n_u8(CENTERJSAMPLE)));
+  int16x8_t row1 =
+    vreinterpretq_s16_u16(vsubl_u8(samp_row1, vdup_n_u8(CENTERJSAMPLE)));
+  int16x8_t row2 =
+    vreinterpretq_s16_u16(vsubl_u8(samp_row2, vdup_n_u8(CENTERJSAMPLE)));
+  int16x8_t row3 =
+    vreinterpretq_s16_u16(vsubl_u8(samp_row3, vdup_n_u8(CENTERJSAMPLE)));
+  int16x8_t row4 =
+    vreinterpretq_s16_u16(vsubl_u8(samp_row4, vdup_n_u8(CENTERJSAMPLE)));
+  int16x8_t row5 =
+    vreinterpretq_s16_u16(vsubl_u8(samp_row5, vdup_n_u8(CENTERJSAMPLE)));
+  int16x8_t row6 =
+    vreinterpretq_s16_u16(vsubl_u8(samp_row6, vdup_n_u8(CENTERJSAMPLE)));
+  int16x8_t row7 =
+    vreinterpretq_s16_u16(vsubl_u8(samp_row7, vdup_n_u8(CENTERJSAMPLE)));
+
+  vst1q_s16(workspace + 0 * DCTSIZE, row0);
+  vst1q_s16(workspace + 1 * DCTSIZE, row1);
+  vst1q_s16(workspace + 2 * DCTSIZE, row2);
+  vst1q_s16(workspace + 3 * DCTSIZE, row3);
+  vst1q_s16(workspace + 4 * DCTSIZE, row4);
+  vst1q_s16(workspace + 5 * DCTSIZE, row5);
+  vst1q_s16(workspace + 6 * DCTSIZE, row6);
+  vst1q_s16(workspace + 7 * DCTSIZE, row7);
+}
+
+
+/* After the DCT, the resulting array of coefficient values needs to be divided
+ * by an array of quantization values.
+ *
+ * To avoid a slow division operation, the DCT coefficients are multiplied by
+ * the (scaled) reciprocals of the quantization values and then right-shifted.
+ *
+ * The equivalent scalar C function quantize() can be found in jcdctmgr.c.
+ */
+
+void jsimd_quantize_neon(JCOEFPTR coef_block, DCTELEM *divisors,
+                         DCTELEM *workspace)
+{
+  JCOEFPTR out_ptr = coef_block;
+  UDCTELEM *recip_ptr = (UDCTELEM *)divisors;
+  UDCTELEM *corr_ptr = (UDCTELEM *)divisors + DCTSIZE2;
+  DCTELEM *shift_ptr = divisors + 3 * DCTSIZE2;
+  int i;
+
+#if defined(__clang__) && (defined(__aarch64__) || defined(_M_ARM64))
+#pragma unroll
+#endif
+  for (i = 0; i < DCTSIZE; i += DCTSIZE / 2) {
+    /* Load DCT coefficients. */
+    int16x8_t row0 = vld1q_s16(workspace + (i + 0) * DCTSIZE);
+    int16x8_t row1 = vld1q_s16(workspace + (i + 1) * DCTSIZE);
+    int16x8_t row2 = vld1q_s16(workspace + (i + 2) * DCTSIZE);
+    int16x8_t row3 = vld1q_s16(workspace + (i + 3) * DCTSIZE);
+    /* Load reciprocals of quantization values. */
+    uint16x8_t recip0 = vld1q_u16(recip_ptr + (i + 0) * DCTSIZE);
+    uint16x8_t recip1 = vld1q_u16(recip_ptr + (i + 1) * DCTSIZE);
+    uint16x8_t recip2 = vld1q_u16(recip_ptr + (i + 2) * DCTSIZE);
+    uint16x8_t recip3 = vld1q_u16(recip_ptr + (i + 3) * DCTSIZE);
+    uint16x8_t corr0 = vld1q_u16(corr_ptr + (i + 0) * DCTSIZE);
+    uint16x8_t corr1 = vld1q_u16(corr_ptr + (i + 1) * DCTSIZE);
+    uint16x8_t corr2 = vld1q_u16(corr_ptr + (i + 2) * DCTSIZE);
+    uint16x8_t corr3 = vld1q_u16(corr_ptr + (i + 3) * DCTSIZE);
+    int16x8_t shift0 = vld1q_s16(shift_ptr + (i + 0) * DCTSIZE);
+    int16x8_t shift1 = vld1q_s16(shift_ptr + (i + 1) * DCTSIZE);
+    int16x8_t shift2 = vld1q_s16(shift_ptr + (i + 2) * DCTSIZE);
+    int16x8_t shift3 = vld1q_s16(shift_ptr + (i + 3) * DCTSIZE);
+
+    /* Extract sign from coefficients. */
+    int16x8_t sign_row0 = vshrq_n_s16(row0, 15);
+    int16x8_t sign_row1 = vshrq_n_s16(row1, 15);
+    int16x8_t sign_row2 = vshrq_n_s16(row2, 15);
+    int16x8_t sign_row3 = vshrq_n_s16(row3, 15);
+    /* Get absolute value of DCT coefficients. */
+    uint16x8_t abs_row0 = vreinterpretq_u16_s16(vabsq_s16(row0));
+    uint16x8_t abs_row1 = vreinterpretq_u16_s16(vabsq_s16(row1));
+    uint16x8_t abs_row2 = vreinterpretq_u16_s16(vabsq_s16(row2));
+    uint16x8_t abs_row3 = vreinterpretq_u16_s16(vabsq_s16(row3));
+    /* Add correction. */
+    abs_row0 = vaddq_u16(abs_row0, corr0);
+    abs_row1 = vaddq_u16(abs_row1, corr1);
+    abs_row2 = vaddq_u16(abs_row2, corr2);
+    abs_row3 = vaddq_u16(abs_row3, corr3);
+
+    /* Multiply DCT coefficients by quantization reciprocals. */
+    int32x4_t row0_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row0),
+                                                       vget_low_u16(recip0)));
+    int32x4_t row0_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row0),
+                                                       vget_high_u16(recip0)));
+    int32x4_t row1_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row1),
+                                                       vget_low_u16(recip1)));
+    int32x4_t row1_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row1),
+                                                       vget_high_u16(recip1)));
+    int32x4_t row2_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row2),
+                                                       vget_low_u16(recip2)));
+    int32x4_t row2_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row2),
+                                                       vget_high_u16(recip2)));
+    int32x4_t row3_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row3),
+                                                       vget_low_u16(recip3)));
+    int32x4_t row3_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row3),
+                                                       vget_high_u16(recip3)));
+    /* Narrow back to 16-bit. */
+    row0 = vcombine_s16(vshrn_n_s32(row0_l, 16), vshrn_n_s32(row0_h, 16));
+    row1 = vcombine_s16(vshrn_n_s32(row1_l, 16), vshrn_n_s32(row1_h, 16));
+    row2 = vcombine_s16(vshrn_n_s32(row2_l, 16), vshrn_n_s32(row2_h, 16));
+    row3 = vcombine_s16(vshrn_n_s32(row3_l, 16), vshrn_n_s32(row3_h, 16));
+
+    /* Since VSHR only supports an immediate as its second argument, negate the
+     * shift value and shift left.
+     */
+    row0 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row0),
+                                           vnegq_s16(shift0)));
+    row1 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row1),
+                                           vnegq_s16(shift1)));
+    row2 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row2),
+                                           vnegq_s16(shift2)));
+    row3 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row3),
+                                           vnegq_s16(shift3)));
+
+    /* Restore sign to original product. */
+    row0 = veorq_s16(row0, sign_row0);
+    row0 = vsubq_s16(row0, sign_row0);
+    row1 = veorq_s16(row1, sign_row1);
+    row1 = vsubq_s16(row1, sign_row1);
+    row2 = veorq_s16(row2, sign_row2);
+    row2 = vsubq_s16(row2, sign_row2);
+    row3 = veorq_s16(row3, sign_row3);
+    row3 = vsubq_s16(row3, sign_row3);
+
+    /* Store quantized coefficients to memory. */
+    vst1q_s16(out_ptr + (i + 0) * DCTSIZE, row0);
+    vst1q_s16(out_ptr + (i + 1) * DCTSIZE, row1);
+    vst1q_s16(out_ptr + (i + 2) * DCTSIZE, row2);
+    vst1q_s16(out_ptr + (i + 3) * DCTSIZE, row3);
+  }
+}
diff --git a/media/libjpeg/simd/arm/neon-compat.h b/media/libjpeg/simd/arm/neon-compat.h
new file mode 100644
index 0000000000..2907634e26
--- /dev/null
+++ b/media/libjpeg/simd/arm/neon-compat.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (C) 2020, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2020-2021, Arm Limited.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* Define compiler-independent count-leading-zeros and byte-swap macros */
+#if defined(_MSC_VER) && !defined(__clang__)
+#define BUILTIN_CLZ(x)  _CountLeadingZeros(x)
+#define BUILTIN_CLZLL(x)  _CountLeadingZeros64(x)
+#define BUILTIN_BSWAP64(x)  _byteswap_uint64(x)
+#elif defined(__clang__) || defined(__GNUC__)
+#define BUILTIN_CLZ(x)  __builtin_clz(x)
+#define BUILTIN_CLZLL(x)  __builtin_clzll(x)
+#define BUILTIN_BSWAP64(x)  __builtin_bswap64(x)
+#else
+#error "Unknown compiler"
+#endif
diff --git a/media/libjpeg/simd/i386/jccolext-avx2.asm b/media/libjpeg/simd/i386/jccolext-avx2.asm
new file mode 100644
index 0000000000..c46d684436
--- /dev/null
+++ b/media/libjpeg/simd/i386/jccolext-avx2.asm
@@ -0,0 +1,578 @@
+;
+; jccolext.asm - colorspace conversion (AVX2)
+;
+; Copyright (C) 2015, Intel Corporation.
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_rgb_ycc_convert_avx2(JDIMENSION img_width, JSAMPARRAY input_buf,
+;                            JSAMPIMAGE output_buf, JDIMENSION output_row,
+;                            int num_rows);
+;
+
+%define img_width(b)   (b) + 8          ; JDIMENSION img_width
+%define input_buf(b)   (b) + 12         ; JSAMPARRAY input_buf
+%define output_buf(b)  (b) + 16         ; JSAMPIMAGE output_buf
+%define output_row(b)  (b) + 20         ; JDIMENSION output_row
+%define num_rows(b)    (b) + 24         ; int num_rows
+
+%define original_ebp   ebp + 0
+%define wk(i)          ebp - (WK_NUM - (i)) * SIZEOF_YMMWORD
+                                        ; ymmword wk[WK_NUM]
+%define WK_NUM         8
+%define gotptr         wk(0) - SIZEOF_POINTER  ; void * gotptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_rgb_ycc_convert_avx2)
+
+EXTN(jsimd_rgb_ycc_convert_avx2):
+    push        ebp
+    mov         eax, esp                     ; eax = original ebp
+    sub         esp, byte 4
+    and         esp, byte (-SIZEOF_YMMWORD)  ; align to 256 bits
+    mov         [esp], eax
+    mov         ebp, esp                     ; ebp = aligned ebp
+    lea         esp, [wk(0)]
+    pushpic     eax                     ; make a room for GOT address
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+    movpic      POINTER [gotptr], ebx   ; save GOT address
+
+    mov         ecx, JDIMENSION [img_width(eax)]
+    test        ecx, ecx
+    jz          near .return
+
+    push        ecx
+
+    mov         esi, JSAMPIMAGE [output_buf(eax)]
+    mov         ecx, JDIMENSION [output_row(eax)]
+    mov         edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]
+    mov         ebx, JSAMPARRAY [esi+1*SIZEOF_JSAMPARRAY]
+    mov         edx, JSAMPARRAY [esi+2*SIZEOF_JSAMPARRAY]
+    lea         edi, [edi+ecx*SIZEOF_JSAMPROW]
+    lea         ebx, [ebx+ecx*SIZEOF_JSAMPROW]
+    lea         edx, [edx+ecx*SIZEOF_JSAMPROW]
+
+    pop         ecx
+
+    mov         esi, JSAMPARRAY [input_buf(eax)]
+    mov         eax, INT [num_rows(eax)]
+    test        eax, eax
+    jle         near .return
+    alignx      16, 7
+.rowloop:
+    pushpic     eax
+    push        edx
+    push        ebx
+    push        edi
+    push        esi
+    push        ecx                     ; col
+
+    mov         esi, JSAMPROW [esi]     ; inptr
+    mov         edi, JSAMPROW [edi]     ; outptr0
+    mov         ebx, JSAMPROW [ebx]     ; outptr1
+    mov         edx, JSAMPROW [edx]     ; outptr2
+    movpic      eax, POINTER [gotptr]   ; load GOT address (eax)
+
+    cmp         ecx, byte SIZEOF_YMMWORD
+    jae         near .columnloop
+    alignx      16, 7
+
+%if RGB_PIXELSIZE == 3  ; ---------------
+
+.column_ld1:
+    push        eax
+    push        edx
+    lea         ecx, [ecx+ecx*2]        ; imul ecx,RGB_PIXELSIZE
+    test        cl, SIZEOF_BYTE
+    jz          short .column_ld2
+    sub         ecx, byte SIZEOF_BYTE
+    movzx       eax, byte [esi+ecx]
+.column_ld2:
+    test        cl, SIZEOF_WORD
+    jz          short .column_ld4
+    sub         ecx, byte SIZEOF_WORD
+    movzx       edx, word [esi+ecx]
+    shl         eax, WORD_BIT
+    or          eax, edx
+.column_ld4:
+    vmovd       xmmA, eax
+    pop         edx
+    pop         eax
+    test        cl, SIZEOF_DWORD
+    jz          short .column_ld8
+    sub         ecx, byte SIZEOF_DWORD
+    vmovd       xmmF, XMM_DWORD [esi+ecx]
+    vpslldq     xmmA, xmmA, SIZEOF_DWORD
+    vpor        xmmA, xmmA, xmmF
+.column_ld8:
+    test        cl, SIZEOF_MMWORD
+    jz          short .column_ld16
+    sub         ecx, byte SIZEOF_MMWORD
+    vmovq       xmmB, XMM_MMWORD [esi+ecx]
+    vpslldq     xmmA, xmmA, SIZEOF_MMWORD
+    vpor        xmmA, xmmA, xmmB
+.column_ld16:
+    test        cl, SIZEOF_XMMWORD
+    jz          short .column_ld32
+    sub         ecx, byte SIZEOF_XMMWORD
+    vmovdqu     xmmB, XMM_MMWORD [esi+ecx]
+    vperm2i128  ymmA, ymmA, ymmA, 1
+    vpor        ymmA, ymmB
+.column_ld32:
+    test        cl, SIZEOF_YMMWORD
+    jz          short .column_ld64
+    sub         ecx, byte SIZEOF_YMMWORD
+    vmovdqa     ymmF, ymmA
+    vmovdqu     ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD]
+.column_ld64:
+    test        cl, 2*SIZEOF_YMMWORD
+    mov         ecx, SIZEOF_YMMWORD
+    jz          short .rgb_ycc_cnv
+    vmovdqa     ymmB, ymmA
+    vmovdqu     ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD]
+    vmovdqu     ymmF, YMMWORD [esi+1*SIZEOF_YMMWORD]
+    jmp         short .rgb_ycc_cnv
+    alignx      16, 7
+
+.columnloop:
+    vmovdqu     ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD]
+    vmovdqu     ymmF, YMMWORD [esi+1*SIZEOF_YMMWORD]
+    vmovdqu     ymmB, YMMWORD [esi+2*SIZEOF_YMMWORD]
+
+.rgb_ycc_cnv:
+    ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+    ;       15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+    ; ymmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+    ;       0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+    ; ymmB=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
+    ;       2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+
+    vmovdqu     ymmC, ymmA
+    vinserti128 ymmA, ymmF, xmmA, 0  ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+                                     ;       0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+    vinserti128 ymmC, ymmC, xmmB, 0  ; ymmC=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
+                                     ;       15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+    vinserti128 ymmB, ymmB, xmmF, 0  ; ymmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+                                     ;       2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+    vperm2i128  ymmF, ymmC, ymmC, 1  ; ymmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A
+                                     ;       1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q)
+
+    vmovdqa     ymmG, ymmA
+    vpslldq     ymmA, ymmA, 8     ; ymmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12
+                                  ;       22 03 13 23 04 14 24 05 0G 1G 2G 0H 1H 2H 0I 1I)
+    vpsrldq     ymmG, ymmG, 8     ; ymmG=(22 03 13 23 04 14 24 05 0G 1G 2G 0H 1H 2H 0I 1I
+                                  ;       2I 0J 1J 2J 0K 1K 2K 0L -- -- -- -- -- -- -- --)
+
+    vpunpckhbw  ymmA, ymmA, ymmF  ; ymmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A
+                                  ;       0G 0O 1G 1O 2G 2O 0H 0P 1H 1P 2H 2P 0I 0Q 1I 1Q)
+    vpslldq     ymmF, ymmF, 8     ; ymmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27
+                                  ;       08 18 28 09 19 29 0A 1A 1L 2L 0M 1M 2M 0N 1N 2N)
+
+    vpunpcklbw  ymmG, ymmG, ymmB  ; ymmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D
+                                  ;       2I 2Q 0J 0R 1J 1R 2J 2R 0K 0S 1K 1S 2K 2S 0L 0T)
+    vpunpckhbw  ymmF, ymmF, ymmB  ; ymmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F
+                                  ;       1L 1T 2L 2T 0M 0U 1M 1U 2M 2U 0N 0V 1N 1V 2N 2V)
+
+    vmovdqa     ymmD, ymmA
+    vpslldq     ymmA, ymmA, 8     ; ymmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09
+                                  ;       11 19 21 29 02 0A 12 1A 0G 0O 1G 1O 2G 2O 0H 0P)
+    vpsrldq     ymmD, ymmD, 8     ; ymmD=(11 19 21 29 02 0A 12 1A 0G 0O 1G 1O 2G 2O 0H 0P
+                                  ;       1H 1P 2H 2P 0I 0Q 1I 1Q -- -- -- -- -- -- -- --)
+
+    vpunpckhbw  ymmA, ymmA, ymmG  ; ymmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D
+                                  ;       0G 0K 0O 0S 1G 1K 1O 1S 2G 2K 2O 2S 0H 0L 0P 0T)
+    vpslldq     ymmG, ymmG, 8     ; ymmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B
+                                  ;       04 0C 14 1C 24 2C 05 0D 2I 2Q 0J 0R 1J 1R 2J 2R)
+
+    vpunpcklbw  ymmD, ymmD, ymmF  ; ymmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E
+                                  ;       1H 1L 1P 1T 2H 2L 2P 2T 0I 0M 0Q 0U 1I 1M 1Q 1U)
+    vpunpckhbw  ymmG, ymmG, ymmF  ; ymmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F
+                                  ;       2I 2M 2Q 2U 0J 0N 0R 0V 1J 1N 1R 1V 2J 2N 2R 2V)
+
+    vmovdqa     ymmE, ymmA
+    vpslldq     ymmA, ymmA, 8     ; ymmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C
+                                  ;       20 24 28 2C 01 05 09 0D 0G 0K 0O 0S 1G 1K 1O 1S)
+    vpsrldq     ymmE, ymmE, 8     ; ymmE=(20 24 28 2C 01 05 09 0D 0G 0K 0O 0S 1G 1K 1O 1S
+                                  ;       2G 2K 2O 2S 0H 0L 0P 0T -- -- -- -- -- -- -- --)
+
+    vpunpckhbw  ymmA, ymmA, ymmD  ; ymmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E
+                                  ;       0G 0I 0K 0M 0O 0Q 0S 0U 1G 1I 1K 1M 1O 1Q 1S 1U)
+    vpslldq     ymmD, ymmD, 8     ; ymmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D
+                                  ;       02 06 0A 0E 12 16 1A 1E 1H 1L 1P 1T 2H 2L 2P 2T)
+
+    vpunpcklbw  ymmE, ymmE, ymmG  ; ymmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F
+                                  ;       2G 2I 2K 2M 2O 2Q 2S 2U 0H 0J 0L 0N 0P 0R 0T 0V)
+    vpunpckhbw  ymmD, ymmD, ymmG  ; ymmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F
+                                  ;       1H 1J 1L 1N 1P 1R 1T 1V 2H 2J 2L 2N 2P 2R 2T 2V)
+
+    vpxor       ymmH, ymmH, ymmH
+
+    vmovdqa     ymmC, ymmA
+    vpunpcklbw  ymmA, ymmA, ymmH  ; ymmA=(00 02 04 06 08 0A 0C 0E 0G 0I 0K 0M 0O 0Q 0S 0U)
+    vpunpckhbw  ymmC, ymmC, ymmH  ; ymmC=(10 12 14 16 18 1A 1C 1E 1G 1I 1K 1M 1O 1Q 1S 1U)
+
+    vmovdqa     ymmB, ymmE
+    vpunpcklbw  ymmE, ymmE, ymmH  ; ymmE=(20 22 24 26 28 2A 2C 2E 2G 2I 2K 2M 2O 2Q 2S 2U)
+    vpunpckhbw  ymmB, ymmB, ymmH  ; ymmB=(01 03 05 07 09 0B 0D 0F 0H 0J 0L 0N 0P 0R 0T 0V)
+
+    vmovdqa     ymmF, ymmD
+    vpunpcklbw  ymmD, ymmD, ymmH  ; ymmD=(11 13 15 17 19 1B 1D 1F 1H 1J 1L 1N 1P 1R 1T 1V)
+    vpunpckhbw  ymmF, ymmF, ymmH  ; ymmF=(21 23 25 27 29 2B 2D 2F 2H 2J 2L 2N 2P 2R 2T 2V)
+
+%else  ; RGB_PIXELSIZE == 4 ; -----------
+
+.column_ld1:
+    test        cl, SIZEOF_XMMWORD/16
+    jz          short .column_ld2
+    sub         ecx, byte SIZEOF_XMMWORD/16
+    vmovd       xmmA, XMM_DWORD [esi+ecx*RGB_PIXELSIZE]
+.column_ld2:
+    test        cl, SIZEOF_XMMWORD/8
+    jz          short .column_ld4
+    sub         ecx, byte SIZEOF_XMMWORD/8
+    vmovq       xmmF, XMM_MMWORD [esi+ecx*RGB_PIXELSIZE]
+    vpslldq     xmmA, xmmA, SIZEOF_MMWORD
+    vpor        xmmA, xmmA, xmmF
+.column_ld4:
+    test        cl, SIZEOF_XMMWORD/4
+    jz          short .column_ld8
+    sub         ecx, byte SIZEOF_XMMWORD/4
+    vmovdqa     xmmF, xmmA
+    vperm2i128  ymmF, ymmF, ymmF, 1
+    vmovdqu     xmmA, XMMWORD [esi+ecx*RGB_PIXELSIZE]
+    vpor        ymmA, ymmA, ymmF
+.column_ld8:
+    test        cl, SIZEOF_XMMWORD/2
+    jz          short .column_ld16
+    sub         ecx, byte SIZEOF_XMMWORD/2
+    vmovdqa     ymmF, ymmA
+    vmovdqu     ymmA, YMMWORD [esi+ecx*RGB_PIXELSIZE]
+.column_ld16:
+    test        cl, SIZEOF_XMMWORD
+    mov         ecx, SIZEOF_YMMWORD
+    jz          short .rgb_ycc_cnv
+    vmovdqa     ymmE, ymmA
+    vmovdqa     ymmH, ymmF
+    vmovdqu     ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD]
+    vmovdqu     ymmF, YMMWORD [esi+1*SIZEOF_YMMWORD]
+    jmp         short .rgb_ycc_cnv
+    alignx      16, 7
+
+.columnloop:
+    vmovdqu     ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD]
+    vmovdqu     ymmF, YMMWORD [esi+1*SIZEOF_YMMWORD]
+    vmovdqu     ymmE, YMMWORD [esi+2*SIZEOF_YMMWORD]
+    vmovdqu     ymmH, YMMWORD [esi+3*SIZEOF_YMMWORD]
+
+.rgb_ycc_cnv:
+    ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+    ;       04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+    ; ymmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+    ;       0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+    ; ymmE=(0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J
+    ;       0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+    ; ymmH=(0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R
+    ;       0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+
+    vmovdqa     ymmB, ymmA
+    vinserti128 ymmA, ymmA, xmmE, 1     ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+                                        ;       0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J)
+    vperm2i128  ymmE, ymmB, ymmE, 0x31  ; ymmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+                                        ;       0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+
+    vmovdqa     ymmB, ymmF
+    vinserti128 ymmF, ymmF, xmmH, 1     ; ymmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+                                        ;       0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R)
+    vperm2i128  ymmH, ymmB, ymmH, 0x31  ; ymmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F
+                                        ;       0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+
+    vmovdqa     ymmD, ymmA
+    vpunpcklbw  ymmA, ymmA, ymmE      ; ymmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35
+                                      ;       0G 0K 1G 1K 2G 2K 3G 3K 0H 0L 1H 1L 2H 2L 3H 3L)
+    vpunpckhbw  ymmD, ymmD, ymmE      ; ymmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37
+                                      ;       0I 0M 1I 1M 2I 2M 3I 3M 0J 0N 1J 1N 2J 2N 3J 3N)
+
+    vmovdqa     ymmC, ymmF
+    vpunpcklbw  ymmF, ymmF, ymmH      ; ymmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D
+                                      ;       0O 0S 1O 1S 2O 2S 3O 3S 0P 0T 1P 1T 2P 2T 3P 3T)
+    vpunpckhbw  ymmC, ymmC, ymmH      ; ymmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F
+                                      ;       0Q 0U 1Q 1U 2Q 2U 3Q 3U 0R 0V 1R 1V 2R 2V 3R 3V)
+
+    vmovdqa     ymmB, ymmA
+    vpunpcklwd  ymmA, ymmA, ymmF      ; ymmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C
+                                      ;       0G 0K 0O 0S 1G 1K 1O 1S 2G 2K 2O 2S 3G 3K 3O 3S)
+    vpunpckhwd  ymmB, ymmB, ymmF      ; ymmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D
+                                      ;       0H 0L 0P 0T 1H 1L 1P 1T 2H 2L 2P 2T 3H 3L 3P 3T)
+
+    vmovdqa     ymmG, ymmD
+    vpunpcklwd  ymmD, ymmD, ymmC      ; ymmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E
+                                      ;       0I 0M 0Q 0U 1I 1M 1Q 1U 2I 2M 2Q 2U 3I 3M 3Q 3U)
+    vpunpckhwd  ymmG, ymmG, ymmC      ; ymmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F
+                                      ;       0J 0N 0R 0V 1J 1N 1R 1V 2J 2N 2R 2V 3J 3N 3R 3V)
+
+    vmovdqa     ymmE, ymmA
+    vpunpcklbw  ymmA, ymmA, ymmD      ; ymmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E
+                                      ;       0G 0I 0K 0M 0O 0Q 0S 0U 1G 1I 1K 1M 1O 1Q 1S 1U)
+    vpunpckhbw  ymmE, ymmE, ymmD      ; ymmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E
+                                      ;       2G 2I 2K 2M 2O 2Q 2S 2U 3G 3I 3K 3M 3O 3Q 3S 3U)
+
+    vmovdqa     ymmH, ymmB
+    vpunpcklbw  ymmB, ymmB, ymmG      ; ymmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F
+                                      ;       0H 0J 0L 0N 0P 0R 0T 0V 1H 1J 1L 1N 1P 1R 1T 1V)
+    vpunpckhbw  ymmH, ymmH, ymmG      ; ymmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F
+                                      ;       2H 2J 2L 2N 2P 2R 2T 2V 3H 3J 3L 3N 3P 3R 3T 3V)
+
+    vpxor       ymmF, ymmF, ymmF
+
+    vmovdqa     ymmC, ymmA
+    vpunpcklbw  ymmA, ymmA, ymmF      ; ymmA=(00 02 04 06 08 0A 0C 0E 0G 0I 0K 0M 0O 0Q 0S 0U)
+    vpunpckhbw  ymmC, ymmC, ymmF      ; ymmC=(10 12 14 16 18 1A 1C 1E 1G 1I 1K 1M 1O 1Q 1S 1U)
+
+    vmovdqa     ymmD, ymmB
+    vpunpcklbw  ymmB, ymmB, ymmF      ; ymmB=(01 03 05 07 09 0B 0D 0F 0H 0J 0L 0N 0P 0R 0T 0V)
+    vpunpckhbw  ymmD, ymmD, ymmF      ; ymmD=(11 13 15 17 19 1B 1D 1F 1H 1J 1L 1N 1P 1R 1T 1V)
+
+    vmovdqa     ymmG, ymmE
+    vpunpcklbw  ymmE, ymmE, ymmF      ; ymmE=(20 22 24 26 28 2A 2C 2E 2G 2I 2K 2M 2O 2Q 2S 2U)
+    vpunpckhbw  ymmG, ymmG, ymmF      ; ymmG=(30 32 34 36 38 3A 3C 3E 3G 3I 3K 3M 3O 3Q 3S 3U)
+
+    vpunpcklbw  ymmF, ymmF, ymmH
+    vpunpckhbw  ymmH, ymmH, ymmH
+    vpsrlw      ymmF, ymmF, BYTE_BIT  ; ymmF=(21 23 25 27 29 2B 2D 2F 2H 2J 2L 2N 2P 2R 2T 2V)
+    vpsrlw      ymmH, ymmH, BYTE_BIT  ; ymmH=(31 33 35 37 39 3B 3D 3F 3H 3J 3L 3N 3P 3R 3T 3V)
+
+%endif  ; RGB_PIXELSIZE ; ---------------
+
+    ; ymm0=R(02468ACEGIKMOQSU)=RE, ymm2=G(02468ACEGIKMOQSU)=GE, ymm4=B(02468ACEGIKMOQSU)=BE
+    ; ymm1=R(13579BDFHJLNPRTV)=RO, ymm3=G(13579BDFHJLNPRTV)=GO, ymm5=B(13579BDFHJLNPRTV)=BO
+
+    ; (Original)
+    ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
+    ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+    ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+    ;
+    ; (This implementation)
+    ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+    ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+    ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+
+    vmovdqa     YMMWORD [wk(0)], ymm0   ; wk(0)=RE
+    vmovdqa     YMMWORD [wk(1)], ymm1   ; wk(1)=RO
+    vmovdqa     YMMWORD [wk(2)], ymm4   ; wk(2)=BE
+    vmovdqa     YMMWORD [wk(3)], ymm5   ; wk(3)=BO
+
+    vmovdqa     ymm6, ymm1
+    vpunpcklwd  ymm1, ymm1, ymm3
+    vpunpckhwd  ymm6, ymm6, ymm3
+    vmovdqa     ymm7, ymm1
+    vmovdqa     ymm4, ymm6
+    vpmaddwd    ymm1, ymm1, [GOTOFF(eax,PW_F0299_F0337)]  ; ymm1=ROL*FIX(0.299)+GOL*FIX(0.337)
+    vpmaddwd    ymm6, ymm6, [GOTOFF(eax,PW_F0299_F0337)]  ; ymm6=ROH*FIX(0.299)+GOH*FIX(0.337)
+    vpmaddwd    ymm7, ymm7, [GOTOFF(eax,PW_MF016_MF033)]  ; ymm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
+    vpmaddwd    ymm4, ymm4, [GOTOFF(eax,PW_MF016_MF033)]  ; ymm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
+
+    vmovdqa     YMMWORD [wk(4)], ymm1   ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
+    vmovdqa     YMMWORD [wk(5)], ymm6   ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+    vpxor       ymm1, ymm1, ymm1
+    vpxor       ymm6, ymm6, ymm6
+    vpunpcklwd  ymm1, ymm1, ymm5        ; ymm1=BOL
+    vpunpckhwd  ymm6, ymm6, ymm5        ; ymm6=BOH
+    vpsrld      ymm1, ymm1, 1           ; ymm1=BOL*FIX(0.500)
+    vpsrld      ymm6, ymm6, 1           ; ymm6=BOH*FIX(0.500)
+
+    vmovdqa     ymm5, [GOTOFF(eax,PD_ONEHALFM1_CJ)]  ; ymm5=[PD_ONEHALFM1_CJ]
+
+    vpaddd      ymm7, ymm7, ymm1
+    vpaddd      ymm4, ymm4, ymm6
+    vpaddd      ymm7, ymm7, ymm5
+    vpaddd      ymm4, ymm4, ymm5
+    vpsrld      ymm7, ymm7, SCALEBITS   ; ymm7=CbOL
+    vpsrld      ymm4, ymm4, SCALEBITS   ; ymm4=CbOH
+    vpackssdw   ymm7, ymm7, ymm4        ; ymm7=CbO
+
+    vmovdqa     ymm1, YMMWORD [wk(2)]   ; ymm1=BE
+
+    vmovdqa     ymm6, ymm0
+    vpunpcklwd  ymm0, ymm0, ymm2
+    vpunpckhwd  ymm6, ymm6, ymm2
+    vmovdqa     ymm5, ymm0
+    vmovdqa     ymm4, ymm6
+    vpmaddwd    ymm0, ymm0, [GOTOFF(eax,PW_F0299_F0337)]  ; ymm0=REL*FIX(0.299)+GEL*FIX(0.337)
+    vpmaddwd    ymm6, ymm6, [GOTOFF(eax,PW_F0299_F0337)]  ; ymm6=REH*FIX(0.299)+GEH*FIX(0.337)
+    vpmaddwd    ymm5, ymm5, [GOTOFF(eax,PW_MF016_MF033)]  ; ymm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
+    vpmaddwd    ymm4, ymm4, [GOTOFF(eax,PW_MF016_MF033)]  ; ymm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
+
+    vmovdqa     YMMWORD [wk(6)], ymm0   ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
+    vmovdqa     YMMWORD [wk(7)], ymm6   ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
+
+    vpxor       ymm0, ymm0, ymm0
+    vpxor       ymm6, ymm6, ymm6
+    vpunpcklwd  ymm0, ymm0, ymm1        ; ymm0=BEL
+    vpunpckhwd  ymm6, ymm6, ymm1        ; ymm6=BEH
+    vpsrld      ymm0, ymm0, 1           ; ymm0=BEL*FIX(0.500)
+    vpsrld      ymm6, ymm6, 1           ; ymm6=BEH*FIX(0.500)
+
+    vmovdqa     ymm1, [GOTOFF(eax,PD_ONEHALFM1_CJ)]  ; ymm1=[PD_ONEHALFM1_CJ]
+
+    vpaddd      ymm5, ymm5, ymm0
+    vpaddd      ymm4, ymm4, ymm6
+    vpaddd      ymm5, ymm5, ymm1
+    vpaddd      ymm4, ymm4, ymm1
+    vpsrld      ymm5, ymm5, SCALEBITS   ; ymm5=CbEL
+    vpsrld      ymm4, ymm4, SCALEBITS   ; ymm4=CbEH
+    vpackssdw   ymm5, ymm5, ymm4        ; ymm5=CbE
+
+    vpsllw      ymm7, ymm7, BYTE_BIT
+    vpor        ymm5, ymm5, ymm7        ; ymm5=Cb
+    vmovdqu     YMMWORD [ebx], ymm5     ; Save Cb
+
+    vmovdqa     ymm0, YMMWORD [wk(3)]   ; ymm0=BO
+    vmovdqa     ymm6, YMMWORD [wk(2)]   ; ymm6=BE
+    vmovdqa     ymm1, YMMWORD [wk(1)]   ; ymm1=RO
+
+    vmovdqa     ymm4, ymm0
+    vpunpcklwd  ymm0, ymm0, ymm3
+    vpunpckhwd  ymm4, ymm4, ymm3
+    vmovdqa     ymm7, ymm0
+    vmovdqa     ymm5, ymm4
+    vpmaddwd    ymm0, ymm0, [GOTOFF(eax,PW_F0114_F0250)]  ; ymm0=BOL*FIX(0.114)+GOL*FIX(0.250)
+    vpmaddwd    ymm4, ymm4, [GOTOFF(eax,PW_F0114_F0250)]  ; ymm4=BOH*FIX(0.114)+GOH*FIX(0.250)
+    vpmaddwd    ymm7, ymm7, [GOTOFF(eax,PW_MF008_MF041)]  ; ymm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
+    vpmaddwd    ymm5, ymm5, [GOTOFF(eax,PW_MF008_MF041)]  ; ymm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
+
+    vmovdqa     ymm3, [GOTOFF(eax,PD_ONEHALF)]            ; ymm3=[PD_ONEHALF]
+
+    vpaddd      ymm0, ymm0, YMMWORD [wk(4)]
+    vpaddd      ymm4, ymm4, YMMWORD [wk(5)]
+    vpaddd      ymm0, ymm0, ymm3
+    vpaddd      ymm4, ymm4, ymm3
+    vpsrld      ymm0, ymm0, SCALEBITS   ; ymm0=YOL
+    vpsrld      ymm4, ymm4, SCALEBITS   ; ymm4=YOH
+    vpackssdw   ymm0, ymm0, ymm4        ; ymm0=YO
+
+    vpxor       ymm3, ymm3, ymm3
+    vpxor       ymm4, ymm4, ymm4
+    vpunpcklwd  ymm3, ymm3, ymm1        ; ymm3=ROL
+    vpunpckhwd  ymm4, ymm4, ymm1        ; ymm4=ROH
+    vpsrld      ymm3, ymm3, 1           ; ymm3=ROL*FIX(0.500)
+    vpsrld      ymm4, ymm4, 1           ; ymm4=ROH*FIX(0.500)
+
+    vmovdqa     ymm1, [GOTOFF(eax,PD_ONEHALFM1_CJ)]  ; ymm1=[PD_ONEHALFM1_CJ]
+
+    vpaddd      ymm7, ymm7, ymm3
+    vpaddd      ymm5, ymm5, ymm4
+    vpaddd      ymm7, ymm7, ymm1
+    vpaddd      ymm5, ymm5, ymm1
+    vpsrld      ymm7, ymm7, SCALEBITS   ; ymm7=CrOL
+    vpsrld      ymm5, ymm5, SCALEBITS   ; ymm5=CrOH
+    vpackssdw   ymm7, ymm7, ymm5        ; ymm7=CrO
+
+    vmovdqa     ymm3, YMMWORD [wk(0)]   ; ymm3=RE
+
+    vmovdqa     ymm4, ymm6
+    vpunpcklwd  ymm6, ymm6, ymm2
+    vpunpckhwd  ymm4, ymm4, ymm2
+    vmovdqa     ymm1, ymm6
+    vmovdqa     ymm5, ymm4
+    vpmaddwd    ymm6, ymm6, [GOTOFF(eax,PW_F0114_F0250)]  ; ymm6=BEL*FIX(0.114)+GEL*FIX(0.250)
+    vpmaddwd    ymm4, ymm4, [GOTOFF(eax,PW_F0114_F0250)]  ; ymm4=BEH*FIX(0.114)+GEH*FIX(0.250)
+    vpmaddwd    ymm1, ymm1, [GOTOFF(eax,PW_MF008_MF041)]  ; ymm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
+    vpmaddwd    ymm5, ymm5, [GOTOFF(eax,PW_MF008_MF041)]  ; ymm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
+
+    vmovdqa     ymm2, [GOTOFF(eax,PD_ONEHALF)]            ; ymm2=[PD_ONEHALF]
+
+    vpaddd      ymm6, ymm6, YMMWORD [wk(6)]
+    vpaddd      ymm4, ymm4, YMMWORD [wk(7)]
+    vpaddd      ymm6, ymm6, ymm2
+    vpaddd      ymm4, ymm4, ymm2
+    vpsrld      ymm6, ymm6, SCALEBITS   ; ymm6=YEL
+    vpsrld      ymm4, ymm4, SCALEBITS   ; ymm4=YEH
+    vpackssdw   ymm6, ymm6, ymm4        ; ymm6=YE
+
+    vpsllw      ymm0, ymm0, BYTE_BIT
+    vpor        ymm6, ymm6, ymm0        ; ymm6=Y
+    vmovdqu     YMMWORD [edi], ymm6     ; Save Y
+
+    vpxor       ymm2, ymm2, ymm2
+    vpxor       ymm4, ymm4, ymm4
+    vpunpcklwd  ymm2, ymm2, ymm3        ; ymm2=REL
+    vpunpckhwd  ymm4, ymm4, ymm3        ; ymm4=REH
+    vpsrld      ymm2, ymm2, 1           ; ymm2=REL*FIX(0.500)
+    vpsrld      ymm4, ymm4, 1           ; ymm4=REH*FIX(0.500)
+
+    vmovdqa     ymm0, [GOTOFF(eax,PD_ONEHALFM1_CJ)]  ; ymm0=[PD_ONEHALFM1_CJ]
+
+    vpaddd      ymm1, ymm1, ymm2
+    vpaddd      ymm5, ymm5, ymm4
+    vpaddd      ymm1, ymm1, ymm0
+    vpaddd      ymm5, ymm5, ymm0
+    vpsrld      ymm1, ymm1, SCALEBITS   ; ymm1=CrEL
+    vpsrld      ymm5, ymm5, SCALEBITS   ; ymm5=CrEH
+    vpackssdw   ymm1, ymm1, ymm5        ; ymm1=CrE
+
+    vpsllw      ymm7, ymm7, BYTE_BIT
+    vpor        ymm1, ymm1, ymm7        ; ymm1=Cr
+    vmovdqu     YMMWORD [edx], ymm1     ; Save Cr
+
+    sub         ecx, byte SIZEOF_YMMWORD
+    add         esi, RGB_PIXELSIZE*SIZEOF_YMMWORD  ; inptr
+    add         edi, byte SIZEOF_YMMWORD           ; outptr0
+    add         ebx, byte SIZEOF_YMMWORD           ; outptr1
+    add         edx, byte SIZEOF_YMMWORD           ; outptr2
+    cmp         ecx, byte SIZEOF_YMMWORD
+    jae         near .columnloop
+    test        ecx, ecx
+    jnz         near .column_ld1
+
+    pop         ecx                     ; col
+    pop         esi
+    pop         edi
+    pop         ebx
+    pop         edx
+    poppic      eax
+
+    add         esi, byte SIZEOF_JSAMPROW  ; input_buf
+    add         edi, byte SIZEOF_JSAMPROW
+    add         ebx, byte SIZEOF_JSAMPROW
+    add         edx, byte SIZEOF_JSAMPROW
+    dec         eax                        ; num_rows
+    jg          near .rowloop
+
+.return:
+    vzeroupper
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/i386/jccolext-mmx.asm b/media/libjpeg/simd/i386/jccolext-mmx.asm
new file mode 100644
index 0000000000..6357a42b2c
--- /dev/null
+++ b/media/libjpeg/simd/i386/jccolext-mmx.asm
@@ -0,0 +1,476 @@
+;
+; jccolext.asm - colorspace conversion (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_rgb_ycc_convert_mmx(JDIMENSION img_width, JSAMPARRAY input_buf,
+;                           JSAMPIMAGE output_buf, JDIMENSION output_row,
+;                           int num_rows);
+;
+
+%define img_width(b)   (b) + 8          ; JDIMENSION img_width
+%define input_buf(b)   (b) + 12         ; JSAMPARRAY input_buf
+%define output_buf(b)  (b) + 16         ; JSAMPIMAGE output_buf
+%define output_row(b)  (b) + 20         ; JDIMENSION output_row
+%define num_rows(b)    (b) + 24         ; int num_rows
+
+%define original_ebp   ebp + 0
+%define wk(i)          ebp - (WK_NUM - (i)) * SIZEOF_MMWORD
+                                        ; mmword wk[WK_NUM]
+%define WK_NUM         8
+%define gotptr         wk(0) - SIZEOF_POINTER  ; void * gotptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_rgb_ycc_convert_mmx)
+
+EXTN(jsimd_rgb_ycc_convert_mmx):
+    push        ebp
+    mov         eax, esp                    ; eax = original ebp
+    sub         esp, byte 4
+    and         esp, byte (-SIZEOF_MMWORD)  ; align to 64 bits
+    mov         [esp], eax
+    mov         ebp, esp                    ; ebp = aligned ebp
+    lea         esp, [wk(0)]
+    pushpic     eax                     ; make a room for GOT address
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+    movpic      POINTER [gotptr], ebx   ; save GOT address
+
+    mov         ecx, JDIMENSION [img_width(eax)]  ; num_cols
+    test        ecx, ecx
+    jz          near .return
+
+    push        ecx
+
+    mov         esi, JSAMPIMAGE [output_buf(eax)]
+    mov         ecx, JDIMENSION [output_row(eax)]
+    mov         edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]
+    mov         ebx, JSAMPARRAY [esi+1*SIZEOF_JSAMPARRAY]
+    mov         edx, JSAMPARRAY [esi+2*SIZEOF_JSAMPARRAY]
+    lea         edi, [edi+ecx*SIZEOF_JSAMPROW]
+    lea         ebx, [ebx+ecx*SIZEOF_JSAMPROW]
+    lea         edx, [edx+ecx*SIZEOF_JSAMPROW]
+
+    pop         ecx
+
+    mov         esi, JSAMPARRAY [input_buf(eax)]
+    mov         eax, INT [num_rows(eax)]
+    test        eax, eax
+    jle         near .return
+    alignx      16, 7
+.rowloop:
+    pushpic     eax
+    push        edx
+    push        ebx
+    push        edi
+    push        esi
+    push        ecx                     ; col
+
+    mov         esi, JSAMPROW [esi]     ; inptr
+    mov         edi, JSAMPROW [edi]     ; outptr0
+    mov         ebx, JSAMPROW [ebx]     ; outptr1
+    mov         edx, JSAMPROW [edx]     ; outptr2
+    movpic      eax, POINTER [gotptr]   ; load GOT address (eax)
+
+    cmp         ecx, byte SIZEOF_MMWORD
+    jae         short .columnloop
+    alignx      16, 7
+
+%if RGB_PIXELSIZE == 3  ; ---------------
+
+.column_ld1:
+    push        eax
+    push        edx
+    lea         ecx, [ecx+ecx*2]        ; imul ecx,RGB_PIXELSIZE
+    test        cl, SIZEOF_BYTE
+    jz          short .column_ld2
+    sub         ecx, byte SIZEOF_BYTE
+    xor         eax, eax
+    mov         al, byte [esi+ecx]
+.column_ld2:
+    test        cl, SIZEOF_WORD
+    jz          short .column_ld4
+    sub         ecx, byte SIZEOF_WORD
+    xor         edx, edx
+    mov         dx, word [esi+ecx]
+    shl         eax, WORD_BIT
+    or          eax, edx
+.column_ld4:
+    movd        mmA, eax
+    pop         edx
+    pop         eax
+    test        cl, SIZEOF_DWORD
+    jz          short .column_ld8
+    sub         ecx, byte SIZEOF_DWORD
+    movd        mmG, dword [esi+ecx]
+    psllq       mmA, DWORD_BIT
+    por         mmA, mmG
+.column_ld8:
+    test        cl, SIZEOF_MMWORD
+    jz          short .column_ld16
+    movq        mmG, mmA
+    movq        mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+    mov         ecx, SIZEOF_MMWORD
+    jmp         short .rgb_ycc_cnv
+.column_ld16:
+    test        cl, 2*SIZEOF_MMWORD
+    mov         ecx, SIZEOF_MMWORD
+    jz          short .rgb_ycc_cnv
+    movq        mmF, mmA
+    movq        mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+    movq        mmG, MMWORD [esi+1*SIZEOF_MMWORD]
+    jmp         short .rgb_ycc_cnv
+    alignx      16, 7
+
+.columnloop:
+    movq        mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+    movq        mmG, MMWORD [esi+1*SIZEOF_MMWORD]
+    movq        mmF, MMWORD [esi+2*SIZEOF_MMWORD]
+
+.rgb_ycc_cnv:
+    ; mmA=(00 10 20 01 11 21 02 12)
+    ; mmG=(22 03 13 23 04 14 24 05)
+    ; mmF=(15 25 06 16 26 07 17 27)
+
+    movq        mmD, mmA
+    psllq       mmA, 4*BYTE_BIT         ; mmA=(-- -- -- -- 00 10 20 01)
+    psrlq       mmD, 4*BYTE_BIT         ; mmD=(11 21 02 12 -- -- -- --)
+
+    punpckhbw   mmA, mmG                ; mmA=(00 04 10 14 20 24 01 05)
+    psllq       mmG, 4*BYTE_BIT         ; mmG=(-- -- -- -- 22 03 13 23)
+
+    punpcklbw   mmD, mmF                ; mmD=(11 15 21 25 02 06 12 16)
+    punpckhbw   mmG, mmF                ; mmG=(22 26 03 07 13 17 23 27)
+
+    movq        mmE, mmA
+    psllq       mmA, 4*BYTE_BIT         ; mmA=(-- -- -- -- 00 04 10 14)
+    psrlq       mmE, 4*BYTE_BIT         ; mmE=(20 24 01 05 -- -- -- --)
+
+    punpckhbw   mmA, mmD                ; mmA=(00 02 04 06 10 12 14 16)
+    psllq       mmD, 4*BYTE_BIT         ; mmD=(-- -- -- -- 11 15 21 25)
+
+    punpcklbw   mmE, mmG                ; mmE=(20 22 24 26 01 03 05 07)
+    punpckhbw   mmD, mmG                ; mmD=(11 13 15 17 21 23 25 27)
+
+    pxor        mmH, mmH
+
+    movq        mmC, mmA
+    punpcklbw   mmA, mmH                ; mmA=(00 02 04 06)
+    punpckhbw   mmC, mmH                ; mmC=(10 12 14 16)
+
+    movq        mmB, mmE
+    punpcklbw   mmE, mmH                ; mmE=(20 22 24 26)
+    punpckhbw   mmB, mmH                ; mmB=(01 03 05 07)
+
+    movq        mmF, mmD
+    punpcklbw   mmD, mmH                ; mmD=(11 13 15 17)
+    punpckhbw   mmF, mmH                ; mmF=(21 23 25 27)
+
+%else  ; RGB_PIXELSIZE == 4 ; -----------
+
+.column_ld1:
+    test        cl, SIZEOF_MMWORD/8
+    jz          short .column_ld2
+    sub         ecx, byte SIZEOF_MMWORD/8
+    movd        mmA, dword [esi+ecx*RGB_PIXELSIZE]
+.column_ld2:
+    test        cl, SIZEOF_MMWORD/4
+    jz          short .column_ld4
+    sub         ecx, byte SIZEOF_MMWORD/4
+    movq        mmF, mmA
+    movq        mmA, MMWORD [esi+ecx*RGB_PIXELSIZE]
+.column_ld4:
+    test        cl, SIZEOF_MMWORD/2
+    mov         ecx, SIZEOF_MMWORD
+    jz          short .rgb_ycc_cnv
+    movq        mmD, mmA
+    movq        mmC, mmF
+    movq        mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+    movq        mmF, MMWORD [esi+1*SIZEOF_MMWORD]
+    jmp         short .rgb_ycc_cnv
+    alignx      16, 7
+
+.columnloop:
+    movq        mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+    movq        mmF, MMWORD [esi+1*SIZEOF_MMWORD]
+    movq        mmD, MMWORD [esi+2*SIZEOF_MMWORD]
+    movq        mmC, MMWORD [esi+3*SIZEOF_MMWORD]
+
+.rgb_ycc_cnv:
+    ; mmA=(00 10 20 30 01 11 21 31)
+    ; mmF=(02 12 22 32 03 13 23 33)
+    ; mmD=(04 14 24 34 05 15 25 35)
+    ; mmC=(06 16 26 36 07 17 27 37)
+
+    movq        mmB, mmA
+    punpcklbw   mmA, mmF                ; mmA=(00 02 10 12 20 22 30 32)
+    punpckhbw   mmB, mmF                ; mmB=(01 03 11 13 21 23 31 33)
+
+    movq        mmG, mmD
+    punpcklbw   mmD, mmC                ; mmD=(04 06 14 16 24 26 34 36)
+    punpckhbw   mmG, mmC                ; mmG=(05 07 15 17 25 27 35 37)
+
+    movq        mmE, mmA
+    punpcklwd   mmA, mmD                ; mmA=(00 02 04 06 10 12 14 16)
+    punpckhwd   mmE, mmD                ; mmE=(20 22 24 26 30 32 34 36)
+
+    movq        mmH, mmB
+    punpcklwd   mmB, mmG                ; mmB=(01 03 05 07 11 13 15 17)
+    punpckhwd   mmH, mmG                ; mmH=(21 23 25 27 31 33 35 37)
+
+    pxor        mmF, mmF
+
+    movq        mmC, mmA
+    punpcklbw   mmA, mmF                ; mmA=(00 02 04 06)
+    punpckhbw   mmC, mmF                ; mmC=(10 12 14 16)
+
+    movq        mmD, mmB
+    punpcklbw   mmB, mmF                ; mmB=(01 03 05 07)
+    punpckhbw   mmD, mmF                ; mmD=(11 13 15 17)
+
+    movq        mmG, mmE
+    punpcklbw   mmE, mmF                ; mmE=(20 22 24 26)
+    punpckhbw   mmG, mmF                ; mmG=(30 32 34 36)
+
+    punpcklbw   mmF, mmH
+    punpckhbw   mmH, mmH
+    psrlw       mmF, BYTE_BIT           ; mmF=(21 23 25 27)
+    psrlw       mmH, BYTE_BIT           ; mmH=(31 33 35 37)
+
+%endif  ; RGB_PIXELSIZE ; ---------------
+
+    ; mm0=(R0 R2 R4 R6)=RE, mm2=(G0 G2 G4 G6)=GE, mm4=(B0 B2 B4 B6)=BE
+    ; mm1=(R1 R3 R5 R7)=RO, mm3=(G1 G3 G5 G7)=GO, mm5=(B1 B3 B5 B7)=BO
+
+    ; (Original)
+    ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
+    ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+    ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+    ;
+    ; (This implementation)
+    ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+    ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+    ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+
+    movq        MMWORD [wk(0)], mm0     ; wk(0)=RE
+    movq        MMWORD [wk(1)], mm1     ; wk(1)=RO
+    movq        MMWORD [wk(2)], mm4     ; wk(2)=BE
+    movq        MMWORD [wk(3)], mm5     ; wk(3)=BO
+
+    movq        mm6, mm1
+    punpcklwd   mm1, mm3
+    punpckhwd   mm6, mm3
+    movq        mm7, mm1
+    movq        mm4, mm6
+    pmaddwd     mm1, [GOTOFF(eax,PW_F0299_F0337)]  ; mm1=ROL*FIX(0.299)+GOL*FIX(0.337)
+    pmaddwd     mm6, [GOTOFF(eax,PW_F0299_F0337)]  ; mm6=ROH*FIX(0.299)+GOH*FIX(0.337)
+    pmaddwd     mm7, [GOTOFF(eax,PW_MF016_MF033)]  ; mm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
+    pmaddwd     mm4, [GOTOFF(eax,PW_MF016_MF033)]  ; mm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
+
+    movq        MMWORD [wk(4)], mm1     ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
+    movq        MMWORD [wk(5)], mm6     ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+    pxor        mm1, mm1
+    pxor        mm6, mm6
+    punpcklwd   mm1, mm5                ; mm1=BOL
+    punpckhwd   mm6, mm5                ; mm6=BOH
+    psrld       mm1, 1                  ; mm1=BOL*FIX(0.500)
+    psrld       mm6, 1                  ; mm6=BOH*FIX(0.500)
+
+    movq        mm5, [GOTOFF(eax,PD_ONEHALFM1_CJ)]  ; mm5=[PD_ONEHALFM1_CJ]
+
+    paddd       mm7, mm1
+    paddd       mm4, mm6
+    paddd       mm7, mm5
+    paddd       mm4, mm5
+    psrld       mm7, SCALEBITS          ; mm7=CbOL
+    psrld       mm4, SCALEBITS          ; mm4=CbOH
+    packssdw    mm7, mm4                ; mm7=CbO
+
+    movq        mm1, MMWORD [wk(2)]     ; mm1=BE
+
+    movq        mm6, mm0
+    punpcklwd   mm0, mm2
+    punpckhwd   mm6, mm2
+    movq        mm5, mm0
+    movq        mm4, mm6
+    pmaddwd     mm0, [GOTOFF(eax,PW_F0299_F0337)]  ; mm0=REL*FIX(0.299)+GEL*FIX(0.337)
+    pmaddwd     mm6, [GOTOFF(eax,PW_F0299_F0337)]  ; mm6=REH*FIX(0.299)+GEH*FIX(0.337)
+    pmaddwd     mm5, [GOTOFF(eax,PW_MF016_MF033)]  ; mm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
+    pmaddwd     mm4, [GOTOFF(eax,PW_MF016_MF033)]  ; mm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
+
+    movq        MMWORD [wk(6)], mm0     ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
+    movq        MMWORD [wk(7)], mm6     ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
+
+    pxor        mm0, mm0
+    pxor        mm6, mm6
+    punpcklwd   mm0, mm1                ; mm0=BEL
+    punpckhwd   mm6, mm1                ; mm6=BEH
+    psrld       mm0, 1                  ; mm0=BEL*FIX(0.500)
+    psrld       mm6, 1                  ; mm6=BEH*FIX(0.500)
+
+    movq        mm1, [GOTOFF(eax,PD_ONEHALFM1_CJ)]  ; mm1=[PD_ONEHALFM1_CJ]
+
+    paddd       mm5, mm0
+    paddd       mm4, mm6
+    paddd       mm5, mm1
+    paddd       mm4, mm1
+    psrld       mm5, SCALEBITS          ; mm5=CbEL
+    psrld       mm4, SCALEBITS          ; mm4=CbEH
+    packssdw    mm5, mm4                ; mm5=CbE
+
+    psllw       mm7, BYTE_BIT
+    por         mm5, mm7                ; mm5=Cb
+    movq        MMWORD [ebx], mm5       ; Save Cb
+
+    movq        mm0, MMWORD [wk(3)]     ; mm0=BO
+    movq        mm6, MMWORD [wk(2)]     ; mm6=BE
+    movq        mm1, MMWORD [wk(1)]     ; mm1=RO
+
+    movq        mm4, mm0
+    punpcklwd   mm0, mm3
+    punpckhwd   mm4, mm3
+    movq        mm7, mm0
+    movq        mm5, mm4
+    pmaddwd     mm0, [GOTOFF(eax,PW_F0114_F0250)]  ; mm0=BOL*FIX(0.114)+GOL*FIX(0.250)
+    pmaddwd     mm4, [GOTOFF(eax,PW_F0114_F0250)]  ; mm4=BOH*FIX(0.114)+GOH*FIX(0.250)
+    pmaddwd     mm7, [GOTOFF(eax,PW_MF008_MF041)]  ; mm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
+    pmaddwd     mm5, [GOTOFF(eax,PW_MF008_MF041)]  ; mm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
+
+    movq        mm3, [GOTOFF(eax,PD_ONEHALF)]  ; mm3=[PD_ONEHALF]
+
+    paddd       mm0, MMWORD [wk(4)]
+    paddd       mm4, MMWORD [wk(5)]
+    paddd       mm0, mm3
+    paddd       mm4, mm3
+    psrld       mm0, SCALEBITS          ; mm0=YOL
+    psrld       mm4, SCALEBITS          ; mm4=YOH
+    packssdw    mm0, mm4                ; mm0=YO
+
+    pxor        mm3, mm3
+    pxor        mm4, mm4
+    punpcklwd   mm3, mm1                ; mm3=ROL
+    punpckhwd   mm4, mm1                ; mm4=ROH
+    psrld       mm3, 1                  ; mm3=ROL*FIX(0.500)
+    psrld       mm4, 1                  ; mm4=ROH*FIX(0.500)
+
+    movq        mm1, [GOTOFF(eax,PD_ONEHALFM1_CJ)]  ; mm1=[PD_ONEHALFM1_CJ]
+
+    paddd       mm7, mm3
+    paddd       mm5, mm4
+    paddd       mm7, mm1
+    paddd       mm5, mm1
+    psrld       mm7, SCALEBITS          ; mm7=CrOL
+    psrld       mm5, SCALEBITS          ; mm5=CrOH
+    packssdw    mm7, mm5                ; mm7=CrO
+
+    movq        mm3, MMWORD [wk(0)]     ; mm3=RE
+
+    movq        mm4, mm6
+    punpcklwd   mm6, mm2
+    punpckhwd   mm4, mm2
+    movq        mm1, mm6
+    movq        mm5, mm4
+    pmaddwd     mm6, [GOTOFF(eax,PW_F0114_F0250)]  ; mm6=BEL*FIX(0.114)+GEL*FIX(0.250)
+    pmaddwd     mm4, [GOTOFF(eax,PW_F0114_F0250)]  ; mm4=BEH*FIX(0.114)+GEH*FIX(0.250)
+    pmaddwd     mm1, [GOTOFF(eax,PW_MF008_MF041)]  ; mm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
+    pmaddwd     mm5, [GOTOFF(eax,PW_MF008_MF041)]  ; mm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
+
+    movq        mm2, [GOTOFF(eax,PD_ONEHALF)]      ; mm2=[PD_ONEHALF]
+
+    paddd       mm6, MMWORD [wk(6)]
+    paddd       mm4, MMWORD [wk(7)]
+    paddd       mm6, mm2
+    paddd       mm4, mm2
+    psrld       mm6, SCALEBITS          ; mm6=YEL
+    psrld       mm4, SCALEBITS          ; mm4=YEH
+    packssdw    mm6, mm4                ; mm6=YE
+
+    psllw       mm0, BYTE_BIT
+    por         mm6, mm0                ; mm6=Y
+    movq        MMWORD [edi], mm6       ; Save Y
+
+    pxor        mm2, mm2
+    pxor        mm4, mm4
+    punpcklwd   mm2, mm3                ; mm2=REL
+    punpckhwd   mm4, mm3                ; mm4=REH
+    psrld       mm2, 1                  ; mm2=REL*FIX(0.500)
+    psrld       mm4, 1                  ; mm4=REH*FIX(0.500)
+
+    movq        mm0, [GOTOFF(eax,PD_ONEHALFM1_CJ)]  ; mm0=[PD_ONEHALFM1_CJ]
+
+    paddd       mm1, mm2
+    paddd       mm5, mm4
+    paddd       mm1, mm0
+    paddd       mm5, mm0
+    psrld       mm1, SCALEBITS          ; mm1=CrEL
+    psrld       mm5, SCALEBITS          ; mm5=CrEH
+    packssdw    mm1, mm5                ; mm1=CrE
+
+    psllw       mm7, BYTE_BIT
+    por         mm1, mm7                ; mm1=Cr
+    movq        MMWORD [edx], mm1       ; Save Cr
+
+    sub         ecx, byte SIZEOF_MMWORD
+    add         esi, byte RGB_PIXELSIZE*SIZEOF_MMWORD  ; inptr
+    add         edi, byte SIZEOF_MMWORD                ; outptr0
+    add         ebx, byte SIZEOF_MMWORD                ; outptr1
+    add         edx, byte SIZEOF_MMWORD                ; outptr2
+    cmp         ecx, byte SIZEOF_MMWORD
+    jae         near .columnloop
+    test        ecx, ecx
+    jnz         near .column_ld1
+
+    pop         ecx                     ; col
+    pop         esi
+    pop         edi
+    pop         ebx
+    pop         edx
+    poppic      eax
+
+    add         esi, byte SIZEOF_JSAMPROW  ; input_buf
+    add         edi, byte SIZEOF_JSAMPROW
+    add         ebx, byte SIZEOF_JSAMPROW
+    add         edx, byte SIZEOF_JSAMPROW
+    dec         eax                        ; num_rows
+    jg          near .rowloop
+
+    emms                                ; empty MMX state
+
+.return:
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/i386/jccolext-sse2.asm b/media/libjpeg/simd/i386/jccolext-sse2.asm
new file mode 100644
index 0000000000..c6c80852ac
--- /dev/null
+++ b/media/libjpeg/simd/i386/jccolext-sse2.asm
@@ -0,0 +1,503 @@
+;
+; jccolext.asm - colorspace conversion (SSE2)
+;
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_rgb_ycc_convert_sse2(JDIMENSION img_width, JSAMPARRAY input_buf,
+;                            JSAMPIMAGE output_buf, JDIMENSION output_row,
+;                            int num_rows);
+;
+
+%define img_width(b)   (b) + 8          ; JDIMENSION img_width
+%define input_buf(b)   (b) + 12         ; JSAMPARRAY input_buf
+%define output_buf(b)  (b) + 16         ; JSAMPIMAGE output_buf
+%define output_row(b)  (b) + 20         ; JDIMENSION output_row
+%define num_rows(b)    (b) + 24         ; int num_rows
+
+%define original_ebp   ebp + 0
+%define wk(i)          ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+                                        ; xmmword wk[WK_NUM]
+%define WK_NUM         8
+%define gotptr         wk(0) - SIZEOF_POINTER  ; void * gotptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_rgb_ycc_convert_sse2)
+
+EXTN(jsimd_rgb_ycc_convert_sse2):
+    push        ebp
+    mov         eax, esp                     ; eax = original ebp
+    sub         esp, byte 4
+    and         esp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [esp], eax
+    mov         ebp, esp                     ; ebp = aligned ebp
+    lea         esp, [wk(0)]
+    pushpic     eax                     ; make a room for GOT address
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+    movpic      POINTER [gotptr], ebx   ; save GOT address
+
+    mov         ecx, JDIMENSION [img_width(eax)]
+    test        ecx, ecx
+    jz          near .return
+
+    push        ecx
+
+    mov         esi, JSAMPIMAGE [output_buf(eax)]
+    mov         ecx, JDIMENSION [output_row(eax)]
+    mov         edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]
+    mov         ebx, JSAMPARRAY [esi+1*SIZEOF_JSAMPARRAY]
+    mov         edx, JSAMPARRAY [esi+2*SIZEOF_JSAMPARRAY]
+    lea         edi, [edi+ecx*SIZEOF_JSAMPROW]
+    lea         ebx, [ebx+ecx*SIZEOF_JSAMPROW]
+    lea         edx, [edx+ecx*SIZEOF_JSAMPROW]
+
+    pop         ecx
+
+    mov         esi, JSAMPARRAY [input_buf(eax)]
+    mov         eax, INT [num_rows(eax)]
+    test        eax, eax
+    jle         near .return
+    alignx      16, 7
+.rowloop:
+    pushpic     eax
+    push        edx
+    push        ebx
+    push        edi
+    push        esi
+    push        ecx                     ; col
+
+    mov         esi, JSAMPROW [esi]     ; inptr
+    mov         edi, JSAMPROW [edi]     ; outptr0
+    mov         ebx, JSAMPROW [ebx]     ; outptr1
+    mov         edx, JSAMPROW [edx]     ; outptr2
+    movpic      eax, POINTER [gotptr]   ; load GOT address (eax)
+
+    cmp         ecx, byte SIZEOF_XMMWORD
+    jae         near .columnloop
+    alignx      16, 7
+
+%if RGB_PIXELSIZE == 3  ; ---------------
+
+.column_ld1:
+    push        eax
+    push        edx
+    lea         ecx, [ecx+ecx*2]        ; imul ecx,RGB_PIXELSIZE
+    test        cl, SIZEOF_BYTE
+    jz          short .column_ld2
+    sub         ecx, byte SIZEOF_BYTE
+    movzx       eax, byte [esi+ecx]
+.column_ld2:
+    test        cl, SIZEOF_WORD
+    jz          short .column_ld4
+    sub         ecx, byte SIZEOF_WORD
+    movzx       edx, word [esi+ecx]
+    shl         eax, WORD_BIT
+    or          eax, edx
+.column_ld4:
+    movd        xmmA, eax
+    pop         edx
+    pop         eax
+    test        cl, SIZEOF_DWORD
+    jz          short .column_ld8
+    sub         ecx, byte SIZEOF_DWORD
+    movd        xmmF, XMM_DWORD [esi+ecx]
+    pslldq      xmmA, SIZEOF_DWORD
+    por         xmmA, xmmF
+.column_ld8:
+    test        cl, SIZEOF_MMWORD
+    jz          short .column_ld16
+    sub         ecx, byte SIZEOF_MMWORD
+    movq        xmmB, XMM_MMWORD [esi+ecx]
+    pslldq      xmmA, SIZEOF_MMWORD
+    por         xmmA, xmmB
+.column_ld16:
+    test        cl, SIZEOF_XMMWORD
+    jz          short .column_ld32
+    movdqa      xmmF, xmmA
+    movdqu      xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+    mov         ecx, SIZEOF_XMMWORD
+    jmp         short .rgb_ycc_cnv
+.column_ld32:
+    test        cl, 2*SIZEOF_XMMWORD
+    mov         ecx, SIZEOF_XMMWORD
+    jz          short .rgb_ycc_cnv
+    movdqa      xmmB, xmmA
+    movdqu      xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+    movdqu      xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
+    jmp         short .rgb_ycc_cnv
+    alignx      16, 7
+
+.columnloop:
+    movdqu      xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+    movdqu      xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
+    movdqu      xmmB, XMMWORD [esi+2*SIZEOF_XMMWORD]
+
+.rgb_ycc_cnv:
+    ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
+    ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+    ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
+
+    movdqa      xmmG, xmmA
+    pslldq      xmmA, 8     ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
+    psrldq      xmmG, 8     ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
+
+    punpckhbw   xmmA, xmmF  ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
+    pslldq      xmmF, 8     ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
+
+    punpcklbw   xmmG, xmmB  ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
+    punpckhbw   xmmF, xmmB  ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
+
+    movdqa      xmmD, xmmA
+    pslldq      xmmA, 8     ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
+    psrldq      xmmD, 8     ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
+
+    punpckhbw   xmmA, xmmG  ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
+    pslldq      xmmG, 8     ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
+
+    punpcklbw   xmmD, xmmF  ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
+    punpckhbw   xmmG, xmmF  ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
+
+    movdqa      xmmE, xmmA
+    pslldq      xmmA, 8     ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
+    psrldq      xmmE, 8     ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
+
+    punpckhbw   xmmA, xmmD  ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
+    pslldq      xmmD, 8     ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
+
+    punpcklbw   xmmE, xmmG  ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
+    punpckhbw   xmmD, xmmG  ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
+
+    pxor        xmmH, xmmH
+
+    movdqa      xmmC, xmmA
+    punpcklbw   xmmA, xmmH  ; xmmA=(00 02 04 06 08 0A 0C 0E)
+    punpckhbw   xmmC, xmmH  ; xmmC=(10 12 14 16 18 1A 1C 1E)
+
+    movdqa      xmmB, xmmE
+    punpcklbw   xmmE, xmmH  ; xmmE=(20 22 24 26 28 2A 2C 2E)
+    punpckhbw   xmmB, xmmH  ; xmmB=(01 03 05 07 09 0B 0D 0F)
+
+    movdqa      xmmF, xmmD
+    punpcklbw   xmmD, xmmH  ; xmmD=(11 13 15 17 19 1B 1D 1F)
+    punpckhbw   xmmF, xmmH  ; xmmF=(21 23 25 27 29 2B 2D 2F)
+
+%else  ; RGB_PIXELSIZE == 4 ; -----------
+
+.column_ld1:
+    test        cl, SIZEOF_XMMWORD/16
+    jz          short .column_ld2
+    sub         ecx, byte SIZEOF_XMMWORD/16
+    movd        xmmA, XMM_DWORD [esi+ecx*RGB_PIXELSIZE]
+.column_ld2:
+    test        cl, SIZEOF_XMMWORD/8
+    jz          short .column_ld4
+    sub         ecx, byte SIZEOF_XMMWORD/8
+    movq        xmmE, XMM_MMWORD [esi+ecx*RGB_PIXELSIZE]
+    pslldq      xmmA, SIZEOF_MMWORD
+    por         xmmA, xmmE
+.column_ld4:
+    test        cl, SIZEOF_XMMWORD/4
+    jz          short .column_ld8
+    sub         ecx, byte SIZEOF_XMMWORD/4
+    movdqa      xmmE, xmmA
+    movdqu      xmmA, XMMWORD [esi+ecx*RGB_PIXELSIZE]
+.column_ld8:
+    test        cl, SIZEOF_XMMWORD/2
+    mov         ecx, SIZEOF_XMMWORD
+    jz          short .rgb_ycc_cnv
+    movdqa      xmmF, xmmA
+    movdqa      xmmH, xmmE
+    movdqu      xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+    movdqu      xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
+    jmp         short .rgb_ycc_cnv
+    alignx      16, 7
+
+.columnloop:
+    movdqu      xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+    movdqu      xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
+    movdqu      xmmF, XMMWORD [esi+2*SIZEOF_XMMWORD]
+    movdqu      xmmH, XMMWORD [esi+3*SIZEOF_XMMWORD]
+
+.rgb_ycc_cnv:
+    ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
+    ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+    ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
+    ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+
+    movdqa      xmmD, xmmA
+    punpcklbw   xmmA, xmmE      ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
+    punpckhbw   xmmD, xmmE      ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
+
+    movdqa      xmmC, xmmF
+    punpcklbw   xmmF, xmmH      ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
+    punpckhbw   xmmC, xmmH      ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
+
+    movdqa      xmmB, xmmA
+    punpcklwd   xmmA, xmmF      ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
+    punpckhwd   xmmB, xmmF      ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
+
+    movdqa      xmmG, xmmD
+    punpcklwd   xmmD, xmmC      ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
+    punpckhwd   xmmG, xmmC      ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
+
+    movdqa      xmmE, xmmA
+    punpcklbw   xmmA, xmmD      ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
+    punpckhbw   xmmE, xmmD      ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
+
+    movdqa      xmmH, xmmB
+    punpcklbw   xmmB, xmmG      ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
+    punpckhbw   xmmH, xmmG      ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
+
+    pxor        xmmF, xmmF
+
+    movdqa      xmmC, xmmA
+    punpcklbw   xmmA, xmmF      ; xmmA=(00 02 04 06 08 0A 0C 0E)
+    punpckhbw   xmmC, xmmF      ; xmmC=(10 12 14 16 18 1A 1C 1E)
+
+    movdqa      xmmD, xmmB
+    punpcklbw   xmmB, xmmF      ; xmmB=(01 03 05 07 09 0B 0D 0F)
+    punpckhbw   xmmD, xmmF      ; xmmD=(11 13 15 17 19 1B 1D 1F)
+
+    movdqa      xmmG, xmmE
+    punpcklbw   xmmE, xmmF      ; xmmE=(20 22 24 26 28 2A 2C 2E)
+    punpckhbw   xmmG, xmmF      ; xmmG=(30 32 34 36 38 3A 3C 3E)
+
+    punpcklbw   xmmF, xmmH
+    punpckhbw   xmmH, xmmH
+    psrlw       xmmF, BYTE_BIT  ; xmmF=(21 23 25 27 29 2B 2D 2F)
+    psrlw       xmmH, BYTE_BIT  ; xmmH=(31 33 35 37 39 3B 3D 3F)
+
+%endif  ; RGB_PIXELSIZE ; ---------------
+
+    ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
+    ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
+
+    ; (Original)
+    ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
+    ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+    ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+    ;
+    ; (This implementation)
+    ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+    ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+    ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+
+    movdqa      XMMWORD [wk(0)], xmm0   ; wk(0)=RE
+    movdqa      XMMWORD [wk(1)], xmm1   ; wk(1)=RO
+    movdqa      XMMWORD [wk(2)], xmm4   ; wk(2)=BE
+    movdqa      XMMWORD [wk(3)], xmm5   ; wk(3)=BO
+
+    movdqa      xmm6, xmm1
+    punpcklwd   xmm1, xmm3
+    punpckhwd   xmm6, xmm3
+    movdqa      xmm7, xmm1
+    movdqa      xmm4, xmm6
+    pmaddwd     xmm1, [GOTOFF(eax,PW_F0299_F0337)]  ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
+    pmaddwd     xmm6, [GOTOFF(eax,PW_F0299_F0337)]  ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
+    pmaddwd     xmm7, [GOTOFF(eax,PW_MF016_MF033)]  ; xmm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
+    pmaddwd     xmm4, [GOTOFF(eax,PW_MF016_MF033)]  ; xmm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
+
+    movdqa      XMMWORD [wk(4)], xmm1   ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
+    movdqa      XMMWORD [wk(5)], xmm6   ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+    pxor        xmm1, xmm1
+    pxor        xmm6, xmm6
+    punpcklwd   xmm1, xmm5              ; xmm1=BOL
+    punpckhwd   xmm6, xmm5              ; xmm6=BOH
+    psrld       xmm1, 1                 ; xmm1=BOL*FIX(0.500)
+    psrld       xmm6, 1                 ; xmm6=BOH*FIX(0.500)
+
+    movdqa      xmm5, [GOTOFF(eax,PD_ONEHALFM1_CJ)]  ; xmm5=[PD_ONEHALFM1_CJ]
+
+    paddd       xmm7, xmm1
+    paddd       xmm4, xmm6
+    paddd       xmm7, xmm5
+    paddd       xmm4, xmm5
+    psrld       xmm7, SCALEBITS         ; xmm7=CbOL
+    psrld       xmm4, SCALEBITS         ; xmm4=CbOH
+    packssdw    xmm7, xmm4              ; xmm7=CbO
+
+    movdqa      xmm1, XMMWORD [wk(2)]   ; xmm1=BE
+
+    movdqa      xmm6, xmm0
+    punpcklwd   xmm0, xmm2
+    punpckhwd   xmm6, xmm2
+    movdqa      xmm5, xmm0
+    movdqa      xmm4, xmm6
+    pmaddwd     xmm0, [GOTOFF(eax,PW_F0299_F0337)]  ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
+    pmaddwd     xmm6, [GOTOFF(eax,PW_F0299_F0337)]  ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
+    pmaddwd     xmm5, [GOTOFF(eax,PW_MF016_MF033)]  ; xmm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
+    pmaddwd     xmm4, [GOTOFF(eax,PW_MF016_MF033)]  ; xmm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
+
+    movdqa      XMMWORD [wk(6)], xmm0   ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
+    movdqa      XMMWORD [wk(7)], xmm6   ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
+
+    pxor        xmm0, xmm0
+    pxor        xmm6, xmm6
+    punpcklwd   xmm0, xmm1              ; xmm0=BEL
+    punpckhwd   xmm6, xmm1              ; xmm6=BEH
+    psrld       xmm0, 1                 ; xmm0=BEL*FIX(0.500)
+    psrld       xmm6, 1                 ; xmm6=BEH*FIX(0.500)
+
+    movdqa      xmm1, [GOTOFF(eax,PD_ONEHALFM1_CJ)]  ; xmm1=[PD_ONEHALFM1_CJ]
+
+    paddd       xmm5, xmm0
+    paddd       xmm4, xmm6
+    paddd       xmm5, xmm1
+    paddd       xmm4, xmm1
+    psrld       xmm5, SCALEBITS         ; xmm5=CbEL
+    psrld       xmm4, SCALEBITS         ; xmm4=CbEH
+    packssdw    xmm5, xmm4              ; xmm5=CbE
+
+    psllw       xmm7, BYTE_BIT
+    por         xmm5, xmm7              ; xmm5=Cb
+    movdqa      XMMWORD [ebx], xmm5     ; Save Cb
+
+    movdqa      xmm0, XMMWORD [wk(3)]   ; xmm0=BO
+    movdqa      xmm6, XMMWORD [wk(2)]   ; xmm6=BE
+    movdqa      xmm1, XMMWORD [wk(1)]   ; xmm1=RO
+
+    movdqa      xmm4, xmm0
+    punpcklwd   xmm0, xmm3
+    punpckhwd   xmm4, xmm3
+    movdqa      xmm7, xmm0
+    movdqa      xmm5, xmm4
+    pmaddwd     xmm0, [GOTOFF(eax,PW_F0114_F0250)]  ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
+    pmaddwd     xmm4, [GOTOFF(eax,PW_F0114_F0250)]  ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
+    pmaddwd     xmm7, [GOTOFF(eax,PW_MF008_MF041)]  ; xmm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
+    pmaddwd     xmm5, [GOTOFF(eax,PW_MF008_MF041)]  ; xmm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
+
+    movdqa      xmm3, [GOTOFF(eax,PD_ONEHALF)]  ; xmm3=[PD_ONEHALF]
+
+    paddd       xmm0, XMMWORD [wk(4)]
+    paddd       xmm4, XMMWORD [wk(5)]
+    paddd       xmm0, xmm3
+    paddd       xmm4, xmm3
+    psrld       xmm0, SCALEBITS         ; xmm0=YOL
+    psrld       xmm4, SCALEBITS         ; xmm4=YOH
+    packssdw    xmm0, xmm4              ; xmm0=YO
+
+    pxor        xmm3, xmm3
+    pxor        xmm4, xmm4
+    punpcklwd   xmm3, xmm1              ; xmm3=ROL
+    punpckhwd   xmm4, xmm1              ; xmm4=ROH
+    psrld       xmm3, 1                 ; xmm3=ROL*FIX(0.500)
+    psrld       xmm4, 1                 ; xmm4=ROH*FIX(0.500)
+
+    movdqa      xmm1, [GOTOFF(eax,PD_ONEHALFM1_CJ)]  ; xmm1=[PD_ONEHALFM1_CJ]
+
+    paddd       xmm7, xmm3
+    paddd       xmm5, xmm4
+    paddd       xmm7, xmm1
+    paddd       xmm5, xmm1
+    psrld       xmm7, SCALEBITS         ; xmm7=CrOL
+    psrld       xmm5, SCALEBITS         ; xmm5=CrOH
+    packssdw    xmm7, xmm5              ; xmm7=CrO
+
+    movdqa      xmm3, XMMWORD [wk(0)]   ; xmm3=RE
+
+    movdqa      xmm4, xmm6
+    punpcklwd   xmm6, xmm2
+    punpckhwd   xmm4, xmm2
+    movdqa      xmm1, xmm6
+    movdqa      xmm5, xmm4
+    pmaddwd     xmm6, [GOTOFF(eax,PW_F0114_F0250)]  ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
+    pmaddwd     xmm4, [GOTOFF(eax,PW_F0114_F0250)]  ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
+    pmaddwd     xmm1, [GOTOFF(eax,PW_MF008_MF041)]  ; xmm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
+    pmaddwd     xmm5, [GOTOFF(eax,PW_MF008_MF041)]  ; xmm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
+
+    movdqa      xmm2, [GOTOFF(eax,PD_ONEHALF)]      ; xmm2=[PD_ONEHALF]
+
+    paddd       xmm6, XMMWORD [wk(6)]
+    paddd       xmm4, XMMWORD [wk(7)]
+    paddd       xmm6, xmm2
+    paddd       xmm4, xmm2
+    psrld       xmm6, SCALEBITS         ; xmm6=YEL
+    psrld       xmm4, SCALEBITS         ; xmm4=YEH
+    packssdw    xmm6, xmm4              ; xmm6=YE
+
+    psllw       xmm0, BYTE_BIT
+    por         xmm6, xmm0              ; xmm6=Y
+    movdqa      XMMWORD [edi], xmm6     ; Save Y
+
+    pxor        xmm2, xmm2
+    pxor        xmm4, xmm4
+    punpcklwd   xmm2, xmm3              ; xmm2=REL
+    punpckhwd   xmm4, xmm3              ; xmm4=REH
+    psrld       xmm2, 1                 ; xmm2=REL*FIX(0.500)
+    psrld       xmm4, 1                 ; xmm4=REH*FIX(0.500)
+
+    movdqa      xmm0, [GOTOFF(eax,PD_ONEHALFM1_CJ)]  ; xmm0=[PD_ONEHALFM1_CJ]
+
+    paddd       xmm1, xmm2
+    paddd       xmm5, xmm4
+    paddd       xmm1, xmm0
+    paddd       xmm5, xmm0
+    psrld       xmm1, SCALEBITS         ; xmm1=CrEL
+    psrld       xmm5, SCALEBITS         ; xmm5=CrEH
+    packssdw    xmm1, xmm5              ; xmm1=CrE
+
+    psllw       xmm7, BYTE_BIT
+    por         xmm1, xmm7              ; xmm1=Cr
+    movdqa      XMMWORD [edx], xmm1     ; Save Cr
+
+    sub         ecx, byte SIZEOF_XMMWORD
+    add         esi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; inptr
+    add         edi, byte SIZEOF_XMMWORD                ; outptr0
+    add         ebx, byte SIZEOF_XMMWORD                ; outptr1
+    add         edx, byte SIZEOF_XMMWORD                ; outptr2
+    cmp         ecx, byte SIZEOF_XMMWORD
+    jae         near .columnloop
+    test        ecx, ecx
+    jnz         near .column_ld1
+
+    pop         ecx                     ; col
+    pop         esi
+    pop         edi
+    pop         ebx
+    pop         edx
+    poppic      eax
+
+    add         esi, byte SIZEOF_JSAMPROW  ; input_buf
+    add         edi, byte SIZEOF_JSAMPROW
+    add         ebx, byte SIZEOF_JSAMPROW
+    add         edx, byte SIZEOF_JSAMPROW
+    dec         eax                        ; num_rows
+    jg          near .rowloop
+
+.return:
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/i386/jccolor-avx2.asm b/media/libjpeg/simd/i386/jccolor-avx2.asm
new file mode 100644
index 0000000000..14944e952f
--- /dev/null
+++ b/media/libjpeg/simd/i386/jccolor-avx2.asm
@@ -0,0 +1,121 @@
+;
+; jccolor.asm - colorspace conversion (AVX2)
+;
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS  16
+
+F_0_081 equ  5329                ; FIX(0.08131)
+F_0_114 equ  7471                ; FIX(0.11400)
+F_0_168 equ 11059                ; FIX(0.16874)
+F_0_250 equ 16384                ; FIX(0.25000)
+F_0_299 equ 19595                ; FIX(0.29900)
+F_0_331 equ 21709                ; FIX(0.33126)
+F_0_418 equ 27439                ; FIX(0.41869)
+F_0_587 equ 38470                ; FIX(0.58700)
+F_0_337 equ (F_0_587 - F_0_250)  ; FIX(0.58700) - FIX(0.25000)
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_rgb_ycc_convert_avx2)
+
+EXTN(jconst_rgb_ycc_convert_avx2):
+
+PW_F0299_F0337  times 8 dw  F_0_299,  F_0_337
+PW_F0114_F0250  times 8 dw  F_0_114,  F_0_250
+PW_MF016_MF033  times 8 dw -F_0_168, -F_0_331
+PW_MF008_MF041  times 8 dw -F_0_081, -F_0_418
+PD_ONEHALFM1_CJ times 8 dd  (1 << (SCALEBITS - 1)) - 1 + \
+                            (CENTERJSAMPLE << SCALEBITS)
+PD_ONEHALF      times 8 dd  (1 << (SCALEBITS - 1))
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+
+%include "jccolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGB_RED
+%define RGB_GREEN  EXT_RGB_GREEN
+%define RGB_BLUE  EXT_RGB_BLUE
+%define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+%define jsimd_rgb_ycc_convert_avx2  jsimd_extrgb_ycc_convert_avx2
+%include "jccolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGBX_RED
+%define RGB_GREEN  EXT_RGBX_GREEN
+%define RGB_BLUE  EXT_RGBX_BLUE
+%define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+%define jsimd_rgb_ycc_convert_avx2  jsimd_extrgbx_ycc_convert_avx2
+%include "jccolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGR_RED
+%define RGB_GREEN  EXT_BGR_GREEN
+%define RGB_BLUE  EXT_BGR_BLUE
+%define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+%define jsimd_rgb_ycc_convert_avx2  jsimd_extbgr_ycc_convert_avx2
+%include "jccolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGRX_RED
+%define RGB_GREEN  EXT_BGRX_GREEN
+%define RGB_BLUE  EXT_BGRX_BLUE
+%define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+%define jsimd_rgb_ycc_convert_avx2  jsimd_extbgrx_ycc_convert_avx2
+%include "jccolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XBGR_RED
+%define RGB_GREEN  EXT_XBGR_GREEN
+%define RGB_BLUE  EXT_XBGR_BLUE
+%define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+%define jsimd_rgb_ycc_convert_avx2  jsimd_extxbgr_ycc_convert_avx2
+%include "jccolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XRGB_RED
+%define RGB_GREEN  EXT_XRGB_GREEN
+%define RGB_BLUE  EXT_XRGB_BLUE
+%define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+%define jsimd_rgb_ycc_convert_avx2  jsimd_extxrgb_ycc_convert_avx2
+%include "jccolext-avx2.asm"
diff --git a/media/libjpeg/simd/i386/jccolor-mmx.asm b/media/libjpeg/simd/i386/jccolor-mmx.asm
new file mode 100644
index 0000000000..8cb399bdc4
--- /dev/null
+++ b/media/libjpeg/simd/i386/jccolor-mmx.asm
@@ -0,0 +1,121 @@
+;
+; jccolor.asm - colorspace conversion (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS  16
+
+F_0_081 equ  5329                ; FIX(0.08131)
+F_0_114 equ  7471                ; FIX(0.11400)
+F_0_168 equ 11059                ; FIX(0.16874)
+F_0_250 equ 16384                ; FIX(0.25000)
+F_0_299 equ 19595                ; FIX(0.29900)
+F_0_331 equ 21709                ; FIX(0.33126)
+F_0_418 equ 27439                ; FIX(0.41869)
+F_0_587 equ 38470                ; FIX(0.58700)
+F_0_337 equ (F_0_587 - F_0_250)  ; FIX(0.58700) - FIX(0.25000)
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_rgb_ycc_convert_mmx)
+
+EXTN(jconst_rgb_ycc_convert_mmx):
+
+PW_F0299_F0337  times 2 dw  F_0_299,  F_0_337
+PW_F0114_F0250  times 2 dw  F_0_114,  F_0_250
+PW_MF016_MF033  times 2 dw -F_0_168, -F_0_331
+PW_MF008_MF041  times 2 dw -F_0_081, -F_0_418
+PD_ONEHALFM1_CJ times 2 dd  (1 << (SCALEBITS - 1)) - 1 + \
+                            (CENTERJSAMPLE << SCALEBITS)
+PD_ONEHALF      times 2 dd  (1 << (SCALEBITS - 1))
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+
+%include "jccolext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGB_RED
+%define RGB_GREEN  EXT_RGB_GREEN
+%define RGB_BLUE  EXT_RGB_BLUE
+%define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+%define jsimd_rgb_ycc_convert_mmx  jsimd_extrgb_ycc_convert_mmx
+%include "jccolext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGBX_RED
+%define RGB_GREEN  EXT_RGBX_GREEN
+%define RGB_BLUE  EXT_RGBX_BLUE
+%define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+%define jsimd_rgb_ycc_convert_mmx  jsimd_extrgbx_ycc_convert_mmx
+%include "jccolext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGR_RED
+%define RGB_GREEN  EXT_BGR_GREEN
+%define RGB_BLUE  EXT_BGR_BLUE
+%define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+%define jsimd_rgb_ycc_convert_mmx  jsimd_extbgr_ycc_convert_mmx
+%include "jccolext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGRX_RED
+%define RGB_GREEN  EXT_BGRX_GREEN
+%define RGB_BLUE  EXT_BGRX_BLUE
+%define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+%define jsimd_rgb_ycc_convert_mmx  jsimd_extbgrx_ycc_convert_mmx
+%include "jccolext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XBGR_RED
+%define RGB_GREEN  EXT_XBGR_GREEN
+%define RGB_BLUE  EXT_XBGR_BLUE
+%define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+%define jsimd_rgb_ycc_convert_mmx  jsimd_extxbgr_ycc_convert_mmx
+%include "jccolext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XRGB_RED
+%define RGB_GREEN  EXT_XRGB_GREEN
+%define RGB_BLUE  EXT_XRGB_BLUE
+%define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+%define jsimd_rgb_ycc_convert_mmx  jsimd_extxrgb_ycc_convert_mmx
+%include "jccolext-mmx.asm"
diff --git a/media/libjpeg/simd/i386/jccolor-sse2.asm b/media/libjpeg/simd/i386/jccolor-sse2.asm
new file mode 100644
index 0000000000..686d222ff7
--- /dev/null
+++ b/media/libjpeg/simd/i386/jccolor-sse2.asm
@@ -0,0 +1,120 @@
+;
+; jccolor.asm - colorspace conversion (SSE2)
+;
+; Copyright (C) 2009, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS  16
+
+F_0_081 equ  5329                ; FIX(0.08131)
+F_0_114 equ  7471                ; FIX(0.11400)
+F_0_168 equ 11059                ; FIX(0.16874)
+F_0_250 equ 16384                ; FIX(0.25000)
+F_0_299 equ 19595                ; FIX(0.29900)
+F_0_331 equ 21709                ; FIX(0.33126)
+F_0_418 equ 27439                ; FIX(0.41869)
+F_0_587 equ 38470                ; FIX(0.58700)
+F_0_337 equ (F_0_587 - F_0_250)  ; FIX(0.58700) - FIX(0.25000)
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_rgb_ycc_convert_sse2)
+
+EXTN(jconst_rgb_ycc_convert_sse2):
+
+PW_F0299_F0337  times 4 dw  F_0_299,  F_0_337
+PW_F0114_F0250  times 4 dw  F_0_114,  F_0_250
+PW_MF016_MF033  times 4 dw -F_0_168, -F_0_331
+PW_MF008_MF041  times 4 dw -F_0_081, -F_0_418
+PD_ONEHALFM1_CJ times 4 dd  (1 << (SCALEBITS - 1)) - 1 + \
+                            (CENTERJSAMPLE << SCALEBITS)
+PD_ONEHALF      times 4 dd  (1 << (SCALEBITS - 1))
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+
+%include "jccolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGB_RED
+%define RGB_GREEN  EXT_RGB_GREEN
+%define RGB_BLUE  EXT_RGB_BLUE
+%define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+%define jsimd_rgb_ycc_convert_sse2  jsimd_extrgb_ycc_convert_sse2
+%include "jccolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGBX_RED
+%define RGB_GREEN  EXT_RGBX_GREEN
+%define RGB_BLUE  EXT_RGBX_BLUE
+%define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+%define jsimd_rgb_ycc_convert_sse2  jsimd_extrgbx_ycc_convert_sse2
+%include "jccolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGR_RED
+%define RGB_GREEN  EXT_BGR_GREEN
+%define RGB_BLUE  EXT_BGR_BLUE
+%define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+%define jsimd_rgb_ycc_convert_sse2  jsimd_extbgr_ycc_convert_sse2
+%include "jccolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGRX_RED
+%define RGB_GREEN  EXT_BGRX_GREEN
+%define RGB_BLUE  EXT_BGRX_BLUE
+%define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+%define jsimd_rgb_ycc_convert_sse2  jsimd_extbgrx_ycc_convert_sse2
+%include "jccolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XBGR_RED
+%define RGB_GREEN  EXT_XBGR_GREEN
+%define RGB_BLUE  EXT_XBGR_BLUE
+%define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+%define jsimd_rgb_ycc_convert_sse2  jsimd_extxbgr_ycc_convert_sse2
+%include "jccolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XRGB_RED
+%define RGB_GREEN  EXT_XRGB_GREEN
+%define RGB_BLUE  EXT_XRGB_BLUE
+%define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+%define jsimd_rgb_ycc_convert_sse2  jsimd_extxrgb_ycc_convert_sse2
+%include "jccolext-sse2.asm"
diff --git a/media/libjpeg/simd/i386/jcgray-avx2.asm b/media/libjpeg/simd/i386/jcgray-avx2.asm
new file mode 100644
index 0000000000..560ee0c71e
--- /dev/null
+++ b/media/libjpeg/simd/i386/jcgray-avx2.asm
@@ -0,0 +1,113 @@
+;
+; jcgray.asm - grayscale colorspace conversion (AVX2)
+;
+; Copyright (C) 2011, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS  16
+
+F_0_114 equ  7471                ; FIX(0.11400)
+F_0_250 equ 16384                ; FIX(0.25000)
+F_0_299 equ 19595                ; FIX(0.29900)
+F_0_587 equ 38470                ; FIX(0.58700)
+F_0_337 equ (F_0_587 - F_0_250)  ; FIX(0.58700) - FIX(0.25000)
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_rgb_gray_convert_avx2)
+
+EXTN(jconst_rgb_gray_convert_avx2):
+
+PW_F0299_F0337 times 8 dw F_0_299, F_0_337
+PW_F0114_F0250 times 8 dw F_0_114, F_0_250
+PD_ONEHALF     times 8 dd (1 << (SCALEBITS - 1))
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+
+%include "jcgryext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGB_RED
+%define RGB_GREEN  EXT_RGB_GREEN
+%define RGB_BLUE  EXT_RGB_BLUE
+%define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+%define jsimd_rgb_gray_convert_avx2  jsimd_extrgb_gray_convert_avx2
+%include "jcgryext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGBX_RED
+%define RGB_GREEN  EXT_RGBX_GREEN
+%define RGB_BLUE  EXT_RGBX_BLUE
+%define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+%define jsimd_rgb_gray_convert_avx2  jsimd_extrgbx_gray_convert_avx2
+%include "jcgryext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGR_RED
+%define RGB_GREEN  EXT_BGR_GREEN
+%define RGB_BLUE  EXT_BGR_BLUE
+%define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+%define jsimd_rgb_gray_convert_avx2  jsimd_extbgr_gray_convert_avx2
+%include "jcgryext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGRX_RED
+%define RGB_GREEN  EXT_BGRX_GREEN
+%define RGB_BLUE  EXT_BGRX_BLUE
+%define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+%define jsimd_rgb_gray_convert_avx2  jsimd_extbgrx_gray_convert_avx2
+%include "jcgryext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XBGR_RED
+%define RGB_GREEN  EXT_XBGR_GREEN
+%define RGB_BLUE  EXT_XBGR_BLUE
+%define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+%define jsimd_rgb_gray_convert_avx2  jsimd_extxbgr_gray_convert_avx2
+%include "jcgryext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XRGB_RED
+%define RGB_GREEN  EXT_XRGB_GREEN
+%define RGB_BLUE  EXT_XRGB_BLUE
+%define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+%define jsimd_rgb_gray_convert_avx2  jsimd_extxrgb_gray_convert_avx2
+%include "jcgryext-avx2.asm"
diff --git a/media/libjpeg/simd/i386/jcgray-mmx.asm b/media/libjpeg/simd/i386/jcgray-mmx.asm
new file mode 100644
index 0000000000..79fdf082a8
--- /dev/null
+++ b/media/libjpeg/simd/i386/jcgray-mmx.asm
@@ -0,0 +1,113 @@
+;
+; jcgray.asm - grayscale colorspace conversion (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2011, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS  16
+
+F_0_114 equ  7471                ; FIX(0.11400)
+F_0_250 equ 16384                ; FIX(0.25000)
+F_0_299 equ 19595                ; FIX(0.29900)
+F_0_587 equ 38470                ; FIX(0.58700)
+F_0_337 equ (F_0_587 - F_0_250)  ; FIX(0.58700) - FIX(0.25000)
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_rgb_gray_convert_mmx)
+
+EXTN(jconst_rgb_gray_convert_mmx):
+
+PW_F0299_F0337 times 2 dw F_0_299, F_0_337
+PW_F0114_F0250 times 2 dw F_0_114, F_0_250
+PD_ONEHALF     times 2 dd (1 << (SCALEBITS - 1))
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+
+%include "jcgryext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGB_RED
+%define RGB_GREEN  EXT_RGB_GREEN
+%define RGB_BLUE  EXT_RGB_BLUE
+%define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+%define jsimd_rgb_gray_convert_mmx  jsimd_extrgb_gray_convert_mmx
+%include "jcgryext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGBX_RED
+%define RGB_GREEN  EXT_RGBX_GREEN
+%define RGB_BLUE  EXT_RGBX_BLUE
+%define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+%define jsimd_rgb_gray_convert_mmx  jsimd_extrgbx_gray_convert_mmx
+%include "jcgryext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGR_RED
+%define RGB_GREEN  EXT_BGR_GREEN
+%define RGB_BLUE  EXT_BGR_BLUE
+%define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+%define jsimd_rgb_gray_convert_mmx  jsimd_extbgr_gray_convert_mmx
+%include "jcgryext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGRX_RED
+%define RGB_GREEN  EXT_BGRX_GREEN
+%define RGB_BLUE  EXT_BGRX_BLUE
+%define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+%define jsimd_rgb_gray_convert_mmx  jsimd_extbgrx_gray_convert_mmx
+%include "jcgryext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XBGR_RED
+%define RGB_GREEN  EXT_XBGR_GREEN
+%define RGB_BLUE  EXT_XBGR_BLUE
+%define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+%define jsimd_rgb_gray_convert_mmx  jsimd_extxbgr_gray_convert_mmx
+%include "jcgryext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XRGB_RED
+%define RGB_GREEN  EXT_XRGB_GREEN
+%define RGB_BLUE  EXT_XRGB_BLUE
+%define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+%define jsimd_rgb_gray_convert_mmx  jsimd_extxrgb_gray_convert_mmx
+%include "jcgryext-mmx.asm"
diff --git a/media/libjpeg/simd/i386/jcgray-sse2.asm b/media/libjpeg/simd/i386/jcgray-sse2.asm
new file mode 100644
index 0000000000..cb4b28e8f4
--- /dev/null
+++ b/media/libjpeg/simd/i386/jcgray-sse2.asm
@@ -0,0 +1,112 @@
+;
+; jcgray.asm - grayscale colorspace conversion (SSE2)
+;
+; Copyright (C) 2011, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS  16
+
+F_0_114 equ  7471                ; FIX(0.11400)
+F_0_250 equ 16384                ; FIX(0.25000)
+F_0_299 equ 19595                ; FIX(0.29900)
+F_0_587 equ 38470                ; FIX(0.58700)
+F_0_337 equ (F_0_587 - F_0_250)  ; FIX(0.58700) - FIX(0.25000)
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_rgb_gray_convert_sse2)
+
+EXTN(jconst_rgb_gray_convert_sse2):
+
+PW_F0299_F0337 times 4 dw F_0_299, F_0_337
+PW_F0114_F0250 times 4 dw F_0_114, F_0_250
+PD_ONEHALF     times 4 dd (1 << (SCALEBITS - 1))
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+
+%include "jcgryext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGB_RED
+%define RGB_GREEN  EXT_RGB_GREEN
+%define RGB_BLUE  EXT_RGB_BLUE
+%define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+%define jsimd_rgb_gray_convert_sse2  jsimd_extrgb_gray_convert_sse2
+%include "jcgryext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGBX_RED
+%define RGB_GREEN  EXT_RGBX_GREEN
+%define RGB_BLUE  EXT_RGBX_BLUE
+%define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+%define jsimd_rgb_gray_convert_sse2  jsimd_extrgbx_gray_convert_sse2
+%include "jcgryext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGR_RED
+%define RGB_GREEN  EXT_BGR_GREEN
+%define RGB_BLUE  EXT_BGR_BLUE
+%define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+%define jsimd_rgb_gray_convert_sse2  jsimd_extbgr_gray_convert_sse2
+%include "jcgryext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGRX_RED
+%define RGB_GREEN  EXT_BGRX_GREEN
+%define RGB_BLUE  EXT_BGRX_BLUE
+%define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+%define jsimd_rgb_gray_convert_sse2  jsimd_extbgrx_gray_convert_sse2
+%include "jcgryext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XBGR_RED
+%define RGB_GREEN  EXT_XBGR_GREEN
+%define RGB_BLUE  EXT_XBGR_BLUE
+%define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+%define jsimd_rgb_gray_convert_sse2  jsimd_extxbgr_gray_convert_sse2
+%include "jcgryext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XRGB_RED
+%define RGB_GREEN  EXT_XRGB_GREEN
+%define RGB_BLUE  EXT_XRGB_BLUE
+%define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+%define jsimd_rgb_gray_convert_sse2  jsimd_extxrgb_gray_convert_sse2
+%include "jcgryext-sse2.asm"
diff --git a/media/libjpeg/simd/i386/jcgryext-avx2.asm b/media/libjpeg/simd/i386/jcgryext-avx2.asm
new file mode 100644
index 0000000000..3fa7973d72
--- /dev/null
+++ b/media/libjpeg/simd/i386/jcgryext-avx2.asm
@@ -0,0 +1,457 @@
+;
+; jcgryext.asm - grayscale colorspace conversion (AVX2)
+;
+; Copyright (C) 2011, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_rgb_gray_convert_avx2(JDIMENSION img_width, JSAMPARRAY input_buf,
+;                             JSAMPIMAGE output_buf, JDIMENSION output_row,
+;                             int num_rows);
+;
+
+%define img_width(b)   (b) + 8          ; JDIMENSION img_width
+%define input_buf(b)   (b) + 12         ; JSAMPARRAY input_buf
+%define output_buf(b)  (b) + 16         ; JSAMPIMAGE output_buf
+%define output_row(b)  (b) + 20         ; JDIMENSION output_row
+%define num_rows(b)    (b) + 24         ; int num_rows
+
+%define original_ebp   ebp + 0
+%define wk(i)          ebp - (WK_NUM - (i)) * SIZEOF_YMMWORD
+                                        ; ymmword wk[WK_NUM]
+%define WK_NUM         2
+%define gotptr         wk(0) - SIZEOF_POINTER  ; void * gotptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_rgb_gray_convert_avx2)
+
+EXTN(jsimd_rgb_gray_convert_avx2):
+    push        ebp
+    mov         eax, esp                     ; eax = original ebp
+    sub         esp, byte 4
+    and         esp, byte (-SIZEOF_YMMWORD)  ; align to 256 bits
+    mov         [esp], eax
+    mov         ebp, esp                     ; ebp = aligned ebp
+    lea         esp, [wk(0)]
+    pushpic     eax                     ; make a room for GOT address
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+    movpic      POINTER [gotptr], ebx   ; save GOT address
+
+    mov         ecx, JDIMENSION [img_width(eax)]
+    test        ecx, ecx
+    jz          near .return
+
+    push        ecx
+
+    mov         esi, JSAMPIMAGE [output_buf(eax)]
+    mov         ecx, JDIMENSION [output_row(eax)]
+    mov         edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]
+    lea         edi, [edi+ecx*SIZEOF_JSAMPROW]
+
+    pop         ecx
+
+    mov         esi, JSAMPARRAY [input_buf(eax)]
+    mov         eax, INT [num_rows(eax)]
+    test        eax, eax
+    jle         near .return
+    alignx      16, 7
+.rowloop:
+    pushpic     eax
+    push        edi
+    push        esi
+    push        ecx                     ; col
+
+    mov         esi, JSAMPROW [esi]     ; inptr
+    mov         edi, JSAMPROW [edi]     ; outptr0
+    movpic      eax, POINTER [gotptr]   ; load GOT address (eax)
+
+    cmp         ecx, byte SIZEOF_YMMWORD
+    jae         near .columnloop
+    alignx      16, 7
+
+%if RGB_PIXELSIZE == 3  ; ---------------
+
+.column_ld1:
+    push        eax
+    push        edx
+    lea         ecx, [ecx+ecx*2]        ; imul ecx,RGB_PIXELSIZE
+    test        cl, SIZEOF_BYTE
+    jz          short .column_ld2
+    sub         ecx, byte SIZEOF_BYTE
+    movzx       eax, byte [esi+ecx]
+.column_ld2:
+    test        cl, SIZEOF_WORD
+    jz          short .column_ld4
+    sub         ecx, byte SIZEOF_WORD
+    movzx       edx, word [esi+ecx]
+    shl         eax, WORD_BIT
+    or          eax, edx
+.column_ld4:
+    vmovd       xmmA, eax
+    pop         edx
+    pop         eax
+    test        cl, SIZEOF_DWORD
+    jz          short .column_ld8
+    sub         ecx, byte SIZEOF_DWORD
+    vmovd       xmmF, XMM_DWORD [esi+ecx]
+    vpslldq     xmmA, xmmA, SIZEOF_DWORD
+    vpor        xmmA, xmmA, xmmF
+.column_ld8:
+    test        cl, SIZEOF_MMWORD
+    jz          short .column_ld16
+    sub         ecx, byte SIZEOF_MMWORD
+    vmovq       xmmB, XMM_MMWORD [esi+ecx]
+    vpslldq     xmmA, xmmA, SIZEOF_MMWORD
+    vpor        xmmA, xmmA, xmmB
+.column_ld16:
+    test        cl, SIZEOF_XMMWORD
+    jz          short .column_ld32
+    sub         ecx, byte SIZEOF_XMMWORD
+    vmovdqu     xmmB, XMM_MMWORD [esi+ecx]
+    vperm2i128  ymmA, ymmA, ymmA, 1
+    vpor        ymmA, ymmB
+.column_ld32:
+    test        cl, SIZEOF_YMMWORD
+    jz          short .column_ld64
+    sub         ecx, byte SIZEOF_YMMWORD
+    vmovdqa     ymmF, ymmA
+    vmovdqu     ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD]
+.column_ld64:
+    test        cl, 2*SIZEOF_YMMWORD
+    mov         ecx, SIZEOF_YMMWORD
+    jz          short .rgb_gray_cnv
+    vmovdqa     ymmB, ymmA
+    vmovdqu     ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD]
+    vmovdqu     ymmF, YMMWORD [esi+1*SIZEOF_YMMWORD]
+    jmp         short .rgb_gray_cnv
+    alignx      16, 7
+
+.columnloop:
+    vmovdqu     ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD]
+    vmovdqu     ymmF, YMMWORD [esi+1*SIZEOF_YMMWORD]
+    vmovdqu     ymmB, YMMWORD [esi+2*SIZEOF_YMMWORD]
+
+.rgb_gray_cnv:
+    ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+    ;       15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+    ; ymmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+    ;       0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+    ; ymmB=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
+    ;       2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+
+    vmovdqu     ymmC, ymmA
+    vinserti128 ymmA, ymmF, xmmA, 0  ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+                                     ;       0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+    vinserti128 ymmC, ymmC, xmmB, 0  ; ymmC=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
+                                     ;       15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+    vinserti128 ymmB, ymmB, xmmF, 0  ; ymmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+                                     ;       2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+    vperm2i128  ymmF, ymmC, ymmC, 1  ; ymmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A
+                                     ;       1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q)
+
+    vmovdqa     ymmG, ymmA
+    vpslldq     ymmA, ymmA, 8     ; ymmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12
+                                  ;       22 03 13 23 04 14 24 05 0G 1G 2G 0H 1H 2H 0I 1I)
+    vpsrldq     ymmG, ymmG, 8     ; ymmG=(22 03 13 23 04 14 24 05 0G 1G 2G 0H 1H 2H 0I 1I
+                                  ;       2I 0J 1J 2J 0K 1K 2K 0L -- -- -- -- -- -- -- --)
+
+    vpunpckhbw  ymmA, ymmA, ymmF  ; ymmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A
+                                  ;       0G 0O 1G 1O 2G 2O 0H 0P 1H 1P 2H 2P 0I 0Q 1I 1Q)
+    vpslldq     ymmF, ymmF, 8     ; ymmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27
+                                  ;       08 18 28 09 19 29 0A 1A 1L 2L 0M 1M 2M 0N 1N 2N)
+
+    vpunpcklbw  ymmG, ymmG, ymmB  ; ymmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D
+                                  ;       2I 2Q 0J 0R 1J 1R 2J 2R 0K 0S 1K 1S 2K 2S 0L 0T)
+    vpunpckhbw  ymmF, ymmF, ymmB  ; ymmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F
+                                  ;       1L 1T 2L 2T 0M 0U 1M 1U 2M 2U 0N 0V 1N 1V 2N 2V)
+
+    vmovdqa     ymmD, ymmA
+    vpslldq     ymmA, ymmA, 8     ; ymmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09
+                                  ;       11 19 21 29 02 0A 12 1A 0G 0O 1G 1O 2G 2O 0H 0P)
+    vpsrldq     ymmD, ymmD, 8     ; ymmD=(11 19 21 29 02 0A 12 1A 0G 0O 1G 1O 2G 2O 0H 0P
+                                  ;       1H 1P 2H 2P 0I 0Q 1I 1Q -- -- -- -- -- -- -- --)
+
+    vpunpckhbw  ymmA, ymmA, ymmG  ; ymmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D
+                                  ;       0G 0K 0O 0S 1G 1K 1O 1S 2G 2K 2O 2S 0H 0L 0P 0T)
+    vpslldq     ymmG, ymmG, 8     ; ymmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B
+                                  ;       04 0C 14 1C 24 2C 05 0D 2I 2Q 0J 0R 1J 1R 2J 2R)
+
+    vpunpcklbw  ymmD, ymmD, ymmF  ; ymmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E
+                                  ;       1H 1L 1P 1T 2H 2L 2P 2T 0I 0M 0Q 0U 1I 1M 1Q 1U)
+    vpunpckhbw  ymmG, ymmG, ymmF  ; ymmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F
+                                  ;       2I 2M 2Q 2U 0J 0N 0R 0V 1J 1N 1R 1V 2J 2N 2R 2V)
+
+    vmovdqa     ymmE, ymmA
+    vpslldq     ymmA, ymmA, 8     ; ymmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C
+                                  ;       20 24 28 2C 01 05 09 0D 0G 0K 0O 0S 1G 1K 1O 1S)
+    vpsrldq     ymmE, ymmE, 8     ; ymmE=(20 24 28 2C 01 05 09 0D 0G 0K 0O 0S 1G 1K 1O 1S
+                                  ;       2G 2K 2O 2S 0H 0L 0P 0T -- -- -- -- -- -- -- --)
+
+    vpunpckhbw  ymmA, ymmA, ymmD  ; ymmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E
+                                  ;       0G 0I 0K 0M 0O 0Q 0S 0U 1G 1I 1K 1M 1O 1Q 1S 1U)
+    vpslldq     ymmD, ymmD, 8     ; ymmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D
+                                  ;       02 06 0A 0E 12 16 1A 1E 1H 1L 1P 1T 2H 2L 2P 2T)
+
+    vpunpcklbw  ymmE, ymmE, ymmG  ; ymmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F
+                                  ;       2G 2I 2K 2M 2O 2Q 2S 2U 0H 0J 0L 0N 0P 0R 0T 0V)
+    vpunpckhbw  ymmD, ymmD, ymmG  ; ymmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F
+                                  ;       1H 1J 1L 1N 1P 1R 1T 1V 2H 2J 2L 2N 2P 2R 2T 2V)
+
+    vpxor       ymmH, ymmH, ymmH
+
+    vmovdqa     ymmC, ymmA
+    vpunpcklbw  ymmA, ymmA, ymmH  ; ymmA=(00 02 04 06 08 0A 0C 0E 0G 0I 0K 0M 0O 0Q 0S 0U)
+    vpunpckhbw  ymmC, ymmC, ymmH  ; ymmC=(10 12 14 16 18 1A 1C 1E 1G 1I 1K 1M 1O 1Q 1S 1U)
+
+    vmovdqa     ymmB, ymmE
+    vpunpcklbw  ymmE, ymmE, ymmH  ; ymmE=(20 22 24 26 28 2A 2C 2E 2G 2I 2K 2M 2O 2Q 2S 2U)
+    vpunpckhbw  ymmB, ymmB, ymmH  ; ymmB=(01 03 05 07 09 0B 0D 0F 0H 0J 0L 0N 0P 0R 0T 0V)
+
+    vmovdqa     ymmF, ymmD
+    vpunpcklbw  ymmD, ymmD, ymmH  ; ymmD=(11 13 15 17 19 1B 1D 1F 1H 1J 1L 1N 1P 1R 1T 1V)
+    vpunpckhbw  ymmF, ymmF, ymmH  ; ymmF=(21 23 25 27 29 2B 2D 2F 2H 2J 2L 2N 2P 2R 2T 2V)
+
+%else  ; RGB_PIXELSIZE == 4 ; -----------
+
+.column_ld1:
+    test        cl, SIZEOF_XMMWORD/16
+    jz          short .column_ld2
+    sub         ecx, byte SIZEOF_XMMWORD/16
+    vmovd       xmmA, XMM_DWORD [esi+ecx*RGB_PIXELSIZE]
+.column_ld2:
+    test        cl, SIZEOF_XMMWORD/8
+    jz          short .column_ld4
+    sub         ecx, byte SIZEOF_XMMWORD/8
+    vmovq       xmmF, XMM_MMWORD [esi+ecx*RGB_PIXELSIZE]
+    vpslldq     xmmA, xmmA, SIZEOF_MMWORD
+    vpor        xmmA, xmmA, xmmF
+.column_ld4:
+    test        cl, SIZEOF_XMMWORD/4
+    jz          short .column_ld8
+    sub         ecx, byte SIZEOF_XMMWORD/4
+    vmovdqa     xmmF, xmmA
+    vperm2i128  ymmF, ymmF, ymmF, 1
+    vmovdqu     xmmA, XMMWORD [esi+ecx*RGB_PIXELSIZE]
+    vpor        ymmA, ymmA, ymmF
+.column_ld8:
+    test        cl, SIZEOF_XMMWORD/2
+    jz          short .column_ld16
+    sub         ecx, byte SIZEOF_XMMWORD/2
+    vmovdqa     ymmF, ymmA
+    vmovdqu     ymmA, YMMWORD [esi+ecx*RGB_PIXELSIZE]
+.column_ld16:
+    test        cl, SIZEOF_XMMWORD
+    mov         ecx, SIZEOF_YMMWORD
+    jz          short .rgb_gray_cnv
+    vmovdqa     ymmE, ymmA
+    vmovdqa     ymmH, ymmF
+    vmovdqu     ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD]
+    vmovdqu     ymmF, YMMWORD [esi+1*SIZEOF_YMMWORD]
+    jmp         short .rgb_gray_cnv
+    alignx      16, 7
+
+.columnloop:
+    vmovdqu     ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD]
+    vmovdqu     ymmF, YMMWORD [esi+1*SIZEOF_YMMWORD]
+    vmovdqu     ymmE, YMMWORD [esi+2*SIZEOF_YMMWORD]
+    vmovdqu     ymmH, YMMWORD [esi+3*SIZEOF_YMMWORD]
+
+.rgb_gray_cnv:
+    ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+    ;       04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+    ; ymmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+    ;       0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+    ; ymmE=(0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J
+    ;       0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+    ; ymmH=(0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R
+    ;       0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+
+    vmovdqa     ymmB, ymmA
+    vinserti128 ymmA, ymmA, xmmE, 1     ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+                                        ;       0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J)
+    vperm2i128  ymmE, ymmB, ymmE, 0x31  ; ymmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+                                        ;       0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+
+    vmovdqa     ymmB, ymmF
+    vinserti128 ymmF, ymmF, xmmH, 1     ; ymmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+                                        ;       0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R)
+    vperm2i128  ymmH, ymmB, ymmH, 0x31  ; ymmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F
+                                        ;       0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+
+    vmovdqa     ymmD, ymmA
+    vpunpcklbw  ymmA, ymmA, ymmE      ; ymmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35
+                                      ;       0G 0K 1G 1K 2G 2K 3G 3K 0H 0L 1H 1L 2H 2L 3H 3L)
+    vpunpckhbw  ymmD, ymmD, ymmE      ; ymmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37
+                                      ;       0I 0M 1I 1M 2I 2M 3I 3M 0J 0N 1J 1N 2J 2N 3J 3N)
+
+    vmovdqa     ymmC, ymmF
+    vpunpcklbw  ymmF, ymmF, ymmH      ; ymmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D
+                                      ;       0O 0S 1O 1S 2O 2S 3O 3S 0P 0T 1P 1T 2P 2T 3P 3T)
+    vpunpckhbw  ymmC, ymmC, ymmH      ; ymmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F
+                                      ;       0Q 0U 1Q 1U 2Q 2U 3Q 3U 0R 0V 1R 1V 2R 2V 3R 3V)
+
+    vmovdqa     ymmB, ymmA
+    vpunpcklwd  ymmA, ymmA, ymmF      ; ymmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C
+                                      ;       0G 0K 0O 0S 1G 1K 1O 1S 2G 2K 2O 2S 3G 3K 3O 3S)
+    vpunpckhwd  ymmB, ymmB, ymmF      ; ymmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D
+                                      ;       0H 0L 0P 0T 1H 1L 1P 1T 2H 2L 2P 2T 3H 3L 3P 3T)
+
+    vmovdqa     ymmG, ymmD
+    vpunpcklwd  ymmD, ymmD, ymmC      ; ymmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E
+                                      ;       0I 0M 0Q 0U 1I 1M 1Q 1U 2I 2M 2Q 2U 3I 3M 3Q 3U)
+    vpunpckhwd  ymmG, ymmG, ymmC      ; ymmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F
+                                      ;       0J 0N 0R 0V 1J 1N 1R 1V 2J 2N 2R 2V 3J 3N 3R 3V)
+
+    vmovdqa     ymmE, ymmA
+    vpunpcklbw  ymmA, ymmA, ymmD      ; ymmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E
+                                      ;       0G 0I 0K 0M 0O 0Q 0S 0U 1G 1I 1K 1M 1O 1Q 1S 1U)
+    vpunpckhbw  ymmE, ymmE, ymmD      ; ymmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E
+                                      ;       2G 2I 2K 2M 2O 2Q 2S 2U 3G 3I 3K 3M 3O 3Q 3S 3U)
+
+    vmovdqa     ymmH, ymmB
+    vpunpcklbw  ymmB, ymmB, ymmG      ; ymmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F
+                                      ;       0H 0J 0L 0N 0P 0R 0T 0V 1H 1J 1L 1N 1P 1R 1T 1V)
+    vpunpckhbw  ymmH, ymmH, ymmG      ; ymmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F
+                                      ;       2H 2J 2L 2N 2P 2R 2T 2V 3H 3J 3L 3N 3P 3R 3T 3V)
+
+    vpxor       ymmF, ymmF, ymmF
+
+    vmovdqa     ymmC, ymmA
+    vpunpcklbw  ymmA, ymmA, ymmF      ; ymmA=(00 02 04 06 08 0A 0C 0E 0G 0I 0K 0M 0O 0Q 0S 0U)
+    vpunpckhbw  ymmC, ymmC, ymmF      ; ymmC=(10 12 14 16 18 1A 1C 1E 1G 1I 1K 1M 1O 1Q 1S 1U)
+
+    vmovdqa     ymmD, ymmB
+    vpunpcklbw  ymmB, ymmB, ymmF      ; ymmB=(01 03 05 07 09 0B 0D 0F 0H 0J 0L 0N 0P 0R 0T 0V)
+    vpunpckhbw  ymmD, ymmD, ymmF      ; ymmD=(11 13 15 17 19 1B 1D 1F 1H 1J 1L 1N 1P 1R 1T 1V)
+
+    vmovdqa     ymmG, ymmE
+    vpunpcklbw  ymmE, ymmE, ymmF      ; ymmE=(20 22 24 26 28 2A 2C 2E 2G 2I 2K 2M 2O 2Q 2S 2U)
+    vpunpckhbw  ymmG, ymmG, ymmF      ; ymmG=(30 32 34 36 38 3A 3C 3E 3G 3I 3K 3M 3O 3Q 3S 3U)
+
+    vpunpcklbw  ymmF, ymmF, ymmH
+    vpunpckhbw  ymmH, ymmH, ymmH
+    vpsrlw      ymmF, ymmF, BYTE_BIT  ; ymmF=(21 23 25 27 29 2B 2D 2F 2H 2J 2L 2N 2P 2R 2T 2V)
+    vpsrlw      ymmH, ymmH, BYTE_BIT  ; ymmH=(31 33 35 37 39 3B 3D 3F 3H 3J 3L 3N 3P 3R 3T 3V)
+
+%endif  ; RGB_PIXELSIZE ; ---------------
+
+    ; ymm0=R(02468ACEGIKMOQSU)=RE, ymm2=G(02468ACEGIKMOQSU)=GE, ymm4=B(02468ACEGIKMOQSU)=BE
+    ; ymm1=R(13579BDFHJLNPRTV)=RO, ymm3=G(13579BDFHJLNPRTV)=GO, ymm5=B(13579BDFHJLNPRTV)=BO
+
+    ; (Original)
+    ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
+    ;
+    ; (This implementation)
+    ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+
+    vmovdqa     ymm6, ymm1
+    vpunpcklwd  ymm1, ymm1, ymm3
+    vpunpckhwd  ymm6, ymm6, ymm3
+    vpmaddwd    ymm1, ymm1, [GOTOFF(eax,PW_F0299_F0337)]  ; ymm1=ROL*FIX(0.299)+GOL*FIX(0.337)
+    vpmaddwd    ymm6, ymm6, [GOTOFF(eax,PW_F0299_F0337)]  ; ymm6=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+    vmovdqa     ymm7, ymm6              ; ymm7=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+    vmovdqa     ymm6, ymm0
+    vpunpcklwd  ymm0, ymm0, ymm2
+    vpunpckhwd  ymm6, ymm6, ymm2
+    vpmaddwd    ymm0, ymm0, [GOTOFF(eax,PW_F0299_F0337)]  ; ymm0=REL*FIX(0.299)+GEL*FIX(0.337)
+    vpmaddwd    ymm6, ymm6, [GOTOFF(eax,PW_F0299_F0337)]  ; ymm6=REH*FIX(0.299)+GEH*FIX(0.337)
+
+    vmovdqa     YMMWORD [wk(0)], ymm0   ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
+    vmovdqa     YMMWORD [wk(1)], ymm6   ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)
+
+    vmovdqa     ymm0, ymm5              ; ymm0=BO
+    vmovdqa     ymm6, ymm4              ; ymm6=BE
+
+    vmovdqa     ymm4, ymm0
+    vpunpcklwd  ymm0, ymm0, ymm3
+    vpunpckhwd  ymm4, ymm4, ymm3
+    vpmaddwd    ymm0, ymm0, [GOTOFF(eax,PW_F0114_F0250)]  ; ymm0=BOL*FIX(0.114)+GOL*FIX(0.250)
+    vpmaddwd    ymm4, ymm4, [GOTOFF(eax,PW_F0114_F0250)]  ; ymm4=BOH*FIX(0.114)+GOH*FIX(0.250)
+
+    vmovdqa     ymm3, [GOTOFF(eax,PD_ONEHALF)]            ; ymm3=[PD_ONEHALF]
+
+    vpaddd      ymm0, ymm0, ymm1
+    vpaddd      ymm4, ymm4, ymm7
+    vpaddd      ymm0, ymm0, ymm3
+    vpaddd      ymm4, ymm4, ymm3
+    vpsrld      ymm0, ymm0, SCALEBITS   ; ymm0=YOL
+    vpsrld      ymm4, ymm4, SCALEBITS   ; ymm4=YOH
+    vpackssdw   ymm0, ymm0, ymm4        ; ymm0=YO
+
+    vmovdqa     ymm4, ymm6
+    vpunpcklwd  ymm6, ymm6, ymm2
+    vpunpckhwd  ymm4, ymm4, ymm2
+    vpmaddwd    ymm6, ymm6, [GOTOFF(eax,PW_F0114_F0250)]  ; ymm6=BEL*FIX(0.114)+GEL*FIX(0.250)
+    vpmaddwd    ymm4, ymm4, [GOTOFF(eax,PW_F0114_F0250)]  ; ymm4=BEH*FIX(0.114)+GEH*FIX(0.250)
+
+    vmovdqa     ymm2, [GOTOFF(eax,PD_ONEHALF)]            ; ymm2=[PD_ONEHALF]
+
+    vpaddd      ymm6, ymm6, YMMWORD [wk(0)]
+    vpaddd      ymm4, ymm4, YMMWORD [wk(1)]
+    vpaddd      ymm6, ymm6, ymm2
+    vpaddd      ymm4, ymm4, ymm2
+    vpsrld      ymm6, ymm6, SCALEBITS   ; ymm6=YEL
+    vpsrld      ymm4, ymm4, SCALEBITS   ; ymm4=YEH
+    vpackssdw   ymm6, ymm6, ymm4        ; ymm6=YE
+
+    vpsllw      ymm0, ymm0, BYTE_BIT
+    vpor        ymm6, ymm6, ymm0        ; ymm6=Y
+    vmovdqu     YMMWORD [edi], ymm6     ; Save Y
+
+    sub         ecx, byte SIZEOF_YMMWORD
+    add         esi, RGB_PIXELSIZE*SIZEOF_YMMWORD  ; inptr
+    add         edi, byte SIZEOF_YMMWORD           ; outptr0
+    cmp         ecx, byte SIZEOF_YMMWORD
+    jae         near .columnloop
+    test        ecx, ecx
+    jnz         near .column_ld1
+
+    pop         ecx                     ; col
+    pop         esi
+    pop         edi
+    poppic      eax
+
+    add         esi, byte SIZEOF_JSAMPROW  ; input_buf
+    add         edi, byte SIZEOF_JSAMPROW
+    dec         eax                        ; num_rows
+    jg          near .rowloop
+
+.return:
+    vzeroupper
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/i386/jcgryext-mmx.asm b/media/libjpeg/simd/i386/jcgryext-mmx.asm
new file mode 100644
index 0000000000..8af42e5a33
--- /dev/null
+++ b/media/libjpeg/simd/i386/jcgryext-mmx.asm
@@ -0,0 +1,355 @@
+;
+; jcgryext.asm - grayscale colorspace conversion (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2011, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_rgb_gray_convert_mmx(JDIMENSION img_width, JSAMPARRAY input_buf,
+;                            JSAMPIMAGE output_buf, JDIMENSION output_row,
+;                            int num_rows);
+;
+
+%define img_width(b)   (b) + 8          ; JDIMENSION img_width
+%define input_buf(b)   (b) + 12         ; JSAMPARRAY input_buf
+%define output_buf(b)  (b) + 16         ; JSAMPIMAGE output_buf
+%define output_row(b)  (b) + 20         ; JDIMENSION output_row
+%define num_rows(b)    (b) + 24         ; int num_rows
+
+%define original_ebp   ebp + 0
+%define wk(i)          ebp - (WK_NUM - (i)) * SIZEOF_MMWORD
+                                        ; mmword wk[WK_NUM]
+%define WK_NUM         2
+%define gotptr         wk(0) - SIZEOF_POINTER  ; void * gotptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_rgb_gray_convert_mmx)
+
+EXTN(jsimd_rgb_gray_convert_mmx):
+    push        ebp
+    mov         eax, esp                    ; eax = original ebp
+    sub         esp, byte 4
+    and         esp, byte (-SIZEOF_MMWORD)  ; align to 64 bits
+    mov         [esp], eax
+    mov         ebp, esp                    ; ebp = aligned ebp
+    lea         esp, [wk(0)]
+    pushpic     eax                     ; make a room for GOT address
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+    movpic      POINTER [gotptr], ebx   ; save GOT address
+
+    mov         ecx, JDIMENSION [img_width(eax)]  ; num_cols
+    test        ecx, ecx
+    jz          near .return
+
+    push        ecx
+
+    mov         esi, JSAMPIMAGE [output_buf(eax)]
+    mov         ecx, JDIMENSION [output_row(eax)]
+    mov         edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]
+    lea         edi, [edi+ecx*SIZEOF_JSAMPROW]
+
+    pop         ecx
+
+    mov         esi, JSAMPARRAY [input_buf(eax)]
+    mov         eax, INT [num_rows(eax)]
+    test        eax, eax
+    jle         near .return
+    alignx      16, 7
+.rowloop:
+    pushpic     eax
+    push        edi
+    push        esi
+    push        ecx                     ; col
+
+    mov         esi, JSAMPROW [esi]     ; inptr
+    mov         edi, JSAMPROW [edi]     ; outptr0
+    movpic      eax, POINTER [gotptr]   ; load GOT address (eax)
+
+    cmp         ecx, byte SIZEOF_MMWORD
+    jae         short .columnloop
+    alignx      16, 7
+
+%if RGB_PIXELSIZE == 3  ; ---------------
+
+.column_ld1:
+    push        eax
+    push        edx
+    lea         ecx, [ecx+ecx*2]        ; imul ecx,RGB_PIXELSIZE
+    test        cl, SIZEOF_BYTE
+    jz          short .column_ld2
+    sub         ecx, byte SIZEOF_BYTE
+    xor         eax, eax
+    mov         al, byte [esi+ecx]
+.column_ld2:
+    test        cl, SIZEOF_WORD
+    jz          short .column_ld4
+    sub         ecx, byte SIZEOF_WORD
+    xor         edx, edx
+    mov         dx, word [esi+ecx]
+    shl         eax, WORD_BIT
+    or          eax, edx
+.column_ld4:
+    movd        mmA, eax
+    pop         edx
+    pop         eax
+    test        cl, SIZEOF_DWORD
+    jz          short .column_ld8
+    sub         ecx, byte SIZEOF_DWORD
+    movd        mmG, dword [esi+ecx]
+    psllq       mmA, DWORD_BIT
+    por         mmA, mmG
+.column_ld8:
+    test        cl, SIZEOF_MMWORD
+    jz          short .column_ld16
+    movq        mmG, mmA
+    movq        mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+    mov         ecx, SIZEOF_MMWORD
+    jmp         short .rgb_gray_cnv
+.column_ld16:
+    test        cl, 2*SIZEOF_MMWORD
+    mov         ecx, SIZEOF_MMWORD
+    jz          short .rgb_gray_cnv
+    movq        mmF, mmA
+    movq        mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+    movq        mmG, MMWORD [esi+1*SIZEOF_MMWORD]
+    jmp         short .rgb_gray_cnv
+    alignx      16, 7
+
+.columnloop:
+    movq        mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+    movq        mmG, MMWORD [esi+1*SIZEOF_MMWORD]
+    movq        mmF, MMWORD [esi+2*SIZEOF_MMWORD]
+
+.rgb_gray_cnv:
+    ; mmA=(00 10 20 01 11 21 02 12)
+    ; mmG=(22 03 13 23 04 14 24 05)
+    ; mmF=(15 25 06 16 26 07 17 27)
+
+    movq        mmD, mmA
+    psllq       mmA, 4*BYTE_BIT         ; mmA=(-- -- -- -- 00 10 20 01)
+    psrlq       mmD, 4*BYTE_BIT         ; mmD=(11 21 02 12 -- -- -- --)
+
+    punpckhbw   mmA, mmG                ; mmA=(00 04 10 14 20 24 01 05)
+    psllq       mmG, 4*BYTE_BIT         ; mmG=(-- -- -- -- 22 03 13 23)
+
+    punpcklbw   mmD, mmF                ; mmD=(11 15 21 25 02 06 12 16)
+    punpckhbw   mmG, mmF                ; mmG=(22 26 03 07 13 17 23 27)
+
+    movq        mmE, mmA
+    psllq       mmA, 4*BYTE_BIT         ; mmA=(-- -- -- -- 00 04 10 14)
+    psrlq       mmE, 4*BYTE_BIT         ; mmE=(20 24 01 05 -- -- -- --)
+
+    punpckhbw   mmA, mmD                ; mmA=(00 02 04 06 10 12 14 16)
+    psllq       mmD, 4*BYTE_BIT         ; mmD=(-- -- -- -- 11 15 21 25)
+
+    punpcklbw   mmE, mmG                ; mmE=(20 22 24 26 01 03 05 07)
+    punpckhbw   mmD, mmG                ; mmD=(11 13 15 17 21 23 25 27)
+
+    pxor        mmH, mmH
+
+    movq        mmC, mmA
+    punpcklbw   mmA, mmH                ; mmA=(00 02 04 06)
+    punpckhbw   mmC, mmH                ; mmC=(10 12 14 16)
+
+    movq        mmB, mmE
+    punpcklbw   mmE, mmH                ; mmE=(20 22 24 26)
+    punpckhbw   mmB, mmH                ; mmB=(01 03 05 07)
+
+    movq        mmF, mmD
+    punpcklbw   mmD, mmH                ; mmD=(11 13 15 17)
+    punpckhbw   mmF, mmH                ; mmF=(21 23 25 27)
+
+%else  ; RGB_PIXELSIZE == 4 ; -----------
+
+.column_ld1:
+    test        cl, SIZEOF_MMWORD/8
+    jz          short .column_ld2
+    sub         ecx, byte SIZEOF_MMWORD/8
+    movd        mmA, dword [esi+ecx*RGB_PIXELSIZE]
+.column_ld2:
+    test        cl, SIZEOF_MMWORD/4
+    jz          short .column_ld4
+    sub         ecx, byte SIZEOF_MMWORD/4
+    movq        mmF, mmA
+    movq        mmA, MMWORD [esi+ecx*RGB_PIXELSIZE]
+.column_ld4:
+    test        cl, SIZEOF_MMWORD/2
+    mov         ecx, SIZEOF_MMWORD
+    jz          short .rgb_gray_cnv
+    movq        mmD, mmA
+    movq        mmC, mmF
+    movq        mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+    movq        mmF, MMWORD [esi+1*SIZEOF_MMWORD]
+    jmp         short .rgb_gray_cnv
+    alignx      16, 7
+
+.columnloop:
+    movq        mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+    movq        mmF, MMWORD [esi+1*SIZEOF_MMWORD]
+    movq        mmD, MMWORD [esi+2*SIZEOF_MMWORD]
+    movq        mmC, MMWORD [esi+3*SIZEOF_MMWORD]
+
+.rgb_gray_cnv:
+    ; mmA=(00 10 20 30 01 11 21 31)
+    ; mmF=(02 12 22 32 03 13 23 33)
+    ; mmD=(04 14 24 34 05 15 25 35)
+    ; mmC=(06 16 26 36 07 17 27 37)
+
+    movq        mmB, mmA
+    punpcklbw   mmA, mmF                ; mmA=(00 02 10 12 20 22 30 32)
+    punpckhbw   mmB, mmF                ; mmB=(01 03 11 13 21 23 31 33)
+
+    movq        mmG, mmD
+    punpcklbw   mmD, mmC                ; mmD=(04 06 14 16 24 26 34 36)
+    punpckhbw   mmG, mmC                ; mmG=(05 07 15 17 25 27 35 37)
+
+    movq        mmE, mmA
+    punpcklwd   mmA, mmD                ; mmA=(00 02 04 06 10 12 14 16)
+    punpckhwd   mmE, mmD                ; mmE=(20 22 24 26 30 32 34 36)
+
+    movq        mmH, mmB
+    punpcklwd   mmB, mmG                ; mmB=(01 03 05 07 11 13 15 17)
+    punpckhwd   mmH, mmG                ; mmH=(21 23 25 27 31 33 35 37)
+
+    pxor        mmF, mmF
+
+    movq        mmC, mmA
+    punpcklbw   mmA, mmF                ; mmA=(00 02 04 06)
+    punpckhbw   mmC, mmF                ; mmC=(10 12 14 16)
+
+    movq        mmD, mmB
+    punpcklbw   mmB, mmF                ; mmB=(01 03 05 07)
+    punpckhbw   mmD, mmF                ; mmD=(11 13 15 17)
+
+    movq        mmG, mmE
+    punpcklbw   mmE, mmF                ; mmE=(20 22 24 26)
+    punpckhbw   mmG, mmF                ; mmG=(30 32 34 36)
+
+    punpcklbw   mmF, mmH
+    punpckhbw   mmH, mmH
+    psrlw       mmF, BYTE_BIT           ; mmF=(21 23 25 27)
+    psrlw       mmH, BYTE_BIT           ; mmH=(31 33 35 37)
+
+%endif  ; RGB_PIXELSIZE ; ---------------
+
+    ; mm0=(R0 R2 R4 R6)=RE, mm2=(G0 G2 G4 G6)=GE, mm4=(B0 B2 B4 B6)=BE
+    ; mm1=(R1 R3 R5 R7)=RO, mm3=(G1 G3 G5 G7)=GO, mm5=(B1 B3 B5 B7)=BO
+
+    ; (Original)
+    ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
+    ;
+    ; (This implementation)
+    ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+
+    movq        mm6, mm1
+    punpcklwd   mm1, mm3
+    punpckhwd   mm6, mm3
+    pmaddwd     mm1, [GOTOFF(eax,PW_F0299_F0337)]  ; mm1=ROL*FIX(0.299)+GOL*FIX(0.337)
+    pmaddwd     mm6, [GOTOFF(eax,PW_F0299_F0337)]  ; mm6=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+    movq        mm7,  mm6               ; mm7=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+    movq        mm6, mm0
+    punpcklwd   mm0, mm2
+    punpckhwd   mm6, mm2
+    pmaddwd     mm0, [GOTOFF(eax,PW_F0299_F0337)]  ; mm0=REL*FIX(0.299)+GEL*FIX(0.337)
+    pmaddwd     mm6, [GOTOFF(eax,PW_F0299_F0337)]  ; mm6=REH*FIX(0.299)+GEH*FIX(0.337)
+
+    movq        MMWORD [wk(0)], mm0     ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
+    movq        MMWORD [wk(1)], mm6     ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)
+
+    movq        mm0, mm5                ; mm0=BO
+    movq        mm6, mm4                ; mm6=BE
+
+    movq        mm4, mm0
+    punpcklwd   mm0, mm3
+    punpckhwd   mm4, mm3
+    pmaddwd     mm0, [GOTOFF(eax,PW_F0114_F0250)]  ; mm0=BOL*FIX(0.114)+GOL*FIX(0.250)
+    pmaddwd     mm4, [GOTOFF(eax,PW_F0114_F0250)]  ; mm4=BOH*FIX(0.114)+GOH*FIX(0.250)
+
+    movq        mm3, [GOTOFF(eax,PD_ONEHALF)]  ; mm3=[PD_ONEHALF]
+
+    paddd       mm0, mm1
+    paddd       mm4, mm7
+    paddd       mm0, mm3
+    paddd       mm4, mm3
+    psrld       mm0, SCALEBITS          ; mm0=YOL
+    psrld       mm4, SCALEBITS          ; mm4=YOH
+    packssdw    mm0, mm4                ; mm0=YO
+
+    movq        mm4, mm6
+    punpcklwd   mm6, mm2
+    punpckhwd   mm4, mm2
+    pmaddwd     mm6, [GOTOFF(eax,PW_F0114_F0250)]  ; mm6=BEL*FIX(0.114)+GEL*FIX(0.250)
+    pmaddwd     mm4, [GOTOFF(eax,PW_F0114_F0250)]  ; mm4=BEH*FIX(0.114)+GEH*FIX(0.250)
+
+    movq        mm2, [GOTOFF(eax,PD_ONEHALF)]      ; mm2=[PD_ONEHALF]
+
+    paddd       mm6, MMWORD [wk(0)]
+    paddd       mm4, MMWORD [wk(1)]
+    paddd       mm6, mm2
+    paddd       mm4, mm2
+    psrld       mm6, SCALEBITS          ; mm6=YEL
+    psrld       mm4, SCALEBITS          ; mm4=YEH
+    packssdw    mm6, mm4                ; mm6=YE
+
+    psllw       mm0, BYTE_BIT
+    por         mm6, mm0                ; mm6=Y
+    movq        MMWORD [edi], mm6       ; Save Y
+
+    sub         ecx, byte SIZEOF_MMWORD
+    add         esi, byte RGB_PIXELSIZE*SIZEOF_MMWORD  ; inptr
+    add         edi, byte SIZEOF_MMWORD                ; outptr0
+    cmp         ecx, byte SIZEOF_MMWORD
+    jae         near .columnloop
+    test        ecx, ecx
+    jnz         near .column_ld1
+
+    pop         ecx                     ; col
+    pop         esi
+    pop         edi
+    poppic      eax
+
+    add         esi, byte SIZEOF_JSAMPROW  ; input_buf
+    add         edi, byte SIZEOF_JSAMPROW
+    dec         eax                        ; num_rows
+    jg          near .rowloop
+
+    emms                                ; empty MMX state
+
+.return:
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/i386/jcgryext-sse2.asm b/media/libjpeg/simd/i386/jcgryext-sse2.asm
new file mode 100644
index 0000000000..c9d6ff1e35
--- /dev/null
+++ b/media/libjpeg/simd/i386/jcgryext-sse2.asm
@@ -0,0 +1,382 @@
+;
+; jcgryext.asm - grayscale colorspace conversion (SSE2)
+;
+; Copyright (C) 2011, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_rgb_gray_convert_sse2(JDIMENSION img_width, JSAMPARRAY input_buf,
+;                             JSAMPIMAGE output_buf, JDIMENSION output_row,
+;                             int num_rows);
+;
+
+%define img_width(b)   (b) + 8          ; JDIMENSION img_width
+%define input_buf(b)   (b) + 12         ; JSAMPARRAY input_buf
+%define output_buf(b)  (b) + 16         ; JSAMPIMAGE output_buf
+%define output_row(b)  (b) + 20         ; JDIMENSION output_row
+%define num_rows(b)    (b) + 24         ; int num_rows
+
+%define original_ebp   ebp + 0
+%define wk(i)          ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+                                        ; xmmword wk[WK_NUM]
+%define WK_NUM         2
+%define gotptr         wk(0) - SIZEOF_POINTER  ; void * gotptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_rgb_gray_convert_sse2)
+
+EXTN(jsimd_rgb_gray_convert_sse2):
+    push        ebp
+    mov         eax, esp                     ; eax = original ebp
+    sub         esp, byte 4
+    and         esp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [esp], eax
+    mov         ebp, esp                     ; ebp = aligned ebp
+    lea         esp, [wk(0)]
+    pushpic     eax                     ; make a room for GOT address
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+    movpic      POINTER [gotptr], ebx   ; save GOT address
+
+    mov         ecx, JDIMENSION [img_width(eax)]
+    test        ecx, ecx
+    jz          near .return
+
+    push        ecx
+
+    mov         esi, JSAMPIMAGE [output_buf(eax)]
+    mov         ecx, JDIMENSION [output_row(eax)]
+    mov         edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]
+    lea         edi, [edi+ecx*SIZEOF_JSAMPROW]
+
+    pop         ecx
+
+    mov         esi, JSAMPARRAY [input_buf(eax)]
+    mov         eax, INT [num_rows(eax)]
+    test        eax, eax
+    jle         near .return
+    alignx      16, 7
+.rowloop:
+    pushpic     eax
+    push        edi
+    push        esi
+    push        ecx                     ; col
+
+    mov         esi, JSAMPROW [esi]     ; inptr
+    mov         edi, JSAMPROW [edi]     ; outptr0
+    movpic      eax, POINTER [gotptr]   ; load GOT address (eax)
+
+    cmp         ecx, byte SIZEOF_XMMWORD
+    jae         near .columnloop
+    alignx      16, 7
+
+%if RGB_PIXELSIZE == 3  ; ---------------
+
+.column_ld1:
+    push        eax
+    push        edx
+    lea         ecx, [ecx+ecx*2]        ; imul ecx,RGB_PIXELSIZE
+    test        cl, SIZEOF_BYTE
+    jz          short .column_ld2
+    sub         ecx, byte SIZEOF_BYTE
+    movzx       eax, byte [esi+ecx]
+.column_ld2:
+    test        cl, SIZEOF_WORD
+    jz          short .column_ld4
+    sub         ecx, byte SIZEOF_WORD
+    movzx       edx, word [esi+ecx]
+    shl         eax, WORD_BIT
+    or          eax, edx
+.column_ld4:
+    movd        xmmA, eax
+    pop         edx
+    pop         eax
+    test        cl, SIZEOF_DWORD
+    jz          short .column_ld8
+    sub         ecx, byte SIZEOF_DWORD
+    movd        xmmF, XMM_DWORD [esi+ecx]
+    pslldq      xmmA, SIZEOF_DWORD
+    por         xmmA, xmmF
+.column_ld8:
+    test        cl, SIZEOF_MMWORD
+    jz          short .column_ld16
+    sub         ecx, byte SIZEOF_MMWORD
+    movq        xmmB, XMM_MMWORD [esi+ecx]
+    pslldq      xmmA, SIZEOF_MMWORD
+    por         xmmA, xmmB
+.column_ld16:
+    test        cl, SIZEOF_XMMWORD
+    jz          short .column_ld32
+    movdqa      xmmF, xmmA
+    movdqu      xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+    mov         ecx, SIZEOF_XMMWORD
+    jmp         short .rgb_gray_cnv
+.column_ld32:
+    test        cl, 2*SIZEOF_XMMWORD
+    mov         ecx, SIZEOF_XMMWORD
+    jz          short .rgb_gray_cnv
+    movdqa      xmmB, xmmA
+    movdqu      xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+    movdqu      xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
+    jmp         short .rgb_gray_cnv
+    alignx      16, 7
+
+.columnloop:
+    movdqu      xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+    movdqu      xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
+    movdqu      xmmB, XMMWORD [esi+2*SIZEOF_XMMWORD]
+
+.rgb_gray_cnv:
+    ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
+    ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+    ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
+
+    movdqa      xmmG, xmmA
+    pslldq      xmmA, 8     ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
+    psrldq      xmmG, 8     ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
+
+    punpckhbw   xmmA, xmmF  ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
+    pslldq      xmmF, 8     ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
+
+    punpcklbw   xmmG, xmmB  ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
+    punpckhbw   xmmF, xmmB  ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
+
+    movdqa      xmmD, xmmA
+    pslldq      xmmA, 8     ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
+    psrldq      xmmD, 8     ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
+
+    punpckhbw   xmmA, xmmG  ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
+    pslldq      xmmG, 8     ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
+
+    punpcklbw   xmmD, xmmF  ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
+    punpckhbw   xmmG, xmmF  ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
+
+    movdqa      xmmE, xmmA
+    pslldq      xmmA, 8     ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
+    psrldq      xmmE, 8     ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
+
+    punpckhbw   xmmA, xmmD  ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
+    pslldq      xmmD, 8     ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
+
+    punpcklbw   xmmE, xmmG  ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
+    punpckhbw   xmmD, xmmG  ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
+
+    pxor        xmmH, xmmH
+
+    movdqa      xmmC, xmmA
+    punpcklbw   xmmA, xmmH  ; xmmA=(00 02 04 06 08 0A 0C 0E)
+    punpckhbw   xmmC, xmmH  ; xmmC=(10 12 14 16 18 1A 1C 1E)
+
+    movdqa      xmmB, xmmE
+    punpcklbw   xmmE, xmmH  ; xmmE=(20 22 24 26 28 2A 2C 2E)
+    punpckhbw   xmmB, xmmH  ; xmmB=(01 03 05 07 09 0B 0D 0F)
+
+    movdqa      xmmF, xmmD
+    punpcklbw   xmmD, xmmH  ; xmmD=(11 13 15 17 19 1B 1D 1F)
+    punpckhbw   xmmF, xmmH  ; xmmF=(21 23 25 27 29 2B 2D 2F)
+
+%else  ; RGB_PIXELSIZE == 4 ; -----------
+
+.column_ld1:
+    test        cl, SIZEOF_XMMWORD/16
+    jz          short .column_ld2
+    sub         ecx, byte SIZEOF_XMMWORD/16
+    movd        xmmA, XMM_DWORD [esi+ecx*RGB_PIXELSIZE]
+.column_ld2:
+    test        cl, SIZEOF_XMMWORD/8
+    jz          short .column_ld4
+    sub         ecx, byte SIZEOF_XMMWORD/8
+    movq        xmmE, XMM_MMWORD [esi+ecx*RGB_PIXELSIZE]
+    pslldq      xmmA, SIZEOF_MMWORD
+    por         xmmA, xmmE
+.column_ld4:
+    test        cl, SIZEOF_XMMWORD/4
+    jz          short .column_ld8
+    sub         ecx, byte SIZEOF_XMMWORD/4
+    movdqa      xmmE, xmmA
+    movdqu      xmmA, XMMWORD [esi+ecx*RGB_PIXELSIZE]
+.column_ld8:
+    test        cl, SIZEOF_XMMWORD/2
+    mov         ecx, SIZEOF_XMMWORD
+    jz          short .rgb_gray_cnv
+    movdqa      xmmF, xmmA
+    movdqa      xmmH, xmmE
+    movdqu      xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+    movdqu      xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
+    jmp         short .rgb_gray_cnv
+    alignx      16, 7
+
+.columnloop:
+    movdqu      xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+    movdqu      xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
+    movdqu      xmmF, XMMWORD [esi+2*SIZEOF_XMMWORD]
+    movdqu      xmmH, XMMWORD [esi+3*SIZEOF_XMMWORD]
+
+.rgb_gray_cnv:
+    ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
+    ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+    ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
+    ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+
+    movdqa      xmmD, xmmA
+    punpcklbw   xmmA, xmmE      ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
+    punpckhbw   xmmD, xmmE      ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
+
+    movdqa      xmmC, xmmF
+    punpcklbw   xmmF, xmmH      ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
+    punpckhbw   xmmC, xmmH      ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
+
+    movdqa      xmmB, xmmA
+    punpcklwd   xmmA, xmmF      ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
+    punpckhwd   xmmB, xmmF      ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
+
+    movdqa      xmmG, xmmD
+    punpcklwd   xmmD, xmmC      ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
+    punpckhwd   xmmG, xmmC      ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
+
+    movdqa      xmmE, xmmA
+    punpcklbw   xmmA, xmmD      ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
+    punpckhbw   xmmE, xmmD      ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
+
+    movdqa      xmmH, xmmB
+    punpcklbw   xmmB, xmmG      ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
+    punpckhbw   xmmH, xmmG      ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
+
+    pxor        xmmF, xmmF
+
+    movdqa      xmmC, xmmA
+    punpcklbw   xmmA, xmmF      ; xmmA=(00 02 04 06 08 0A 0C 0E)
+    punpckhbw   xmmC, xmmF      ; xmmC=(10 12 14 16 18 1A 1C 1E)
+
+    movdqa      xmmD, xmmB
+    punpcklbw   xmmB, xmmF      ; xmmB=(01 03 05 07 09 0B 0D 0F)
+    punpckhbw   xmmD, xmmF      ; xmmD=(11 13 15 17 19 1B 1D 1F)
+
+    movdqa      xmmG, xmmE
+    punpcklbw   xmmE, xmmF      ; xmmE=(20 22 24 26 28 2A 2C 2E)
+    punpckhbw   xmmG, xmmF      ; xmmG=(30 32 34 36 38 3A 3C 3E)
+
+    punpcklbw   xmmF, xmmH
+    punpckhbw   xmmH, xmmH
+    psrlw       xmmF, BYTE_BIT  ; xmmF=(21 23 25 27 29 2B 2D 2F)
+    psrlw       xmmH, BYTE_BIT  ; xmmH=(31 33 35 37 39 3B 3D 3F)
+
+%endif  ; RGB_PIXELSIZE ; ---------------
+
+    ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
+    ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
+
+    ; (Original)
+    ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
+    ;
+    ; (This implementation)
+    ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+
+    movdqa      xmm6, xmm1
+    punpcklwd   xmm1, xmm3
+    punpckhwd   xmm6, xmm3
+    pmaddwd     xmm1, [GOTOFF(eax,PW_F0299_F0337)]  ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
+    pmaddwd     xmm6, [GOTOFF(eax,PW_F0299_F0337)]  ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+    movdqa      xmm7, xmm6              ; xmm7=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+    movdqa      xmm6, xmm0
+    punpcklwd   xmm0, xmm2
+    punpckhwd   xmm6, xmm2
+    pmaddwd     xmm0, [GOTOFF(eax,PW_F0299_F0337)]  ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
+    pmaddwd     xmm6, [GOTOFF(eax,PW_F0299_F0337)]  ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
+
+    movdqa      XMMWORD [wk(0)], xmm0   ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
+    movdqa      XMMWORD [wk(1)], xmm6   ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)
+
+    movdqa      xmm0, xmm5              ; xmm0=BO
+    movdqa      xmm6, xmm4              ; xmm6=BE
+
+    movdqa      xmm4, xmm0
+    punpcklwd   xmm0, xmm3
+    punpckhwd   xmm4, xmm3
+    pmaddwd     xmm0, [GOTOFF(eax,PW_F0114_F0250)]  ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
+    pmaddwd     xmm4, [GOTOFF(eax,PW_F0114_F0250)]  ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
+
+    movdqa      xmm3, [GOTOFF(eax,PD_ONEHALF)]      ; xmm3=[PD_ONEHALF]
+
+    paddd       xmm0, xmm1
+    paddd       xmm4, xmm7
+    paddd       xmm0, xmm3
+    paddd       xmm4, xmm3
+    psrld       xmm0, SCALEBITS         ; xmm0=YOL
+    psrld       xmm4, SCALEBITS         ; xmm4=YOH
+    packssdw    xmm0, xmm4              ; xmm0=YO
+
+    movdqa      xmm4, xmm6
+    punpcklwd   xmm6, xmm2
+    punpckhwd   xmm4, xmm2
+    pmaddwd     xmm6, [GOTOFF(eax,PW_F0114_F0250)]  ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
+    pmaddwd     xmm4, [GOTOFF(eax,PW_F0114_F0250)]  ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
+
+    movdqa      xmm2, [GOTOFF(eax,PD_ONEHALF)]      ; xmm2=[PD_ONEHALF]
+
+    paddd       xmm6, XMMWORD [wk(0)]
+    paddd       xmm4, XMMWORD [wk(1)]
+    paddd       xmm6, xmm2
+    paddd       xmm4, xmm2
+    psrld       xmm6, SCALEBITS         ; xmm6=YEL
+    psrld       xmm4, SCALEBITS         ; xmm4=YEH
+    packssdw    xmm6, xmm4              ; xmm6=YE
+
+    psllw       xmm0, BYTE_BIT
+    por         xmm6, xmm0              ; xmm6=Y
+    movdqa      XMMWORD [edi], xmm6     ; Save Y
+
+    sub         ecx, byte SIZEOF_XMMWORD
+    add         esi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; inptr
+    add         edi, byte SIZEOF_XMMWORD                ; outptr0
+    cmp         ecx, byte SIZEOF_XMMWORD
+    jae         near .columnloop
+    test        ecx, ecx
+    jnz         near .column_ld1
+
+    pop         ecx                     ; col
+    pop         esi
+    pop         edi
+    poppic      eax
+
+    add         esi, byte SIZEOF_JSAMPROW  ; input_buf
+    add         edi, byte SIZEOF_JSAMPROW
+    dec         eax                        ; num_rows
+    jg          near .rowloop
+
+.return:
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/i386/jchuff-sse2.asm b/media/libjpeg/simd/i386/jchuff-sse2.asm
new file mode 100644
index 0000000000..278cf5e83a
--- /dev/null
+++ b/media/libjpeg/simd/i386/jchuff-sse2.asm
@@ -0,0 +1,761 @@
+;
+; jchuff-sse2.asm - Huffman entropy encoding (SSE2)
+;
+; Copyright (C) 2009-2011, 2014-2017, 2019, D. R. Commander.
+; Copyright (C) 2015, Matthieu Darbois.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains an SSE2 implementation for Huffman coding of one block.
+; The following code is based on jchuff.c; see jchuff.c for more details.
+
+%include "jsimdext.inc"
+
+struc working_state
+.next_output_byte:   resp 1     ; => next byte to write in buffer
+.free_in_buffer:     resp 1     ; # of byte spaces remaining in buffer
+.cur.put_buffer.simd resq 1     ; current bit accumulation buffer
+.cur.free_bits       resd 1     ; # of bits available in it
+.cur.last_dc_val     resd 4     ; last DC coef for each component
+.cinfo:              resp 1     ; dump_buffer needs access to this
+endstruc
+
+struc c_derived_tbl
+.ehufco:             resd 256   ; code for each symbol
+.ehufsi:             resb 256   ; length of code for each symbol
+; If no code has been allocated for a symbol S, ehufsi[S] contains 0
+endstruc
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    GLOBAL_DATA(jconst_huff_encode_one_block)
+
+EXTN(jconst_huff_encode_one_block):
+
+    alignz      32
+
+jpeg_mask_bits dq 0x0000, 0x0001, 0x0003, 0x0007
+               dq 0x000f, 0x001f, 0x003f, 0x007f
+               dq 0x00ff, 0x01ff, 0x03ff, 0x07ff
+               dq 0x0fff, 0x1fff, 0x3fff, 0x7fff
+
+times 1 << 14 db 15
+times 1 << 13 db 14
+times 1 << 12 db 13
+times 1 << 11 db 12
+times 1 << 10 db 11
+times 1 <<  9 db 10
+times 1 <<  8 db  9
+times 1 <<  7 db  8
+times 1 <<  6 db  7
+times 1 <<  5 db  6
+times 1 <<  4 db  5
+times 1 <<  3 db  4
+times 1 <<  2 db  3
+times 1 <<  1 db  2
+times 1 <<  0 db  1
+times 1       db  0
+jpeg_nbits_table:
+times 1       db  0
+times 1 <<  0 db  1
+times 1 <<  1 db  2
+times 1 <<  2 db  3
+times 1 <<  3 db  4
+times 1 <<  4 db  5
+times 1 <<  5 db  6
+times 1 <<  6 db  7
+times 1 <<  7 db  8
+times 1 <<  8 db  9
+times 1 <<  9 db 10
+times 1 << 10 db 11
+times 1 << 11 db 12
+times 1 << 12 db 13
+times 1 << 13 db 14
+times 1 << 14 db 15
+
+    alignz      32
+
+%ifdef PIC
+%define NBITS(x)      nbits_base + x
+%else
+%define NBITS(x)      jpeg_nbits_table + x
+%endif
+%define MASK_BITS(x)  NBITS((x) * 8) + (jpeg_mask_bits - jpeg_nbits_table)
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+
+%define mm_put_buffer     mm0
+%define mm_all_0xff       mm1
+%define mm_temp           mm2
+%define mm_nbits          mm3
+%define mm_code_bits      mm3
+%define mm_code           mm4
+%define mm_overflow_bits  mm5
+%define mm_save_nbits     mm6
+
+; Shorthand used to describe SIMD operations:
+; wN:  xmmN treated as eight signed 16-bit values
+; wN[i]:  perform the same operation on all eight signed 16-bit values, i=0..7
+; bN:  xmmN treated as 16 unsigned 8-bit values, or
+;      mmN treated as eight unsigned 8-bit values
+; bN[i]:  perform the same operation on all unsigned 8-bit values,
+;         i=0..15 (SSE register) or i=0..7 (MMX register)
+; Contents of SIMD registers are shown in memory order.
+
+; Fill the bit buffer to capacity with the leading bits from code, then output
+; the bit buffer and put the remaining bits from code into the bit buffer.
+;
+; Usage:
+; code - contains the bits to shift into the bit buffer (LSB-aligned)
+; %1 - temp register
+; %2 - low byte of temp register
+; %3 - second byte of temp register
+; %4-%8 (optional) - extra instructions to execute before the macro completes
+; %9 - the label to which to jump when the macro completes
+;
+; Upon completion, free_bits will be set to the number of remaining bits from
+; code, and put_buffer will contain those remaining bits.  temp and code will
+; be clobbered.
+;
+; This macro encodes any 0xFF bytes as 0xFF 0x00, as does the EMIT_BYTE()
+; macro in jchuff.c.
+
+%macro EMIT_QWORD 9
+%define %%temp   %1
+%define %%tempb  %2
+%define %%temph  %3
+    add         nbits, free_bits             ; nbits += free_bits;
+    neg         free_bits                    ; free_bits = -free_bits;
+    movq        mm_temp, mm_code             ; temp = code;
+    movd        mm_nbits, nbits              ; nbits --> MMX register
+    movd        mm_overflow_bits, free_bits  ; overflow_bits (temp register) = free_bits;
+    neg         free_bits                    ; free_bits = -free_bits;
+    psllq       mm_put_buffer, mm_nbits      ; put_buffer <<= nbits;
+    psrlq       mm_temp, mm_overflow_bits    ; temp >>= overflow_bits;
+    add         free_bits, 64                ; free_bits += 64;
+    por         mm_temp, mm_put_buffer       ; temp |= put_buffer;
+%ifidn %%temp, nbits_base
+    movd        mm_save_nbits, nbits_base    ; save nbits_base
+%endif
+    movq        mm_code_bits, mm_temp        ; code_bits (temp register) = temp;
+    movq        mm_put_buffer, mm_code       ; put_buffer = code;
+    pcmpeqb     mm_temp, mm_all_0xff         ; b_temp[i] = (b_temp[i] == 0xFF ? 0xFF : 0);
+    movq        mm_code, mm_code_bits        ; code = code_bits;
+    psrlq       mm_code_bits, 32             ; code_bits >>= 32;
+    pmovmskb    nbits, mm_temp               ; nbits = 0;  nbits |= ((b_temp[i] >> 7) << i);
+    movd        %%temp, mm_code_bits         ; temp = code_bits;
+    bswap       %%temp                       ; temp = htonl(temp);
+    test        nbits, nbits                 ; if (nbits != 0)  /* Some 0xFF bytes */
+    jnz         %%.SLOW                      ;   goto %%.SLOW
+    mov         dword [buffer], %%temp       ; *(uint32_t)buffer = temp;
+%ifidn %%temp, nbits_base
+    movd        nbits_base, mm_save_nbits    ; restore nbits_base
+%endif
+    %4
+    movd        nbits, mm_code               ; nbits = (uint32_t)(code);
+    %5
+    bswap       nbits                        ; nbits = htonl(nbits);
+    mov         dword [buffer + 4], nbits    ; *(uint32_t)(buffer + 4) = nbits;
+    lea         buffer, [buffer + 8]         ; buffer += 8;
+    %6
+    %7
+    %8
+    jmp %9                                   ; return
+%%.SLOW:
+    ; Execute the equivalent of the EMIT_BYTE() macro in jchuff.c for all 8
+    ; bytes in the qword.
+    mov         byte [buffer], %%tempb     ; buffer[0] = temp[0];
+    cmp         %%tempb, 0xFF              ; Set CF if temp[0] < 0xFF
+    mov         byte [buffer+1], 0         ; buffer[1] = 0;
+    sbb         buffer, -2                 ; buffer -= (-2 + (temp[0] < 0xFF ? 1 : 0));
+    mov         byte [buffer], %%temph     ; buffer[0] = temp[1];
+    cmp         %%temph, 0xFF              ; Set CF if temp[1] < 0xFF
+    mov         byte [buffer+1], 0         ; buffer[1] = 0;
+    sbb         buffer, -2                 ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0));
+    shr         %%temp, 16                 ; temp >>= 16;
+    mov         byte [buffer], %%tempb     ; buffer[0] = temp[0];
+    cmp         %%tempb, 0xFF              ; Set CF if temp[0] < 0xFF
+    mov         byte [buffer+1], 0         ; buffer[1] = 0;
+    sbb         buffer, -2                 ; buffer -= (-2 + (temp[0] < 0xFF ? 1 : 0));
+    mov         byte [buffer], %%temph     ; buffer[0] = temp[1];
+    cmp         %%temph, 0xFF              ; Set CF if temp[1] < 0xFF
+    mov         byte [buffer+1], 0         ; buffer[1] = 0;
+    sbb         buffer, -2                 ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0));
+    movd        nbits, mm_code             ; nbits (temp register) = (uint32_t)(code)
+%ifidn %%temp, nbits_base
+    movd        nbits_base, mm_save_nbits  ; restore nbits_base
+%endif
+    bswap       nbits                      ; nbits = htonl(nbits)
+    mov         byte [buffer], nbitsb      ; buffer[0] = nbits[0];
+    cmp         nbitsb, 0xFF               ; Set CF if nbits[0] < 0xFF
+    mov         byte [buffer+1], 0         ; buffer[1] = 0;
+    sbb         buffer, -2                 ; buffer -= (-2 + (nbits[0] < 0xFF ? 1 : 0));
+    mov         byte [buffer], nbitsh      ; buffer[0] = nbits[1];
+    cmp         nbitsh, 0xFF               ; Set CF if nbits[1] < 0xFF
+    mov         byte [buffer+1], 0         ; buffer[1] = 0;
+    sbb         buffer, -2                 ; buffer -= (-2 + (nbits[1] < 0xFF ? 1 : 0));
+    shr         nbits, 16                  ; nbits >>= 16;
+    mov         byte [buffer], nbitsb      ; buffer[0] = nbits[0];
+    cmp         nbitsb, 0xFF               ; Set CF if nbits[0] < 0xFF
+    mov         byte [buffer+1], 0         ; buffer[1] = 0;
+    sbb         buffer, -2                 ; buffer -= (-2 + (nbits[0] < 0xFF ? 1 : 0));
+    mov         byte [buffer], nbitsh      ; buffer[0] = nbits[1];
+    %4
+    cmp         nbitsh, 0xFF               ; Set CF if nbits[1] < 0xFF
+    mov         byte [buffer+1], 0         ; buffer[1] = 0;
+    sbb         buffer, -2                 ; buffer -= (-2 + (nbits[1] < 0xFF ? 1 : 0));
+    %5
+    %6
+    %7
+    %8
+    jmp %9                                 ; return;
+%endmacro
+
+%macro PUSH 1
+    push        %1
+%assign stack_offset  stack_offset + 4
+%endmacro
+
+%macro POP 1
+    pop         %1
+%assign stack_offset  stack_offset - 4
+%endmacro
+
+; If PIC is defined, load the address of a symbol defined in this file into a
+; register.  Equivalent to
+;   get_GOT     %1
+;   lea         %1, [GOTOFF(%1, %2)]
+; without using the GOT.
+;
+; Usage:
+; %1 - register into which to load the address of the symbol
+; %2 - symbol whose address should be loaded
+; %3 - optional multi-line macro to execute before the symbol address is loaded
+; %4 - optional multi-line macro to execute after the symbol address is loaded
+;
+; If PIC is not defined, then %3 and %4 are executed in order.
+
+%macro GET_SYM 2-4
+%ifdef PIC
+    call        %%.geteip
+%%.ref:
+    %4
+    add         %1, %2 - %%.ref
+    jmp         short %%.done
+    align       32
+%%.geteip:
+    %3          4               ; must adjust stack pointer because of call
+    mov         %1, POINTER [esp]
+    ret
+    align       32
+%%.done:
+%else
+    %3          0
+    %4
+%endif
+%endmacro
+
+;
+; Encode a single block's worth of coefficients.
+;
+; GLOBAL(JOCTET *)
+; jsimd_huff_encode_one_block_sse2(working_state *state, JOCTET *buffer,
+;                                  JCOEFPTR block, int last_dc_val,
+;                                  c_derived_tbl *dctbl, c_derived_tbl *actbl)
+;
+; Stack layout:
+; Function args
+; Return address
+; Saved ebx
+; Saved ebp
+; Saved esi
+; Saved edi <-- esp_save
+; ...
+; esp_save
+; t_ 64*2 bytes (aligned to 128 bytes)
+;
+; esp is used (as t) to point into t_ (data in lower indices is not used once
+; esp passes over them, so this is signal-safe.)  Aligning to 128 bytes allows
+; us to find the rest of the data again.
+;
+; NOTES:
+; When shuffling data, we try to avoid pinsrw as much as possible, since it is
+; slow on many CPUs.  Its reciprocal throughput (issue latency) is 1 even on
+; modern CPUs, so chains of pinsrw instructions (even with different outputs)
+; can limit performance.  pinsrw is a VectorPath instruction on AMD K8 and
+; requires 2 µops (with memory operand) on Intel.  In either case, only one
+; pinsrw instruction can be decoded per cycle (and nothing else if they are
+; back-to-back), so out-of-order execution cannot be used to work around long
+; pinsrw chains (though for Sandy Bridge and later, this may be less of a
+; problem if the code runs from the µop cache.)
+;
+; We use tzcnt instead of bsf without checking for support.  The instruction is
+; executed as bsf on CPUs that don't support tzcnt (encoding is equivalent to
+; rep bsf.)  The destination (first) operand of bsf (and tzcnt on some CPUs) is
+; an input dependency (although the behavior is not formally defined, Intel
+; CPUs usually leave the destination unmodified if the source is zero.)  This
+; can prevent out-of-order execution, so we clear the destination before
+; invoking tzcnt.
+;
+; Initial register allocation
+; eax - frame --> buffer
+; ebx - nbits_base (PIC) / emit_temp
+; ecx - dctbl --> size --> state
+; edx - block --> nbits
+; esi - code_temp --> state --> actbl
+; edi - index_temp --> free_bits
+; esp - t
+; ebp - index
+
+%define frame       eax
+%ifdef PIC
+%define nbits_base  ebx
+%endif
+%define emit_temp   ebx
+%define emit_tempb  bl
+%define emit_temph  bh
+%define dctbl       ecx
+%define block       edx
+%define code_temp   esi
+%define index_temp  edi
+%define t           esp
+%define index       ebp
+
+%assign save_frame  DCTSIZE2 * SIZEOF_WORD
+
+; Step 1: Re-arrange input data according to jpeg_natural_order
+; xx 01 02 03 04 05 06 07      xx 01 08 16 09 02 03 10
+; 08 09 10 11 12 13 14 15      17 24 32 25 18 11 04 05
+; 16 17 18 19 20 21 22 23      12 19 26 33 40 48 41 34
+; 24 25 26 27 28 29 30 31 ==>  27 20 13 06 07 14 21 28
+; 32 33 34 35 36 37 38 39      35 42 49 56 57 50 43 36
+; 40 41 42 43 44 45 46 47      29 22 15 23 30 37 44 51
+; 48 49 50 51 52 53 54 55      58 59 52 45 38 31 39 46
+; 56 57 58 59 60 61 62 63      53 60 61 54 47 55 62 63
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_huff_encode_one_block_sse2)
+
+EXTN(jsimd_huff_encode_one_block_sse2):
+
+%assign stack_offset      0
+%define arg_state         4 + stack_offset
+%define arg_buffer        8 + stack_offset
+%define arg_block        12 + stack_offset
+%define arg_last_dc_val  16 + stack_offset
+%define arg_dctbl        20 + stack_offset
+%define arg_actbl        24 + stack_offset
+
+                                                          ;X: X = code stream
+    mov         block, [esp + arg_block]
+    PUSH        ebx
+    PUSH        ebp
+    movups      xmm3, XMMWORD [block + 0 * SIZEOF_WORD]   ;D: w3 = xx 01 02 03 04 05 06 07
+    PUSH        esi
+    PUSH        edi
+    movdqa      xmm0, xmm3                                ;A: w0 = xx 01 02 03 04 05 06 07
+    mov         frame, esp
+    lea         t, [frame - (save_frame + 4)]
+    movups      xmm1, XMMWORD [block + 8 * SIZEOF_WORD]   ;B: w1 = 08 09 10 11 12 13 14 15
+    and         t, -DCTSIZE2 * SIZEOF_WORD                                             ; t = &t_[0]
+    mov         [t + save_frame], frame
+    pxor        xmm4, xmm4                                ;A: w4[i] = 0;
+    punpckldq   xmm0, xmm1                                ;A: w0 = xx 01 08 09 02 03 10 11
+    pshuflw     xmm0, xmm0, 11001001b                     ;A: w0 = 01 08 xx 09 02 03 10 11
+    pinsrw      xmm0, word [block + 16 * SIZEOF_WORD], 2  ;A: w0 = 01 08 16 09 02 03 10 11
+    punpckhdq   xmm3, xmm1                                ;D: w3 = 04 05 12 13 06 07 14 15
+    punpcklqdq  xmm1, xmm3                                ;B: w1 = 08 09 10 11 04 05 12 13
+    pinsrw      xmm0, word [block + 17 * SIZEOF_WORD], 7  ;A: w0 = 01 08 16 09 02 03 10 17
+                                                          ;A:      (Row 0, offset 1)
+    pcmpgtw     xmm4, xmm0                                ;A: w4[i] = (w0[i] < 0 ? -1 : 0);
+    paddw       xmm0, xmm4                                ;A: w0[i] += w4[i];
+    movaps      XMMWORD [t + 0 * SIZEOF_WORD], xmm0       ;A: t[i] = w0[i];
+
+    movq        xmm2, qword [block + 24 * SIZEOF_WORD]    ;B: w2 = 24 25 26 27 -- -- -- --
+    pshuflw     xmm2, xmm2, 11011000b                     ;B: w2 = 24 26 25 27 -- -- -- --
+    pslldq      xmm1, 1 * SIZEOF_WORD                     ;B: w1 = -- 08 09 10 11 04 05 12
+    movups      xmm5, XMMWORD [block + 48 * SIZEOF_WORD]  ;H: w5 = 48 49 50 51 52 53 54 55
+    movsd       xmm1, xmm2                                ;B: w1 = 24 26 25 27 11 04 05 12
+    punpcklqdq  xmm2, xmm5                                ;C: w2 = 24 26 25 27 48 49 50 51
+    pinsrw      xmm1, word [block + 32 * SIZEOF_WORD], 1  ;B: w1 = 24 32 25 27 11 04 05 12
+    pxor        xmm4, xmm4                                ;A: w4[i] = 0;
+    psrldq      xmm3, 2 * SIZEOF_WORD                     ;D: w3 = 12 13 06 07 14 15 -- --
+    pcmpeqw     xmm0, xmm4                                ;A: w0[i] = (w0[i] == 0 ? -1 : 0);
+    pinsrw      xmm1, word [block + 18 * SIZEOF_WORD], 3  ;B: w1 = 24 32 25 18 11 04 05 12
+                                                          ;        (Row 1, offset 1)
+    pcmpgtw     xmm4, xmm1                                ;B: w4[i] = (w1[i] < 0 ? -1 : 0);
+    paddw       xmm1, xmm4                                ;B: w1[i] += w4[i];
+    movaps      XMMWORD [t + 8 * SIZEOF_WORD], xmm1       ;B: t[i+8] = w1[i];
+    pxor        xmm4, xmm4                                ;B: w4[i] = 0;
+    pcmpeqw     xmm1, xmm4                                ;B: w1[i] = (w1[i] == 0 ? -1 : 0);
+
+    packsswb    xmm0, xmm1                                ;AB: b0[i] = w0[i], b0[i+8] = w1[i]
+                                                          ;    w/ signed saturation
+
+    pinsrw      xmm3, word [block + 20 * SIZEOF_WORD], 0  ;D: w3 = 20 13 06 07 14 15 -- --
+    pinsrw      xmm3, word [block + 21 * SIZEOF_WORD], 5  ;D: w3 = 20 13 06 07 14 21 -- --
+    pinsrw      xmm3, word [block + 28 * SIZEOF_WORD], 6  ;D: w3 = 20 13 06 07 14 21 28 --
+    pinsrw      xmm3, word [block + 35 * SIZEOF_WORD], 7  ;D: w3 = 20 13 06 07 14 21 28 35
+                                                          ;        (Row 3, offset 1)
+    pcmpgtw     xmm4, xmm3                                ;D: w4[i] = (w3[i] < 0 ? -1 : 0);
+    paddw       xmm3, xmm4                                ;D: w3[i] += w4[i];
+    movaps      XMMWORD [t + 24 * SIZEOF_WORD], xmm3      ;D: t[i+24] = w3[i];
+    pxor        xmm4, xmm4                                ;D: w4[i] = 0;
+    pcmpeqw     xmm3, xmm4                                ;D: w3[i] = (w3[i] == 0 ? -1 : 0);
+
+    pinsrw      xmm2, word [block + 19 * SIZEOF_WORD], 0  ;C: w2 = 19 26 25 27 48 49 50 51
+    pinsrw      xmm2, word [block + 33 * SIZEOF_WORD], 2  ;C: w2 = 19 26 33 27 48 49 50 51
+    pinsrw      xmm2, word [block + 40 * SIZEOF_WORD], 3  ;C: w2 = 19 26 33 40 48 49 50 51
+    pinsrw      xmm2, word [block + 41 * SIZEOF_WORD], 5  ;C: w2 = 19 26 33 40 48 41 50 51
+    pinsrw      xmm2, word [block + 34 * SIZEOF_WORD], 6  ;C: w2 = 19 26 33 40 48 41 34 51
+    pinsrw      xmm2, word [block + 27 * SIZEOF_WORD], 7  ;C: w2 = 19 26 33 40 48 41 34 27
+                                                          ;        (Row 2, offset 1)
+    pcmpgtw     xmm4, xmm2                                ;C: w4[i] = (w2[i] < 0 ? -1 : 0);
+    paddw       xmm2, xmm4                                ;C: w2[i] += w4[i];
+    movsx       code_temp, word [block]                   ;Z:     code_temp = block[0];
+
+; %1 - stack pointer adjustment
+%macro GET_SYM_BEFORE 1
+    movaps      XMMWORD [t + 16 * SIZEOF_WORD + %1], xmm2
+                                                          ;C: t[i+16] = w2[i];
+    pxor        xmm4, xmm4                                ;C: w4[i] = 0;
+    pcmpeqw     xmm2, xmm4                                ;C: w2[i] = (w2[i] == 0 ? -1 : 0);
+    sub         code_temp, [frame + arg_last_dc_val]      ;Z:     code_temp -= last_dc_val;
+
+    packsswb    xmm2, xmm3                                ;CD: b2[i] = w2[i], b2[i+8] = w3[i]
+                                                          ;    w/ signed saturation
+
+    movdqa      xmm3, xmm5                                ;H: w3 = 48 49 50 51 52 53 54 55
+    pmovmskb    index_temp, xmm2                          ;Z:     index_temp = 0;  index_temp |= ((b2[i] >> 7) << i);
+    pmovmskb    index, xmm0                               ;Z:     index = 0;  index |= ((b0[i] >> 7) << i);
+    movups      xmm0, XMMWORD [block + 56 * SIZEOF_WORD]  ;H: w0 = 56 57 58 59 60 61 62 63
+    punpckhdq   xmm3, xmm0                                ;H: w3 = 52 53 60 61 54 55 62 63
+    shl         index_temp, 16                            ;Z:     index_temp <<= 16;
+    psrldq      xmm3, 1 * SIZEOF_WORD                     ;H: w3 = 53 60 61 54 55 62 63 --
+    pxor        xmm2, xmm2                                ;H: w2[i] = 0;
+    pshuflw     xmm3, xmm3, 00111001b                     ;H: w3 = 60 61 54 53 55 62 63 --
+    or          index, index_temp                         ;Z:     index |= index_temp;
+%undef index_temp
+%define free_bits  edi
+%endmacro
+
+%macro GET_SYM_AFTER 0
+    movq        xmm1, qword [block + 44 * SIZEOF_WORD]    ;G: w1 = 44 45 46 47 -- -- -- --
+    unpcklps    xmm5, xmm0                                ;E: w5 = 48 49 56 57 50 51 58 59
+    pxor        xmm0, xmm0                                ;H: w0[i] = 0;
+    not         index                                     ;Z:     index = ~index;
+    pinsrw      xmm3, word [block + 47 * SIZEOF_WORD], 3  ;H: w3 = 60 61 54 47 55 62 63 --
+                                                          ;        (Row 7, offset 1)
+    pcmpgtw     xmm2, xmm3                                ;H: w2[i] = (w3[i] < 0 ? -1 : 0);
+    mov         dctbl, [frame + arg_dctbl]
+    paddw       xmm3, xmm2                                ;H: w3[i] += w2[i];
+    movaps      XMMWORD [t + 56 * SIZEOF_WORD], xmm3      ;H: t[i+56] = w3[i];
+    movq        xmm4, qword [block + 36 * SIZEOF_WORD]    ;G: w4 = 36 37 38 39 -- -- -- --
+    pcmpeqw     xmm3, xmm0                                ;H: w3[i] = (w3[i] == 0 ? -1 : 0);
+    punpckldq   xmm4, xmm1                                ;G: w4 = 36 37 44 45 38 39 46 47
+    movdqa      xmm1, xmm4                                ;F: w1 = 36 37 44 45 38 39 46 47
+    pcmpeqw     mm_all_0xff, mm_all_0xff                  ;Z:     all_0xff[i] = 0xFF;
+%endmacro
+
+    GET_SYM     nbits_base, jpeg_nbits_table, GET_SYM_BEFORE, GET_SYM_AFTER
+
+    psrldq      xmm4, 1 * SIZEOF_WORD                     ;G: w4 = 37 44 45 38 39 46 47 --
+    shufpd      xmm1, xmm5, 10b                           ;F: w1 = 36 37 44 45 50 51 58 59
+    pshufhw     xmm4, xmm4, 11010011b                     ;G: w4 = 37 44 45 38 -- 39 46 --
+    pslldq      xmm1, 1 * SIZEOF_WORD                     ;F: w1 = -- 36 37 44 45 50 51 58
+    pinsrw      xmm4, word [block + 59 * SIZEOF_WORD], 0  ;G: w4 = 59 44 45 38 -- 39 46 --
+    pshufd      xmm1, xmm1, 11011000b                     ;F: w1 = -- 36 45 50 37 44 51 58
+    cmp         code_temp, 1 << 31                        ;Z:     Set CF if code_temp < 0x80000000,
+                                                          ;Z:     i.e. if code_temp is positive
+    pinsrw      xmm4, word [block + 52 * SIZEOF_WORD], 1  ;G: w4 = 59 52 45 38 -- 39 46 --
+    movlps      xmm1, qword [block + 20 * SIZEOF_WORD]    ;F: w1 = 20 21 22 23 37 44 51 58
+    pinsrw      xmm4, word [block + 31 * SIZEOF_WORD], 4  ;G: w4 = 59 52 45 38 31 39 46 --
+    pshuflw     xmm1, xmm1, 01110010b                     ;F: w1 = 22 20 23 21 37 44 51 58
+    pinsrw      xmm4, word [block + 53 * SIZEOF_WORD], 7  ;G: w4 = 59 52 45 38 31 39 46 53
+                                                          ;        (Row 6, offset 1)
+    adc         code_temp, -1                             ;Z:     code_temp += -1 + (code_temp >= 0 ? 1 : 0);
+    pxor        xmm2, xmm2                                ;G: w2[i] = 0;
+    pcmpgtw     xmm0, xmm4                                ;G: w0[i] = (w4[i] < 0 ? -1 : 0);
+    pinsrw      xmm1, word [block + 15 * SIZEOF_WORD], 1  ;F: w1 = 22 15 23 21 37 44 51 58
+    paddw       xmm4, xmm0                                ;G: w4[i] += w0[i];
+    movaps      XMMWORD [t + 48 * SIZEOF_WORD], xmm4      ;G: t[48+i] = w4[i];
+    movd        mm_temp, code_temp                        ;Z:     temp = code_temp
+    pinsrw      xmm1, word [block + 30 * SIZEOF_WORD], 3  ;F: w1 = 22 15 23 30 37 44 51 58
+                                                          ;        (Row 5, offset 1)
+    pcmpeqw     xmm4, xmm2                                ;G: w4[i] = (w4[i] == 0 ? -1 : 0);
+
+    packsswb    xmm4, xmm3                                ;GH: b4[i] = w4[i], b4[i+8] = w3[i]
+                                                          ;    w/ signed saturation
+
+    lea         t, [t - SIZEOF_WORD]                      ;Z:     t = &t[-1]
+    pxor        xmm0, xmm0                                ;F: w0[i] = 0;
+    pcmpgtw     xmm2, xmm1                                ;F: w2[i] = (w1[i] < 0 ? -1 : 0);
+    paddw       xmm1, xmm2                                ;F: w1[i] += w2[i];
+    movaps      XMMWORD [t + (40+1) * SIZEOF_WORD], xmm1  ;F: t[40+i] = w1[i];
+    pcmpeqw     xmm1, xmm0                                ;F: w1[i] = (w1[i] == 0 ? -1 : 0);
+    pinsrw      xmm5, word [block + 42 * SIZEOF_WORD], 0  ;E: w5 = 42 49 56 57 50 51 58 59
+    pinsrw      xmm5, word [block + 43 * SIZEOF_WORD], 5  ;E: w5 = 42 49 56 57 50 43 58 59
+    pinsrw      xmm5, word [block + 36 * SIZEOF_WORD], 6  ;E: w5 = 42 49 56 57 50 43 36 59
+    pinsrw      xmm5, word [block + 29 * SIZEOF_WORD], 7  ;E: w5 = 42 49 56 57 50 43 36 29
+                                                          ;        (Row 4, offset 1)
+%undef block
+%define nbits  edx
+%define nbitsb  dl
+%define nbitsh  dh
+    movzx       nbits, byte [NBITS(code_temp)]            ;Z:     nbits = JPEG_NBITS(code_temp);
+%undef code_temp
+%define state  esi
+    pxor        xmm2, xmm2                                ;E: w2[i] = 0;
+    mov         state, [frame + arg_state]
+    movd        mm_nbits, nbits                           ;Z:     nbits --> MMX register
+    pcmpgtw     xmm0, xmm5                                ;E: w0[i] = (w5[i] < 0 ? -1 : 0);
+    movd        mm_code, dword [dctbl + c_derived_tbl.ehufco + nbits * 4]
+                                                          ;Z:     code = dctbl->ehufco[nbits];
+%define size  ecx
+%define sizeb  cl
+%define sizeh  ch
+    paddw       xmm5, xmm0                                ;E: w5[i] += w0[i];
+    movaps      XMMWORD [t + (32+1) * SIZEOF_WORD], xmm5  ;E: t[32+i] = w5[i];
+    movzx       size, byte [dctbl + c_derived_tbl.ehufsi + nbits]
+                                                          ;Z:     size = dctbl->ehufsi[nbits];
+%undef dctbl
+    pcmpeqw     xmm5, xmm2                                ;E: w5[i] = (w5[i] == 0 ? -1 : 0);
+
+    packsswb    xmm5, xmm1                                ;EF: b5[i] = w5[i], b5[i+8] = w1[i]
+                                                          ;    w/ signed saturation
+
+    movq        mm_put_buffer, [state + working_state.cur.put_buffer.simd]
+                                                          ;Z:     put_buffer = state->cur.put_buffer.simd;
+    mov         free_bits, [state + working_state.cur.free_bits]
+                                                          ;Z:     free_bits = state->cur.free_bits;
+%undef state
+%define actbl  esi
+    mov         actbl, [frame + arg_actbl]
+%define buffer  eax
+    mov         buffer, [frame + arg_buffer]
+%undef frame
+    jmp        .BEGIN
+
+; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+    align       16
+; size <= 32, so this is not really a loop
+.BRLOOP1:                                                 ; .BRLOOP1:
+    movzx       nbits, byte [actbl + c_derived_tbl.ehufsi + 0xf0]
+                                                          ; nbits = actbl->ehufsi[0xf0];
+    movd        mm_code, dword [actbl + c_derived_tbl.ehufco + 0xf0 * 4]
+                                                          ; code = actbl->ehufco[0xf0];
+    and         index, 0x7ffffff                          ; clear index if size == 32
+    sub         size, 16                                  ; size -= 16;
+    sub         free_bits, nbits                          ; if ((free_bits -= nbits) <= 0)
+    jle         .EMIT_BRLOOP1                             ;   goto .EMIT_BRLOOP1;
+    movd        mm_nbits, nbits                           ; nbits --> MMX register
+    psllq       mm_put_buffer, mm_nbits                   ; put_buffer <<= nbits;
+    por         mm_put_buffer, mm_code                    ; put_buffer |= code;
+    jmp         .ERLOOP1                                  ; goto .ERLOOP1;
+
+; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+    align       16
+%ifdef PIC
+    times 6     nop
+%else
+    times 2     nop
+%endif
+.BLOOP1:                                                  ; do {  /* size = # of zero bits/elements to skip */
+; if size == 32, index remains unchanged.  Correct in .BRLOOP.
+    shr         index, sizeb                              ;   index >>= size;
+    lea         t, [t + size * SIZEOF_WORD]               ;   t += size;
+    cmp         size, 16                                  ;   if (size > 16)
+    jg          .BRLOOP1                                  ;     goto .BRLOOP1;
+.ERLOOP1:                                                 ; .ERLOOP1:
+    movsx       nbits, word [t]                           ;   nbits = *t;
+%ifdef PIC
+    add         size, size                                ;   size += size;
+%else
+    lea         size, [size * 2]                          ;   size += size;
+%endif
+    movd        mm_temp, nbits                            ;   temp = nbits;
+    movzx       nbits, byte [NBITS(nbits)]                ;   nbits = JPEG_NBITS(nbits);
+    lea         size, [size * 8 + nbits]                  ;   size = size * 8 + nbits;
+    movd        mm_nbits, nbits                           ;   nbits --> MMX register
+    movd        mm_code, dword [actbl + c_derived_tbl.ehufco + (size - 16) * 4]
+                                                          ;   code = actbl->ehufco[size-16];
+    movzx       size, byte [actbl + c_derived_tbl.ehufsi + (size - 16)]
+                                                          ;   size = actbl->ehufsi[size-16];
+.BEGIN:                                                   ; .BEGIN:
+    pand        mm_temp, [MASK_BITS(nbits)]               ;   temp &= (1 << nbits) - 1;
+    psllq       mm_code, mm_nbits                         ;   code <<= nbits;
+    add         nbits, size                               ;   nbits += size;
+    por         mm_code, mm_temp                          ;   code |= temp;
+    sub         free_bits, nbits                          ;   if ((free_bits -= nbits) <= 0)
+    jle         .EMIT_ERLOOP1                             ;     insert code, flush buffer, init size, goto .BLOOP1
+    xor         size, size                                ;   size = 0;  /* kill tzcnt input dependency */
+    tzcnt       size, index                               ;   size = # of trailing 0 bits in index
+    movd        mm_nbits, nbits                           ;   nbits --> MMX register
+    psllq       mm_put_buffer, mm_nbits                   ;   put_buffer <<= nbits;
+    inc         size                                      ;   ++size;
+    por         mm_put_buffer, mm_code                    ;   put_buffer |= code;
+    test        index, index
+    jnz         .BLOOP1                                   ; } while (index != 0);
+; Round 2
+; t points to the last used word, possibly below t_ if the previous index had 32 zero bits.
+.ELOOP1:                                                  ; .ELOOP1:
+    pmovmskb    size, xmm4                                ; size = 0;  size |= ((b4[i] >> 7) << i);
+    pmovmskb    index, xmm5                               ; index = 0;  index |= ((b5[i] >> 7) << i);
+    shl         size, 16                                  ; size <<= 16;
+    or          index, size                               ; index |= size;
+    not         index                                     ; index = ~index;
+    lea         nbits, [t + (1 + DCTSIZE2) * SIZEOF_WORD]
+                                                          ; nbits = t + 1 + 64;
+    and         nbits, -DCTSIZE2 * SIZEOF_WORD            ; nbits &= -128;  /* now points to &t_[64] */
+    sub         nbits, t                                  ; nbits -= t;
+    shr         nbits, 1                                  ; nbits >>= 1;  /* # of leading 0 bits in old index + 33 */
+    tzcnt       size, index                               ; size = # of trailing 0 bits in index
+    inc         size                                      ; ++size;
+    test        index, index                              ; if (index == 0)
+    jz          .ELOOP2                                   ;   goto .ELOOP2;
+; NOTE: size == 32 cannot happen, since the last element is always 0.
+    shr         index, sizeb                              ; index >>= size;
+    lea         size, [size + nbits - 33]                 ; size = size + nbits - 33;
+    lea         t, [t + size * SIZEOF_WORD]               ; t += size;
+    cmp         size, 16                                  ; if (size <= 16)
+    jle         .ERLOOP2                                  ;   goto .ERLOOP2;
+.BRLOOP2:                                                 ; do {
+    movzx       nbits, byte [actbl + c_derived_tbl.ehufsi + 0xf0]
+                                                          ;   nbits = actbl->ehufsi[0xf0];
+    sub         size, 16                                  ;   size -= 16;
+    movd        mm_code, dword [actbl + c_derived_tbl.ehufco + 0xf0 * 4]
+                                                          ;   code = actbl->ehufco[0xf0];
+    sub         free_bits, nbits                          ;   if ((free_bits -= nbits) <= 0)
+    jle         .EMIT_BRLOOP2                             ;     insert code and flush put_buffer
+    movd        mm_nbits, nbits                           ;   else { nbits --> MMX register
+    psllq       mm_put_buffer, mm_nbits                   ;     put_buffer <<= nbits;
+    por         mm_put_buffer, mm_code                    ;     put_buffer |= code;
+    cmp         size, 16                                  ;     if (size <= 16)
+    jle        .ERLOOP2                                   ;       goto .ERLOOP2;
+    jmp        .BRLOOP2                                   ; } while (1);
+
+; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+    align      16
+.BLOOP2:                                                  ; do {  /* size = # of zero bits/elements to skip */
+    shr         index, sizeb                              ;   index >>= size;
+    lea         t, [t + size * SIZEOF_WORD]               ;   t += size;
+    cmp         size, 16                                  ;   if (size > 16)
+    jg          .BRLOOP2                                  ;     goto .BRLOOP2;
+.ERLOOP2:                                                 ; .ERLOOP2:
+    movsx       nbits, word [t]                           ;   nbits = *t;
+    add         size, size                                ;   size += size;
+    movd        mm_temp, nbits                            ;   temp = nbits;
+    movzx       nbits, byte [NBITS(nbits)]                ;   nbits = JPEG_NBITS(nbits);
+    movd        mm_nbits, nbits                           ;   nbits --> MMX register
+    lea         size, [size * 8 + nbits]                  ;   size = size * 8 + nbits;
+    movd        mm_code, dword [actbl + c_derived_tbl.ehufco + (size - 16) * 4]
+                                                          ;   code = actbl->ehufco[size-16];
+    movzx       size, byte [actbl + c_derived_tbl.ehufsi + (size - 16)]
+                                                          ;   size = actbl->ehufsi[size-16];
+    psllq       mm_code, mm_nbits                         ;   code <<= nbits;
+    pand        mm_temp, [MASK_BITS(nbits)]               ;   temp &= (1 << nbits) - 1;
+    lea         nbits, [nbits + size]                     ;   nbits += size;
+    por         mm_code, mm_temp                          ;   code |= temp;
+    xor         size, size                                ;   size = 0;  /* kill tzcnt input dependency */
+    sub         free_bits, nbits                          ;   if ((free_bits -= nbits) <= 0)
+    jle         .EMIT_ERLOOP2                             ;     insert code, flush buffer, init size, goto .BLOOP2
+    tzcnt       size, index                               ;   size = # of trailing 0 bits in index
+    movd        mm_nbits, nbits                           ;   nbits --> MMX register
+    psllq       mm_put_buffer, mm_nbits                   ;   put_buffer <<= nbits;
+    inc         size                                      ;   ++size;
+    por         mm_put_buffer, mm_code                    ;   put_buffer |= code;
+    test        index, index
+    jnz         .BLOOP2                                   ; } while (index != 0);
+.ELOOP2:                                                  ; .ELOOP2:
+    mov         nbits, t                                  ; nbits = t;
+    lea         t, [t + SIZEOF_WORD]                      ; t = &t[1];
+    and         nbits, DCTSIZE2 * SIZEOF_WORD - 1         ; nbits &= 127;
+    and         t, -DCTSIZE2 * SIZEOF_WORD                ; t &= -128;  /* t = &t_[0]; */
+    cmp         nbits, (DCTSIZE2 - 2) * SIZEOF_WORD       ; if (nbits != 62 * 2)
+    je          .EFN                                      ; {
+    movd        mm_code, dword [actbl + c_derived_tbl.ehufco + 0]
+                                                          ;   code = actbl->ehufco[0];
+    movzx       nbits, byte [actbl + c_derived_tbl.ehufsi + 0]
+                                                          ;   nbits = actbl->ehufsi[0];
+    sub         free_bits, nbits                          ;   if ((free_bits -= nbits) <= 0)
+    jg          .EFN_SKIP_EMIT_CODE                       ;   {
+    EMIT_QWORD  size, sizeb, sizeh, , , , , , .EFN        ;     insert code, flush put_buffer
+    align       16
+.EFN_SKIP_EMIT_CODE:                                      ;   } else {
+    movd        mm_nbits, nbits                           ;     nbits --> MMX register
+    psllq       mm_put_buffer, mm_nbits                   ;     put_buffer <<= nbits;
+    por         mm_put_buffer, mm_code                    ;     put_buffer |= code;
+.EFN:                                                     ; } }
+%define frame  esp
+    mov         frame, [t + save_frame]
+%define state  ecx
+    mov         state, [frame + arg_state]
+    movq        [state + working_state.cur.put_buffer.simd], mm_put_buffer
+                                                          ; state->cur.put_buffer.simd = put_buffer;
+    emms
+    mov         [state + working_state.cur.free_bits], free_bits
+                                                          ; state->cur.free_bits = free_bits;
+    POP         edi
+    POP         esi
+    POP         ebp
+    POP         ebx
+    ret
+
+; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+    align       16
+.EMIT_BRLOOP1:
+    EMIT_QWORD  emit_temp, emit_tempb, emit_temph, , , , , , \
+      .ERLOOP1
+
+; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+    align       16
+.EMIT_ERLOOP1:
+    EMIT_QWORD  size, sizeb, sizeh, \
+      { xor     size, size }, \
+      { tzcnt   size, index }, \
+      { inc     size }, \
+      { test    index, index }, \
+      { jnz     .BLOOP1 }, \
+      .ELOOP1
+
+; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+    align       16
+.EMIT_BRLOOP2:
+    EMIT_QWORD  emit_temp, emit_tempb, emit_temph, , , , \
+      { cmp     size, 16 }, \
+      { jle     .ERLOOP2 }, \
+      .BRLOOP2
+
+; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+    align       16
+.EMIT_ERLOOP2:
+    EMIT_QWORD  size, sizeb, sizeh, \
+      { xor     size, size }, \
+      { tzcnt   size, index }, \
+      { inc     size }, \
+      { test    index, index }, \
+      { jnz     .BLOOP2 }, \
+      .ELOOP2
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/i386/jcphuff-sse2.asm b/media/libjpeg/simd/i386/jcphuff-sse2.asm
new file mode 100644
index 0000000000..c26b48a47d
--- /dev/null
+++ b/media/libjpeg/simd/i386/jcphuff-sse2.asm
@@ -0,0 +1,662 @@
+;
+; jcphuff-sse2.asm - prepare data for progressive Huffman encoding (SSE2)
+;
+; Copyright (C) 2016, 2018, Matthieu Darbois
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains an SSE2 implementation of data preparation for progressive
+; Huffman encoding.  See jcphuff.c for more details.
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+
+; --------------------------------------------------------------------------
+; Macros to load data for jsimd_encode_mcu_AC_first_prepare_sse2() and
+; jsimd_encode_mcu_AC_refine_prepare_sse2()
+
+%macro LOAD16 0
+    pxor        N0, N0
+    pxor        N1, N1
+
+    mov         T0, INT [LUT +  0*SIZEOF_INT]
+    mov         T1, INT [LUT +  8*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 0
+    pinsrw      X1, word [BLOCK + T1 * 2], 0
+
+    mov         T0, INT [LUT +  1*SIZEOF_INT]
+    mov         T1, INT [LUT +  9*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 1
+    pinsrw      X1, word [BLOCK + T1 * 2], 1
+
+    mov         T0, INT [LUT +  2*SIZEOF_INT]
+    mov         T1, INT [LUT + 10*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 2
+    pinsrw      X1, word [BLOCK + T1 * 2], 2
+
+    mov         T0, INT [LUT +  3*SIZEOF_INT]
+    mov         T1, INT [LUT + 11*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 3
+    pinsrw      X1, word [BLOCK + T1 * 2], 3
+
+    mov         T0, INT [LUT +  4*SIZEOF_INT]
+    mov         T1, INT [LUT + 12*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 4
+    pinsrw      X1, word [BLOCK + T1 * 2], 4
+
+    mov         T0, INT [LUT +  5*SIZEOF_INT]
+    mov         T1, INT [LUT + 13*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 5
+    pinsrw      X1, word [BLOCK + T1 * 2], 5
+
+    mov         T0, INT [LUT +  6*SIZEOF_INT]
+    mov         T1, INT [LUT + 14*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 6
+    pinsrw      X1, word [BLOCK + T1 * 2], 6
+
+    mov         T0, INT [LUT +  7*SIZEOF_INT]
+    mov         T1, INT [LUT + 15*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 7
+    pinsrw      X1, word [BLOCK + T1 * 2], 7
+%endmacro
+
+%macro LOAD15 0
+    pxor        N0, N0
+    pxor        N1, N1
+    pxor        X1, X1
+
+    mov         T0, INT [LUT +  0*SIZEOF_INT]
+    mov         T1, INT [LUT +  8*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 0
+    pinsrw      X1, word [BLOCK + T1 * 2], 0
+
+    mov         T0, INT [LUT +  1*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 1
+
+    mov         T0, INT [LUT +  2*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 2
+
+    mov         T0, INT [LUT +  3*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 3
+
+    mov         T0, INT [LUT +  4*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 4
+
+    mov         T0, INT [LUT +  5*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 5
+
+    mov         T0, INT [LUT +  6*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 6
+
+    mov         T0, INT [LUT +  7*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 7
+
+    cmp         LENEND, 2
+    jl          %%.ELOAD15
+    mov         T1, INT [LUT +  9*SIZEOF_INT]
+    pinsrw      X1, word [BLOCK + T1 * 2], 1
+
+    cmp         LENEND, 3
+    jl          %%.ELOAD15
+    mov         T1, INT [LUT + 10*SIZEOF_INT]
+    pinsrw      X1, word [BLOCK + T1 * 2], 2
+
+    cmp         LENEND, 4
+    jl          %%.ELOAD15
+    mov         T1, INT [LUT + 11*SIZEOF_INT]
+    pinsrw      X1, word [BLOCK + T1 * 2], 3
+
+    cmp         LENEND, 5
+    jl          %%.ELOAD15
+    mov         T1, INT [LUT + 12*SIZEOF_INT]
+    pinsrw      X1, word [BLOCK + T1 * 2], 4
+
+    cmp         LENEND, 6
+    jl          %%.ELOAD15
+    mov         T1, INT [LUT + 13*SIZEOF_INT]
+    pinsrw      X1, word [BLOCK + T1 * 2], 5
+
+    cmp         LENEND, 7
+    jl          %%.ELOAD15
+    mov         T1, INT [LUT + 14*SIZEOF_INT]
+    pinsrw      X1, word [BLOCK + T1 * 2], 6
+%%.ELOAD15:
+%endmacro
+
+%macro LOAD8 0
+    pxor        N0, N0
+
+    mov         T0, INT [LUT +  0*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 0
+
+    mov         T0, INT [LUT +  1*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 1
+
+    mov         T0, INT [LUT +  2*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 2
+
+    mov         T0, INT [LUT +  3*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 3
+
+    mov         T0, INT [LUT +  4*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 4
+
+    mov         T0, INT [LUT +  5*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 5
+
+    mov         T0, INT [LUT +  6*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 6
+
+    mov         T0, INT [LUT +  7*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 7
+%endmacro
+
+%macro LOAD7 0
+    pxor        N0, N0
+    pxor        X0, X0
+
+    mov         T1, INT [LUT +  0*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T1 * 2], 0
+
+    cmp         LENEND, 2
+    jl          %%.ELOAD7
+    mov         T1, INT [LUT +  1*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T1 * 2], 1
+
+    cmp         LENEND, 3
+    jl          %%.ELOAD7
+    mov         T1, INT [LUT +  2*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T1 * 2], 2
+
+    cmp         LENEND, 4
+    jl          %%.ELOAD7
+    mov         T1, INT [LUT +  3*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T1 * 2], 3
+
+    cmp         LENEND, 5
+    jl          %%.ELOAD7
+    mov         T1, INT [LUT +  4*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T1 * 2], 4
+
+    cmp         LENEND, 6
+    jl          %%.ELOAD7
+    mov         T1, INT [LUT +  5*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T1 * 2], 5
+
+    cmp         LENEND, 7
+    jl          %%.ELOAD7
+    mov         T1, INT [LUT +  6*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T1 * 2], 6
+%%.ELOAD7:
+%endmacro
+
+%macro REDUCE0 0
+    movdqa      xmm0, XMMWORD [VALUES + ( 0*2)]
+    movdqa      xmm1, XMMWORD [VALUES + ( 8*2)]
+    movdqa      xmm2, XMMWORD [VALUES + (16*2)]
+    movdqa      xmm3, XMMWORD [VALUES + (24*2)]
+    movdqa      xmm4, XMMWORD [VALUES + (32*2)]
+    movdqa      xmm5, XMMWORD [VALUES + (40*2)]
+    movdqa      xmm6, XMMWORD [VALUES + (48*2)]
+
+    pcmpeqw     xmm0, ZERO
+    pcmpeqw     xmm1, ZERO
+    pcmpeqw     xmm2, ZERO
+    pcmpeqw     xmm3, ZERO
+    pcmpeqw     xmm4, ZERO
+    pcmpeqw     xmm5, ZERO
+    pcmpeqw     xmm6, ZERO
+    pcmpeqw     xmm7, XMMWORD [VALUES + (56*2)]
+
+    packsswb    xmm0, xmm1
+    packsswb    xmm2, xmm3
+    packsswb    xmm4, xmm5
+    packsswb    xmm6, xmm7
+
+    pmovmskb    eax, xmm0
+    pmovmskb    ecx, xmm2
+    pmovmskb    edx, xmm4
+    pmovmskb    esi, xmm6
+
+    shl         ecx, 16
+    shl         esi, 16
+
+    or          eax, ecx
+    or          edx, esi
+
+    not         eax
+    not         edx
+
+    mov         edi, ZEROBITS
+
+    mov         INT [edi], eax
+    mov         INT [edi+SIZEOF_INT], edx
+%endmacro
+
+;
+; Prepare data for jsimd_encode_mcu_AC_first().
+;
+; GLOBAL(void)
+; jsimd_encode_mcu_AC_first_prepare_sse2(const JCOEF *block,
+;                                        const int *jpeg_natural_order_start,
+;                                        int Sl, int Al, JCOEF *values,
+;                                        size_t *zerobits)
+;
+; eax + 8 = const JCOEF *block
+; eax + 12 = const int *jpeg_natural_order_start
+; eax + 16 = int Sl
+; eax + 20 = int Al
+; eax + 24 = JCOEF *values
+; eax + 28 = size_t *zerobits
+
+%define ZERO    xmm7
+%define X0      xmm0
+%define X1      xmm1
+%define N0      xmm2
+%define N1      xmm3
+%define AL      xmm4
+%define K       eax
+%define LENEND  eax
+%define LUT     ebx
+%define T0      ecx
+%define T1      edx
+%define BLOCK   esi
+%define VALUES  edi
+%define LEN     ebp
+
+%define ZEROBITS  INT [esp + 5 * 4]
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_encode_mcu_AC_first_prepare_sse2)
+
+EXTN(jsimd_encode_mcu_AC_first_prepare_sse2):
+    push        ebp
+    mov         eax, esp                     ; eax = original ebp
+    sub         esp, byte 4
+    and         esp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [esp], eax
+    mov         ebp, esp                     ; ebp = aligned ebp
+    sub         esp, 4
+    push        ebx
+    push        ecx
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+    push        ebp
+
+    mov         BLOCK, INT [eax + 8]
+    mov         LUT, INT [eax + 12]
+    mov         VALUES, INT [eax + 24]
+    movd        AL, INT [eax + 20]
+    mov         T0, INT [eax + 28]
+    mov         ZEROBITS, T0
+    mov         LEN, INT [eax + 16]
+    pxor        ZERO, ZERO
+    mov         K, LEN
+    and         K, -16
+    shr         K, 4
+    jz          .ELOOP16
+.BLOOP16:
+    LOAD16
+    pcmpgtw     N0, X0
+    pcmpgtw     N1, X1
+    paddw       X0, N0
+    paddw       X1, N1
+    pxor        X0, N0
+    pxor        X1, N1
+    psrlw       X0, AL
+    psrlw       X1, AL
+    pxor        N0, X0
+    pxor        N1, X1
+    movdqa      XMMWORD [VALUES + (0) * 2], X0
+    movdqa      XMMWORD [VALUES + (8) * 2], X1
+    movdqa      XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
+    movdqa      XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1
+    add         VALUES, 16*2
+    add         LUT, 16*SIZEOF_INT
+    dec         K
+    jnz         .BLOOP16
+    test        LEN, 15
+    je          .PADDING
+.ELOOP16:
+    mov         LENEND, LEN
+    and         LENEND, 7
+
+    test        LEN, 8
+    jz          .TRY7
+    test        LEN, 7
+    jz          .TRY8
+
+    LOAD15
+    pcmpgtw     N0, X0
+    pcmpgtw     N1, X1
+    paddw       X0, N0
+    paddw       X1, N1
+    pxor        X0, N0
+    pxor        X1, N1
+    psrlw       X0, AL
+    psrlw       X1, AL
+    pxor        N0, X0
+    pxor        N1, X1
+    movdqa      XMMWORD [VALUES + (0) * 2], X0
+    movdqa      XMMWORD [VALUES + (8) * 2], X1
+    movdqa      XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
+    movdqa      XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1
+    add         VALUES, 16*2
+    jmp         .PADDING
+.TRY8:
+    LOAD8
+    pcmpgtw     N0, X0
+    paddw       X0, N0
+    pxor        X0, N0
+    psrlw       X0, AL
+    pxor        N0, X0
+    movdqa      XMMWORD [VALUES + (0) * 2], X0
+    movdqa      XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
+    add         VALUES, 8*2
+    jmp         .PADDING
+.TRY7:
+    LOAD7
+    pcmpgtw     N0, X0
+    paddw       X0, N0
+    pxor        X0, N0
+    psrlw       X0, AL
+    pxor        N0, X0
+    movdqa      XMMWORD [VALUES + (0) * 2], X0
+    movdqa      XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
+    add         VALUES, 8*2
+.PADDING:
+    mov         K, LEN
+    add         K, 7
+    and         K, -8
+    shr         K, 3
+    sub         K, DCTSIZE2/8
+    jz          .EPADDING
+    align       16
+.ZEROLOOP:
+    movdqa      XMMWORD [VALUES + 0], ZERO
+    add         VALUES, 8*2
+    inc         K
+    jnz         .ZEROLOOP
+.EPADDING:
+    sub         VALUES, DCTSIZE2*2
+
+    REDUCE0
+
+    pop         ebp
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+    pop         ecx
+    pop         ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+%undef ZERO
+%undef X0
+%undef X1
+%undef N0
+%undef N1
+%undef AL
+%undef K
+%undef LUT
+%undef T0
+%undef T1
+%undef BLOCK
+%undef VALUES
+%undef LEN
+
+;
+; Prepare data for jsimd_encode_mcu_AC_refine().
+;
+; GLOBAL(int)
+; jsimd_encode_mcu_AC_refine_prepare_sse2(const JCOEF *block,
+;                                         const int *jpeg_natural_order_start,
+;                                         int Sl, int Al, JCOEF *absvalues,
+;                                         size_t *bits)
+;
+; eax + 8 = const JCOEF *block
+; eax + 12 = const int *jpeg_natural_order_start
+; eax + 16 = int Sl
+; eax + 20 = int Al
+; eax + 24 = JCOEF *values
+; eax + 28 = size_t *bits
+
+%define ZERO    xmm7
+%define ONE     xmm5
+%define X0      xmm0
+%define X1      xmm1
+%define N0      xmm2
+%define N1      xmm3
+%define AL      xmm4
+%define K       eax
+%define LENEND  eax
+%define LUT     ebx
+%define T0      ecx
+%define T0w      cx
+%define T1      edx
+%define BLOCK   esi
+%define VALUES  edi
+%define KK      ebp
+
+%define ZEROBITS  INT [esp + 5 * 4]
+%define EOB       INT [esp + 5 * 4 + 4]
+%define LEN       INT [esp + 5 * 4 + 8]
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_encode_mcu_AC_refine_prepare_sse2)
+
+EXTN(jsimd_encode_mcu_AC_refine_prepare_sse2):
+    push        ebp
+    mov         eax, esp                     ; eax = original ebp
+    sub         esp, byte 4
+    and         esp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [esp], eax
+    mov         ebp, esp                     ; ebp = aligned ebp
+    sub         esp, 16
+    push        ebx
+    push        ecx
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+    push        ebp
+
+    pcmpeqw     ONE, ONE
+    psrlw       ONE, 15
+    mov         BLOCK, INT [eax + 8]
+    mov         LUT, INT [eax + 12]
+    mov         VALUES, INT [eax + 24]
+    movd        AL, INT [eax + 20]
+    mov         T0, INT [eax + 28]
+    mov         K,  INT [eax + 16]
+    mov         INT [T0 + 2 * SIZEOF_INT], -1
+    mov         INT [T0 + 3 * SIZEOF_INT], -1
+    mov         ZEROBITS, T0
+    mov         LEN, K
+    pxor        ZERO, ZERO
+    and         K, -16
+    mov         EOB, 0
+    xor         KK, KK
+    shr         K, 4
+    jz          .ELOOPR16
+.BLOOPR16:
+    LOAD16
+    pcmpgtw     N0, X0
+    pcmpgtw     N1, X1
+    paddw       X0, N0
+    paddw       X1, N1
+    pxor        X0, N0
+    pxor        X1, N1
+    psrlw       X0, AL
+    psrlw       X1, AL
+    movdqa      XMMWORD [VALUES + (0) * 2], X0
+    movdqa      XMMWORD [VALUES + (8) * 2], X1
+    pcmpeqw     X0, ONE
+    pcmpeqw     X1, ONE
+    packsswb    N0, N1
+    packsswb    X0, X1
+    pmovmskb    T0, N0                  ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
+    mov         T1, ZEROBITS
+    not         T0
+    mov         word [T1 + 2 * SIZEOF_INT + KK], T0w
+    pmovmskb    T1, X0                  ; idx = _mm_movemask_epi8(x1);
+    bsr         T1, T1                  ;  idx = 16 - (__builtin_clz(idx)>>1);
+    jz          .CONTINUER16            ; if (idx) {
+    lea         T1, [T1+KK*8]
+    mov         EOB, T1                 ; EOB = k + idx;
+.CONTINUER16:
+    add         VALUES, 16*2
+    add         LUT, 16*SIZEOF_INT
+    add         KK, 2
+    dec         K
+    jnz         .BLOOPR16
+    test        LEN, 15
+    je          .PADDINGR
+.ELOOPR16:
+    mov         LENEND, LEN
+
+    test        LENEND, 8
+    jz          .TRYR7
+    test        LENEND, 7
+    jz          .TRYR8
+
+    and         LENEND, 7
+    LOAD15
+    pcmpgtw     N0, X0
+    pcmpgtw     N1, X1
+    paddw       X0, N0
+    paddw       X1, N1
+    pxor        X0, N0
+    pxor        X1, N1
+    psrlw       X0, AL
+    psrlw       X1, AL
+    movdqa      XMMWORD [VALUES + (0) * 2], X0
+    movdqa      XMMWORD [VALUES + (8) * 2], X1
+    pcmpeqw     X0, ONE
+    pcmpeqw     X1, ONE
+    packsswb    N0, N1
+    packsswb    X0, X1
+    pmovmskb    T0, N0                  ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
+    mov         T1, ZEROBITS
+    not         T0
+    mov         word [T1 + 2 * SIZEOF_INT + KK], T0w
+    pmovmskb    T1, X0                  ; idx = _mm_movemask_epi8(x1);
+    bsr         T1, T1                  ;  idx = 16 - (__builtin_clz(idx)>>1);
+    jz          .CONTINUER15            ; if (idx) {
+    lea         T1, [T1+KK*8]
+    mov         EOB, T1                 ; EOB = k + idx;
+.CONTINUER15:
+    add         VALUES, 16*2
+    jmp         .PADDINGR
+.TRYR8:
+    LOAD8
+
+    pcmpgtw     N0, X0
+    paddw       X0, N0
+    pxor        X0, N0
+    psrlw       X0, AL
+    movdqa      XMMWORD [VALUES + (0) * 2], X0
+    pcmpeqw     X0, ONE
+    packsswb    N0, ZERO
+    packsswb    X0, ZERO
+    pmovmskb    T0, N0                  ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
+    mov         T1, ZEROBITS
+    not         T0
+    mov         word [T1 + 2 * SIZEOF_INT + KK], T0w
+    pmovmskb    T1, X0                  ; idx = _mm_movemask_epi8(x1);
+    bsr         T1, T1                  ;  idx = 16 - (__builtin_clz(idx)>>1);
+    jz          .CONTINUER8             ; if (idx) {
+    lea         T1, [T1+KK*8]
+    mov         EOB, T1                 ; EOB = k + idx;
+.CONTINUER8:
+    add         VALUES, 8*2
+    jmp         .PADDINGR
+.TRYR7:
+    and         LENEND, 7
+    LOAD7
+
+    pcmpgtw     N0, X0
+    paddw       X0, N0
+    pxor        X0, N0
+    psrlw       X0, AL
+    movdqa      XMMWORD [VALUES + (0) * 2], X0
+    pcmpeqw     X0, ONE
+    packsswb    N0, ZERO
+    packsswb    X0, ZERO
+    pmovmskb    T0, N0                  ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
+    mov         T1, ZEROBITS
+    not         T0
+    mov         word [T1 + 2 * SIZEOF_INT + KK], T0w
+    pmovmskb    T1, X0                  ; idx = _mm_movemask_epi8(x1);
+    bsr         T1, T1                  ;  idx = 16 - (__builtin_clz(idx)>>1);
+    jz          .CONTINUER7             ; if (idx) {
+    lea         T1, [T1+KK*8]
+    mov         EOB, T1                 ; EOB = k + idx;
+.CONTINUER7:
+    add         VALUES, 8*2
+.PADDINGR:
+    mov         K, LEN
+    add         K, 7
+    and         K, -8
+    shr         K, 3
+    sub         K, DCTSIZE2/8
+    jz          .EPADDINGR
+    align       16
+.ZEROLOOPR:
+    movdqa      XMMWORD [VALUES + 0], ZERO
+    add         VALUES, 8*2
+    inc         K
+    jnz         .ZEROLOOPR
+.EPADDINGR:
+    sub         VALUES, DCTSIZE2*2
+
+    REDUCE0
+
+    mov         eax, EOB
+
+    pop         ebp
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+    pop         ecx
+    pop         ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+%undef ZERO
+%undef ONE
+%undef X0
+%undef X1
+%undef N0
+%undef N1
+%undef AL
+%undef K
+%undef KK
+%undef EOB
+%undef SIGN
+%undef LUT
+%undef T0
+%undef T1
+%undef BLOCK
+%undef VALUES
+%undef LEN
+%undef LENEND
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/i386/jcsample-avx2.asm b/media/libjpeg/simd/i386/jcsample-avx2.asm
new file mode 100644
index 0000000000..0a20802dd8
--- /dev/null
+++ b/media/libjpeg/simd/i386/jcsample-avx2.asm
@@ -0,0 +1,388 @@
+;
+; jcsample.asm - downsampling (AVX2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2015, Intel Corporation.
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Downsample pixel values of a single component.
+; This version handles the common case of 2:1 horizontal and 1:1 vertical,
+; without smoothing.
+;
+; GLOBAL(void)
+; jsimd_h2v1_downsample_avx2(JDIMENSION image_width, int max_v_samp_factor,
+;                            JDIMENSION v_samp_factor,
+;                            JDIMENSION width_in_blocks, JSAMPARRAY input_data,
+;                            JSAMPARRAY output_data);
+;
+
+%define img_width(b)    (b) + 8         ; JDIMENSION image_width
+%define max_v_samp(b)   (b) + 12        ; int max_v_samp_factor
+%define v_samp(b)       (b) + 16        ; JDIMENSION v_samp_factor
+%define width_blks(b)   (b) + 20        ; JDIMENSION width_in_blocks
+%define input_data(b)   (b) + 24        ; JSAMPARRAY input_data
+%define output_data(b)  (b) + 28        ; JSAMPARRAY output_data
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v1_downsample_avx2)
+
+EXTN(jsimd_h2v1_downsample_avx2):
+    push        ebp
+    mov         ebp, esp
+;   push        ebx                     ; unused
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    mov         ecx, JDIMENSION [width_blks(ebp)]
+    shl         ecx, 3                  ; imul ecx,DCTSIZE (ecx = output_cols)
+    jz          near .return
+
+    mov         edx, JDIMENSION [img_width(ebp)]
+
+    ; -- expand_right_edge
+
+    push        ecx
+    shl         ecx, 1                  ; output_cols * 2
+    sub         ecx, edx
+    jle         short .expand_end
+
+    mov         eax, INT [max_v_samp(ebp)]
+    test        eax, eax
+    jle         short .expand_end
+
+    cld
+    mov         esi, JSAMPARRAY [input_data(ebp)]  ; input_data
+    alignx      16, 7
+.expandloop:
+    push        eax
+    push        ecx
+
+    mov         edi, JSAMPROW [esi]
+    add         edi, edx
+    mov         al, JSAMPLE [edi-1]
+
+    rep stosb
+
+    pop         ecx
+    pop         eax
+
+    add         esi, byte SIZEOF_JSAMPROW
+    dec         eax
+    jg          short .expandloop
+
+.expand_end:
+    pop         ecx                     ; output_cols
+
+    ; -- h2v1_downsample
+
+    mov         eax, JDIMENSION [v_samp(ebp)]  ; rowctr
+    test        eax, eax
+    jle         near .return
+
+    mov         edx, 0x00010000         ; bias pattern
+    vmovd       xmm7, edx
+    vpshufd     xmm7, xmm7, 0x00        ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
+    vperm2i128  ymm7, ymm7, ymm7, 0     ; ymm7={xmm7, xmm7}
+    vpcmpeqw    ymm6, ymm6, ymm6
+    vpsrlw      ymm6, ymm6, BYTE_BIT    ; ymm6={0xFF 0x00 0xFF 0x00 ..}
+
+    mov         esi, JSAMPARRAY [input_data(ebp)]   ; input_data
+    mov         edi, JSAMPARRAY [output_data(ebp)]  ; output_data
+    alignx      16, 7
+.rowloop:
+    push        ecx
+    push        edi
+    push        esi
+
+    mov         esi, JSAMPROW [esi]     ; inptr
+    mov         edi, JSAMPROW [edi]     ; outptr
+
+    cmp         ecx, byte SIZEOF_YMMWORD
+    jae         short .columnloop
+    alignx      16, 7
+
+.columnloop_r24:
+    ; ecx can possibly be 8, 16, 24
+    cmp         ecx, 24
+    jne         .columnloop_r16
+    vmovdqu     ymm0, YMMWORD [esi+0*SIZEOF_YMMWORD]
+    vmovdqu     xmm1, XMMWORD [esi+1*SIZEOF_YMMWORD]
+    mov         ecx, SIZEOF_YMMWORD
+    jmp         short .downsample
+
+.columnloop_r16:
+    cmp         ecx, 16
+    jne         .columnloop_r8
+    vmovdqu     ymm0, YMMWORD [esi+0*SIZEOF_YMMWORD]
+    vpxor       ymm1, ymm1, ymm1
+    mov         ecx, SIZEOF_YMMWORD
+    jmp         short .downsample
+
+.columnloop_r8:
+    vmovdqu     xmm0, XMMWORD[esi+0*SIZEOF_YMMWORD]
+    vpxor       ymm1, ymm1, ymm1
+    mov         ecx, SIZEOF_YMMWORD
+    jmp         short .downsample
+    alignx      16, 7
+
+.columnloop:
+    vmovdqu     ymm0, YMMWORD [esi+0*SIZEOF_YMMWORD]
+    vmovdqu     ymm1, YMMWORD [esi+1*SIZEOF_YMMWORD]
+
+.downsample:
+    vpsrlw      ymm2, ymm0, BYTE_BIT
+    vpand       ymm0, ymm0, ymm6
+    vpsrlw      ymm3, ymm1, BYTE_BIT
+    vpand       ymm1, ymm1, ymm6
+
+    vpaddw      ymm0, ymm0, ymm2
+    vpaddw      ymm1, ymm1, ymm3
+    vpaddw      ymm0, ymm0, ymm7
+    vpaddw      ymm1, ymm1, ymm7
+    vpsrlw      ymm0, ymm0, 1
+    vpsrlw      ymm1, ymm1, 1
+
+    vpackuswb   ymm0, ymm0, ymm1
+    vpermq      ymm0, ymm0, 0xd8
+
+    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymm0
+
+    sub         ecx, byte SIZEOF_YMMWORD    ; outcol
+    add         esi, byte 2*SIZEOF_YMMWORD  ; inptr
+    add         edi, byte 1*SIZEOF_YMMWORD  ; outptr
+    cmp         ecx, byte SIZEOF_YMMWORD
+    jae         short .columnloop
+    test        ecx, ecx
+    jnz         near .columnloop_r24
+
+    pop         esi
+    pop         edi
+    pop         ecx
+
+    add         esi, byte SIZEOF_JSAMPROW  ; input_data
+    add         edi, byte SIZEOF_JSAMPROW  ; output_data
+    dec         eax                        ; rowctr
+    jg          near .rowloop
+
+.return:
+    vzeroupper
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+;   pop         ebx                     ; unused
+    pop         ebp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Downsample pixel values of a single component.
+; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
+; without smoothing.
+;
+; GLOBAL(void)
+; jsimd_h2v2_downsample_avx2(JDIMENSION image_width, int max_v_samp_factor,
+;                            JDIMENSION v_samp_factor,
+;                            JDIMENSION width_in_blocks, JSAMPARRAY input_data,
+;                            JSAMPARRAY output_data);
+;
+
+%define img_width(b)    (b) + 8         ; JDIMENSION image_width
+%define max_v_samp(b)   (b) + 12        ; int max_v_samp_factor
+%define v_samp(b)       (b) + 16        ; JDIMENSION v_samp_factor
+%define width_blks(b)   (b) + 20        ; JDIMENSION width_in_blocks
+%define input_data(b)   (b) + 24        ; JSAMPARRAY input_data
+%define output_data(b)  (b) + 28        ; JSAMPARRAY output_data
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v2_downsample_avx2)
+
+EXTN(jsimd_h2v2_downsample_avx2):
+    push        ebp
+    mov         ebp, esp
+;   push        ebx                     ; unused
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    mov         ecx, JDIMENSION [width_blks(ebp)]
+    shl         ecx, 3                  ; imul ecx,DCTSIZE (ecx = output_cols)
+    jz          near .return
+
+    mov         edx, JDIMENSION [img_width(ebp)]
+
+    ; -- expand_right_edge
+
+    push        ecx
+    shl         ecx, 1                  ; output_cols * 2
+    sub         ecx, edx
+    jle         short .expand_end
+
+    mov         eax, INT [max_v_samp(ebp)]
+    test        eax, eax
+    jle         short .expand_end
+
+    cld
+    mov         esi, JSAMPARRAY [input_data(ebp)]  ; input_data
+    alignx      16, 7
+.expandloop:
+    push        eax
+    push        ecx
+
+    mov         edi, JSAMPROW [esi]
+    add         edi, edx
+    mov         al, JSAMPLE [edi-1]
+
+    rep stosb
+
+    pop         ecx
+    pop         eax
+
+    add         esi, byte SIZEOF_JSAMPROW
+    dec         eax
+    jg          short .expandloop
+
+.expand_end:
+    pop         ecx                     ; output_cols
+
+    ; -- h2v2_downsample
+
+    mov         eax, JDIMENSION [v_samp(ebp)]  ; rowctr
+    test        eax, eax
+    jle         near .return
+
+    mov         edx, 0x00020001         ; bias pattern
+    vmovd       xmm7, edx
+    vpcmpeqw    ymm6, ymm6, ymm6
+    vpshufd     xmm7, xmm7, 0x00        ; ymm7={1, 2, 1, 2, 1, 2, 1, 2}
+    vperm2i128  ymm7, ymm7, ymm7, 0
+    vpsrlw      ymm6, ymm6, BYTE_BIT    ; ymm6={0xFF 0x00 0xFF 0x00 ..}
+
+    mov         esi, JSAMPARRAY [input_data(ebp)]   ; input_data
+    mov         edi, JSAMPARRAY [output_data(ebp)]  ; output_data
+    alignx      16, 7
+.rowloop:
+    push        ecx
+    push        edi
+    push        esi
+
+    mov         edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; inptr0
+    mov         esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; inptr1
+    mov         edi, JSAMPROW [edi]                    ; outptr
+
+    cmp         ecx, byte SIZEOF_YMMWORD
+    jae         short .columnloop
+    alignx      16, 7
+
+.columnloop_r24:
+    cmp         ecx, 24
+    jne         .columnloop_r16
+    vmovdqu     ymm0, YMMWORD [edx+0*SIZEOF_YMMWORD]
+    vmovdqu     ymm1, YMMWORD [esi+0*SIZEOF_YMMWORD]
+    vmovdqu     xmm2, XMMWORD [edx+1*SIZEOF_YMMWORD]
+    vmovdqu     xmm3, XMMWORD [esi+1*SIZEOF_YMMWORD]
+    mov         ecx, SIZEOF_YMMWORD
+    jmp         short .downsample
+
+.columnloop_r16:
+    cmp         ecx, 16
+    jne         .columnloop_r8
+    vmovdqu     ymm0, YMMWORD [edx+0*SIZEOF_YMMWORD]
+    vmovdqu     ymm1, YMMWORD [esi+0*SIZEOF_YMMWORD]
+    vpxor       ymm2, ymm2, ymm2
+    vpxor       ymm3, ymm3, ymm3
+    mov         ecx, SIZEOF_YMMWORD
+    jmp         short .downsample
+
+.columnloop_r8:
+    vmovdqu     xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]
+    vmovdqu     xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
+    vpxor       ymm2, ymm2, ymm2
+    vpxor       ymm3, ymm3, ymm3
+    mov         ecx, SIZEOF_YMMWORD
+    jmp         short .downsample
+    alignx      16, 7
+
+.columnloop:
+    vmovdqu     ymm0, YMMWORD [edx+0*SIZEOF_YMMWORD]
+    vmovdqu     ymm1, YMMWORD [esi+0*SIZEOF_YMMWORD]
+    vmovdqu     ymm2, YMMWORD [edx+1*SIZEOF_YMMWORD]
+    vmovdqu     ymm3, YMMWORD [esi+1*SIZEOF_YMMWORD]
+
+.downsample:
+    vpand       ymm4, ymm0, ymm6
+    vpsrlw      ymm0, ymm0, BYTE_BIT
+    vpand       ymm5, ymm1, ymm6
+    vpsrlw      ymm1, ymm1, BYTE_BIT
+    vpaddw      ymm0, ymm0, ymm4
+    vpaddw      ymm1, ymm1, ymm5
+
+    vpand       ymm4, ymm2, ymm6
+    vpsrlw      ymm2, ymm2, BYTE_BIT
+    vpand       ymm5, ymm3, ymm6
+    vpsrlw      ymm3, ymm3, BYTE_BIT
+    vpaddw      ymm2, ymm2, ymm4
+    vpaddw      ymm3, ymm3, ymm5
+
+    vpaddw      ymm0, ymm0, ymm1
+    vpaddw      ymm2, ymm2, ymm3
+    vpaddw      ymm0, ymm0, ymm7
+    vpaddw      ymm2, ymm2, ymm7
+    vpsrlw      ymm0, ymm0, 2
+    vpsrlw      ymm2, ymm2, 2
+
+    vpackuswb   ymm0, ymm0, ymm2
+    vpermq      ymm0, ymm0, 0xd8
+
+    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymm0
+
+    sub         ecx, byte SIZEOF_YMMWORD    ; outcol
+    add         edx, byte 2*SIZEOF_YMMWORD  ; inptr0
+    add         esi, byte 2*SIZEOF_YMMWORD  ; inptr1
+    add         edi, byte 1*SIZEOF_YMMWORD  ; outptr
+    cmp         ecx, byte SIZEOF_YMMWORD
+    jae         near .columnloop
+    test        ecx, ecx
+    jnz         near .columnloop_r24
+
+    pop         esi
+    pop         edi
+    pop         ecx
+
+    add         esi, byte 2*SIZEOF_JSAMPROW  ; input_data
+    add         edi, byte 1*SIZEOF_JSAMPROW  ; output_data
+    dec         eax                          ; rowctr
+    jg          near .rowloop
+
+.return:
+    vzeroupper
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+;   pop         ebx                     ; unused
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/i386/jcsample-mmx.asm b/media/libjpeg/simd/i386/jcsample-mmx.asm
new file mode 100644
index 0000000000..2c223eebe8
--- /dev/null
+++ b/media/libjpeg/simd/i386/jcsample-mmx.asm
@@ -0,0 +1,324 @@
+;
+; jcsample.asm - downsampling (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Downsample pixel values of a single component.
+; This version handles the common case of 2:1 horizontal and 1:1 vertical,
+; without smoothing.
+;
+; GLOBAL(void)
+; jsimd_h2v1_downsample_mmx(JDIMENSION image_width, int max_v_samp_factor,
+;                           JDIMENSION v_samp_factor,
+;                           JDIMENSION width_in_blocks, JSAMPARRAY input_data,
+;                           JSAMPARRAY output_data);
+;
+
+%define img_width(b)    (b) + 8         ; JDIMENSION image_width
+%define max_v_samp(b)   (b) + 12        ; int max_v_samp_factor
+%define v_samp(b)       (b) + 16        ; JDIMENSION v_samp_factor
+%define width_blks(b)   (b) + 20        ; JDIMENSION width_in_blocks
+%define input_data(b)   (b) + 24        ; JSAMPARRAY input_data
+%define output_data(b)  (b) + 28        ; JSAMPARRAY output_data
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v1_downsample_mmx)
+
+EXTN(jsimd_h2v1_downsample_mmx):
+    push        ebp
+    mov         ebp, esp
+;   push        ebx                     ; unused
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    mov         ecx, JDIMENSION [width_blks(ebp)]
+    shl         ecx, 3                  ; imul ecx,DCTSIZE (ecx = output_cols)
+    jz          near .return
+
+    mov         edx, JDIMENSION [img_width(ebp)]
+
+    ; -- expand_right_edge
+
+    push        ecx
+    shl         ecx, 1                  ; output_cols * 2
+    sub         ecx, edx
+    jle         short .expand_end
+
+    mov         eax, INT [max_v_samp(ebp)]
+    test        eax, eax
+    jle         short .expand_end
+
+    cld
+    mov         esi, JSAMPARRAY [input_data(ebp)]  ; input_data
+    alignx      16, 7
+.expandloop:
+    push        eax
+    push        ecx
+
+    mov         edi, JSAMPROW [esi]
+    add         edi, edx
+    mov         al, JSAMPLE [edi-1]
+
+    rep stosb
+
+    pop         ecx
+    pop         eax
+
+    add         esi, byte SIZEOF_JSAMPROW
+    dec         eax
+    jg          short .expandloop
+
+.expand_end:
+    pop         ecx                     ; output_cols
+
+    ; -- h2v1_downsample
+
+    mov         eax, JDIMENSION [v_samp(ebp)]  ; rowctr
+    test        eax, eax
+    jle         near .return
+
+    mov         edx, 0x00010000         ; bias pattern
+    movd        mm7, edx
+    pcmpeqw     mm6, mm6
+    punpckldq   mm7, mm7                ; mm7={0, 1, 0, 1}
+    psrlw       mm6, BYTE_BIT           ; mm6={0xFF 0x00 0xFF 0x00 ..}
+
+    mov         esi, JSAMPARRAY [input_data(ebp)]   ; input_data
+    mov         edi, JSAMPARRAY [output_data(ebp)]  ; output_data
+    alignx      16, 7
+.rowloop:
+    push        ecx
+    push        edi
+    push        esi
+
+    mov         esi, JSAMPROW [esi]     ; inptr
+    mov         edi, JSAMPROW [edi]     ; outptr
+    alignx      16, 7
+.columnloop:
+
+    movq        mm0, MMWORD [esi+0*SIZEOF_MMWORD]
+    movq        mm1, MMWORD [esi+1*SIZEOF_MMWORD]
+    movq        mm2, mm0
+    movq        mm3, mm1
+
+    pand        mm0, mm6
+    psrlw       mm2, BYTE_BIT
+    pand        mm1, mm6
+    psrlw       mm3, BYTE_BIT
+
+    paddw       mm0, mm2
+    paddw       mm1, mm3
+    paddw       mm0, mm7
+    paddw       mm1, mm7
+    psrlw       mm0, 1
+    psrlw       mm1, 1
+
+    packuswb    mm0, mm1
+
+    movq        MMWORD [edi+0*SIZEOF_MMWORD], mm0
+
+    add         esi, byte 2*SIZEOF_MMWORD  ; inptr
+    add         edi, byte 1*SIZEOF_MMWORD  ; outptr
+    sub         ecx, byte SIZEOF_MMWORD    ; outcol
+    jnz         short .columnloop
+
+    pop         esi
+    pop         edi
+    pop         ecx
+
+    add         esi, byte SIZEOF_JSAMPROW  ; input_data
+    add         edi, byte SIZEOF_JSAMPROW  ; output_data
+    dec         eax                        ; rowctr
+    jg          short .rowloop
+
+    emms                                ; empty MMX state
+
+.return:
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+;   pop         ebx                     ; unused
+    pop         ebp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Downsample pixel values of a single component.
+; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
+; without smoothing.
+;
+; GLOBAL(void)
+; jsimd_h2v2_downsample_mmx(JDIMENSION image_width, int max_v_samp_factor,
+;                           JDIMENSION v_samp_factor,
+;                           JDIMENSION width_in_blocks, JSAMPARRAY input_data,
+;                           JSAMPARRAY output_data);
+;
+
+%define img_width(b)    (b) + 8         ; JDIMENSION image_width
+%define max_v_samp(b)   (b) + 12        ; int max_v_samp_factor
+%define v_samp(b)       (b) + 16        ; JDIMENSION v_samp_factor
+%define width_blks(b)   (b) + 20        ; JDIMENSION width_in_blocks
+%define input_data(b)   (b) + 24        ; JSAMPARRAY input_data
+%define output_data(b)  (b) + 28        ; JSAMPARRAY output_data
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v2_downsample_mmx)
+
+EXTN(jsimd_h2v2_downsample_mmx):
+    push        ebp
+    mov         ebp, esp
+;   push        ebx                     ; unused
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    mov         ecx, JDIMENSION [width_blks(ebp)]
+    shl         ecx, 3                  ; imul ecx,DCTSIZE (ecx = output_cols)
+    jz          near .return
+
+    mov         edx, JDIMENSION [img_width(ebp)]
+
+    ; -- expand_right_edge
+
+    push        ecx
+    shl         ecx, 1                  ; output_cols * 2
+    sub         ecx, edx
+    jle         short .expand_end
+
+    mov         eax, INT [max_v_samp(ebp)]
+    test        eax, eax
+    jle         short .expand_end
+
+    cld
+    mov         esi, JSAMPARRAY [input_data(ebp)]  ; input_data
+    alignx      16, 7
+.expandloop:
+    push        eax
+    push        ecx
+
+    mov         edi, JSAMPROW [esi]
+    add         edi, edx
+    mov         al, JSAMPLE [edi-1]
+
+    rep stosb
+
+    pop         ecx
+    pop         eax
+
+    add         esi, byte SIZEOF_JSAMPROW
+    dec         eax
+    jg          short .expandloop
+
+.expand_end:
+    pop         ecx                     ; output_cols
+
+    ; -- h2v2_downsample
+
+    mov         eax, JDIMENSION [v_samp(ebp)]  ; rowctr
+    test        eax, eax
+    jle         near .return
+
+    mov         edx, 0x00020001         ; bias pattern
+    movd        mm7, edx
+    pcmpeqw     mm6, mm6
+    punpckldq   mm7, mm7                ; mm7={1, 2, 1, 2}
+    psrlw       mm6, BYTE_BIT           ; mm6={0xFF 0x00 0xFF 0x00 ..}
+
+    mov         esi, JSAMPARRAY [input_data(ebp)]   ; input_data
+    mov         edi, JSAMPARRAY [output_data(ebp)]  ; output_data
+    alignx      16, 7
+.rowloop:
+    push        ecx
+    push        edi
+    push        esi
+
+    mov         edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; inptr0
+    mov         esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; inptr1
+    mov         edi, JSAMPROW [edi]                    ; outptr
+    alignx      16, 7
+.columnloop:
+
+    movq        mm0, MMWORD [edx+0*SIZEOF_MMWORD]
+    movq        mm1, MMWORD [esi+0*SIZEOF_MMWORD]
+    movq        mm2, MMWORD [edx+1*SIZEOF_MMWORD]
+    movq        mm3, MMWORD [esi+1*SIZEOF_MMWORD]
+
+    movq        mm4, mm0
+    movq        mm5, mm1
+    pand        mm0, mm6
+    psrlw       mm4, BYTE_BIT
+    pand        mm1, mm6
+    psrlw       mm5, BYTE_BIT
+    paddw       mm0, mm4
+    paddw       mm1, mm5
+
+    movq        mm4, mm2
+    movq        mm5, mm3
+    pand        mm2, mm6
+    psrlw       mm4, BYTE_BIT
+    pand        mm3, mm6
+    psrlw       mm5, BYTE_BIT
+    paddw       mm2, mm4
+    paddw       mm3, mm5
+
+    paddw       mm0, mm1
+    paddw       mm2, mm3
+    paddw       mm0, mm7
+    paddw       mm2, mm7
+    psrlw       mm0, 2
+    psrlw       mm2, 2
+
+    packuswb    mm0, mm2
+
+    movq        MMWORD [edi+0*SIZEOF_MMWORD], mm0
+
+    add         edx, byte 2*SIZEOF_MMWORD  ; inptr0
+    add         esi, byte 2*SIZEOF_MMWORD  ; inptr1
+    add         edi, byte 1*SIZEOF_MMWORD  ; outptr
+    sub         ecx, byte SIZEOF_MMWORD    ; outcol
+    jnz         near .columnloop
+
+    pop         esi
+    pop         edi
+    pop         ecx
+
+    add         esi, byte 2*SIZEOF_JSAMPROW  ; input_data
+    add         edi, byte 1*SIZEOF_JSAMPROW  ; output_data
+    dec         eax                          ; rowctr
+    jg          near .rowloop
+
+    emms                                ; empty MMX state
+
+.return:
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+;   pop         ebx                     ; unused
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/i386/jcsample-sse2.asm b/media/libjpeg/simd/i386/jcsample-sse2.asm
new file mode 100644
index 0000000000..4fea60d2e2
--- /dev/null
+++ b/media/libjpeg/simd/i386/jcsample-sse2.asm
@@ -0,0 +1,351 @@
+;
+; jcsample.asm - downsampling (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Downsample pixel values of a single component.
+; This version handles the common case of 2:1 horizontal and 1:1 vertical,
+; without smoothing.
+;
+; GLOBAL(void)
+; jsimd_h2v1_downsample_sse2(JDIMENSION image_width, int max_v_samp_factor,
+;                            JDIMENSION v_samp_factor,
+;                            JDIMENSION width_in_blocks, JSAMPARRAY input_data,
+;                            JSAMPARRAY output_data);
+;
+
+%define img_width(b)    (b) + 8         ; JDIMENSION image_width
+%define max_v_samp(b)   (b) + 12        ; int max_v_samp_factor
+%define v_samp(b)       (b) + 16        ; JDIMENSION v_samp_factor
+%define width_blks(b)   (b) + 20        ; JDIMENSION width_in_blocks
+%define input_data(b)   (b) + 24        ; JSAMPARRAY input_data
+%define output_data(b)  (b) + 28        ; JSAMPARRAY output_data
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v1_downsample_sse2)
+
+EXTN(jsimd_h2v1_downsample_sse2):
+    push        ebp
+    mov         ebp, esp
+;   push        ebx                     ; unused
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    mov         ecx, JDIMENSION [width_blks(ebp)]
+    shl         ecx, 3                  ; imul ecx,DCTSIZE (ecx = output_cols)
+    jz          near .return
+
+    mov         edx, JDIMENSION [img_width(ebp)]
+
+    ; -- expand_right_edge
+
+    push        ecx
+    shl         ecx, 1                  ; output_cols * 2
+    sub         ecx, edx
+    jle         short .expand_end
+
+    mov         eax, INT [max_v_samp(ebp)]
+    test        eax, eax
+    jle         short .expand_end
+
+    cld
+    mov         esi, JSAMPARRAY [input_data(ebp)]  ; input_data
+    alignx      16, 7
+.expandloop:
+    push        eax
+    push        ecx
+
+    mov         edi, JSAMPROW [esi]
+    add         edi, edx
+    mov         al, JSAMPLE [edi-1]
+
+    rep stosb
+
+    pop         ecx
+    pop         eax
+
+    add         esi, byte SIZEOF_JSAMPROW
+    dec         eax
+    jg          short .expandloop
+
+.expand_end:
+    pop         ecx                     ; output_cols
+
+    ; -- h2v1_downsample
+
+    mov         eax, JDIMENSION [v_samp(ebp)]  ; rowctr
+    test        eax, eax
+    jle         near .return
+
+    mov         edx, 0x00010000         ; bias pattern
+    movd        xmm7, edx
+    pcmpeqw     xmm6, xmm6
+    pshufd      xmm7, xmm7, 0x00        ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
+    psrlw       xmm6, BYTE_BIT          ; xmm6={0xFF 0x00 0xFF 0x00 ..}
+
+    mov         esi, JSAMPARRAY [input_data(ebp)]   ; input_data
+    mov         edi, JSAMPARRAY [output_data(ebp)]  ; output_data
+    alignx      16, 7
+.rowloop:
+    push        ecx
+    push        edi
+    push        esi
+
+    mov         esi, JSAMPROW [esi]     ; inptr
+    mov         edi, JSAMPROW [edi]     ; outptr
+
+    cmp         ecx, byte SIZEOF_XMMWORD
+    jae         short .columnloop
+    alignx      16, 7
+
+.columnloop_r8:
+    movdqa      xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
+    pxor        xmm1, xmm1
+    mov         ecx, SIZEOF_XMMWORD
+    jmp         short .downsample
+    alignx      16, 7
+
+.columnloop:
+    movdqa      xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
+    movdqa      xmm1, XMMWORD [esi+1*SIZEOF_XMMWORD]
+
+.downsample:
+    movdqa      xmm2, xmm0
+    movdqa      xmm3, xmm1
+
+    pand        xmm0, xmm6
+    psrlw       xmm2, BYTE_BIT
+    pand        xmm1, xmm6
+    psrlw       xmm3, BYTE_BIT
+
+    paddw       xmm0, xmm2
+    paddw       xmm1, xmm3
+    paddw       xmm0, xmm7
+    paddw       xmm1, xmm7
+    psrlw       xmm0, 1
+    psrlw       xmm1, 1
+
+    packuswb    xmm0, xmm1
+
+    movdqa      XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
+
+    sub         ecx, byte SIZEOF_XMMWORD    ; outcol
+    add         esi, byte 2*SIZEOF_XMMWORD  ; inptr
+    add         edi, byte 1*SIZEOF_XMMWORD  ; outptr
+    cmp         ecx, byte SIZEOF_XMMWORD
+    jae         short .columnloop
+    test        ecx, ecx
+    jnz         short .columnloop_r8
+
+    pop         esi
+    pop         edi
+    pop         ecx
+
+    add         esi, byte SIZEOF_JSAMPROW  ; input_data
+    add         edi, byte SIZEOF_JSAMPROW  ; output_data
+    dec         eax                        ; rowctr
+    jg          near .rowloop
+
+.return:
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+;   pop         ebx                     ; unused
+    pop         ebp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Downsample pixel values of a single component.
+; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
+; without smoothing.
+;
+; GLOBAL(void)
+; jsimd_h2v2_downsample_sse2(JDIMENSION image_width, int max_v_samp_factor,
+;                            JDIMENSION v_samp_factor,
+;                            JDIMENSION width_in_blocks, JSAMPARRAY input_data,
+;                            JSAMPARRAY output_data);
+;
+
+%define img_width(b)    (b) + 8         ; JDIMENSION image_width
+%define max_v_samp(b)   (b) + 12        ; int max_v_samp_factor
+%define v_samp(b)       (b) + 16        ; JDIMENSION v_samp_factor
+%define width_blks(b)   (b) + 20        ; JDIMENSION width_in_blocks
+%define input_data(b)   (b) + 24        ; JSAMPARRAY input_data
+%define output_data(b)  (b) + 28        ; JSAMPARRAY output_data
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v2_downsample_sse2)
+
+EXTN(jsimd_h2v2_downsample_sse2):
+    push        ebp
+    mov         ebp, esp
+;   push        ebx                     ; unused
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    mov         ecx, JDIMENSION [width_blks(ebp)]
+    shl         ecx, 3                  ; imul ecx,DCTSIZE (ecx = output_cols)
+    jz          near .return
+
+    mov         edx, JDIMENSION [img_width(ebp)]
+
+    ; -- expand_right_edge
+
+    push        ecx
+    shl         ecx, 1                  ; output_cols * 2
+    sub         ecx, edx
+    jle         short .expand_end
+
+    mov         eax, INT [max_v_samp(ebp)]
+    test        eax, eax
+    jle         short .expand_end
+
+    cld
+    mov         esi, JSAMPARRAY [input_data(ebp)]  ; input_data
+    alignx      16, 7
+.expandloop:
+    push        eax
+    push        ecx
+
+    mov         edi, JSAMPROW [esi]
+    add         edi, edx
+    mov         al, JSAMPLE [edi-1]
+
+    rep stosb
+
+    pop         ecx
+    pop         eax
+
+    add         esi, byte SIZEOF_JSAMPROW
+    dec         eax
+    jg          short .expandloop
+
+.expand_end:
+    pop         ecx                     ; output_cols
+
+    ; -- h2v2_downsample
+
+    mov         eax, JDIMENSION [v_samp(ebp)]  ; rowctr
+    test        eax, eax
+    jle         near .return
+
+    mov         edx, 0x00020001         ; bias pattern
+    movd        xmm7, edx
+    pcmpeqw     xmm6, xmm6
+    pshufd      xmm7, xmm7, 0x00        ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
+    psrlw       xmm6, BYTE_BIT          ; xmm6={0xFF 0x00 0xFF 0x00 ..}
+
+    mov         esi, JSAMPARRAY [input_data(ebp)]   ; input_data
+    mov         edi, JSAMPARRAY [output_data(ebp)]  ; output_data
+    alignx      16, 7
+.rowloop:
+    push        ecx
+    push        edi
+    push        esi
+
+    mov         edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; inptr0
+    mov         esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; inptr1
+    mov         edi, JSAMPROW [edi]                    ; outptr
+
+    cmp         ecx, byte SIZEOF_XMMWORD
+    jae         short .columnloop
+    alignx      16, 7
+
+.columnloop_r8:
+    movdqa      xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]
+    movdqa      xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
+    pxor        xmm2, xmm2
+    pxor        xmm3, xmm3
+    mov         ecx, SIZEOF_XMMWORD
+    jmp         short .downsample
+    alignx      16, 7
+
+.columnloop:
+    movdqa      xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]
+    movdqa      xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
+    movdqa      xmm2, XMMWORD [edx+1*SIZEOF_XMMWORD]
+    movdqa      xmm3, XMMWORD [esi+1*SIZEOF_XMMWORD]
+
+.downsample:
+    movdqa      xmm4, xmm0
+    movdqa      xmm5, xmm1
+    pand        xmm0, xmm6
+    psrlw       xmm4, BYTE_BIT
+    pand        xmm1, xmm6
+    psrlw       xmm5, BYTE_BIT
+    paddw       xmm0, xmm4
+    paddw       xmm1, xmm5
+
+    movdqa      xmm4, xmm2
+    movdqa      xmm5, xmm3
+    pand        xmm2, xmm6
+    psrlw       xmm4, BYTE_BIT
+    pand        xmm3, xmm6
+    psrlw       xmm5, BYTE_BIT
+    paddw       xmm2, xmm4
+    paddw       xmm3, xmm5
+
+    paddw       xmm0, xmm1
+    paddw       xmm2, xmm3
+    paddw       xmm0, xmm7
+    paddw       xmm2, xmm7
+    psrlw       xmm0, 2
+    psrlw       xmm2, 2
+
+    packuswb    xmm0, xmm2
+
+    movdqa      XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
+
+    sub         ecx, byte SIZEOF_XMMWORD    ; outcol
+    add         edx, byte 2*SIZEOF_XMMWORD  ; inptr0
+    add         esi, byte 2*SIZEOF_XMMWORD  ; inptr1
+    add         edi, byte 1*SIZEOF_XMMWORD  ; outptr
+    cmp         ecx, byte SIZEOF_XMMWORD
+    jae         near .columnloop
+    test        ecx, ecx
+    jnz         near .columnloop_r8
+
+    pop         esi
+    pop         edi
+    pop         ecx
+
+    add         esi, byte 2*SIZEOF_JSAMPROW  ; input_data
+    add         edi, byte 1*SIZEOF_JSAMPROW  ; output_data
+    dec         eax                          ; rowctr
+    jg          near .rowloop
+
+.return:
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+;   pop         ebx                     ; unused
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/i386/jdcolext-avx2.asm b/media/libjpeg/simd/i386/jdcolext-avx2.asm
new file mode 100644
index 0000000000..015be0416c
--- /dev/null
+++ b/media/libjpeg/simd/i386/jdcolext-avx2.asm
@@ -0,0 +1,515 @@
+;
+; jdcolext.asm - colorspace conversion (AVX2)
+;
+; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2012, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_ycc_rgb_convert_avx2(JDIMENSION out_width, JSAMPIMAGE input_buf,
+;                            JDIMENSION input_row, JSAMPARRAY output_buf,
+;                            int num_rows)
+;
+
+%define out_width(b)   (b) + 8          ; JDIMENSION out_width
+%define input_buf(b)   (b) + 12         ; JSAMPIMAGE input_buf
+%define input_row(b)   (b) + 16         ; JDIMENSION input_row
+%define output_buf(b)  (b) + 20         ; JSAMPARRAY output_buf
+%define num_rows(b)    (b) + 24         ; int num_rows
+
+%define original_ebp   ebp + 0
+%define wk(i)          ebp - (WK_NUM - (i)) * SIZEOF_YMMWORD
+                                        ; ymmword wk[WK_NUM]
+%define WK_NUM         2
+%define gotptr         wk(0) - SIZEOF_POINTER  ; void * gotptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_ycc_rgb_convert_avx2)
+
+EXTN(jsimd_ycc_rgb_convert_avx2):
+    push        ebp
+    mov         eax, esp                     ; eax = original ebp
+    sub         esp, byte 4
+    and         esp, byte (-SIZEOF_YMMWORD)  ; align to 256 bits
+    mov         [esp], eax
+    mov         ebp, esp                     ; ebp = aligned ebp
+    lea         esp, [wk(0)]
+    pushpic     eax                     ; make a room for GOT address
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+    movpic      POINTER [gotptr], ebx   ; save GOT address
+
+    mov         ecx, JDIMENSION [out_width(eax)]  ; num_cols
+    test        ecx, ecx
+    jz          near .return
+
+    push        ecx
+
+    mov         edi, JSAMPIMAGE [input_buf(eax)]
+    mov         ecx, JDIMENSION [input_row(eax)]
+    mov         esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
+    mov         ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
+    mov         edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
+    lea         esi, [esi+ecx*SIZEOF_JSAMPROW]
+    lea         ebx, [ebx+ecx*SIZEOF_JSAMPROW]
+    lea         edx, [edx+ecx*SIZEOF_JSAMPROW]
+
+    pop         ecx
+
+    mov         edi, JSAMPARRAY [output_buf(eax)]
+    mov         eax, INT [num_rows(eax)]
+    test        eax, eax
+    jle         near .return
+    alignx      16, 7
+.rowloop:
+    push        eax
+    push        edi
+    push        edx
+    push        ebx
+    push        esi
+    push        ecx                     ; col
+
+    mov         esi, JSAMPROW [esi]     ; inptr0
+    mov         ebx, JSAMPROW [ebx]     ; inptr1
+    mov         edx, JSAMPROW [edx]     ; inptr2
+    mov         edi, JSAMPROW [edi]     ; outptr
+    movpic      eax, POINTER [gotptr]   ; load GOT address (eax)
+    alignx      16, 7
+.columnloop:
+
+    vmovdqu     ymm5, YMMWORD [ebx]     ; ymm5=Cb(0123456789ABCDEFGHIJKLMNOPQRSTUV)
+    vmovdqu     ymm1, YMMWORD [edx]     ; ymm1=Cr(0123456789ABCDEFGHIJKLMNOPQRSTUV)
+
+    vpcmpeqw    ymm0, ymm0, ymm0
+    vpcmpeqw    ymm7, ymm7, ymm7
+    vpsrlw      ymm0, ymm0, BYTE_BIT    ; ymm0={0xFF 0x00 0xFF 0x00 ..}
+    vpsllw      ymm7, ymm7, 7           ; ymm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+
+    vpand       ymm4, ymm0, ymm5        ; ymm4=Cb(02468ACEGIKMOQSU)=CbE
+    vpsrlw      ymm5, ymm5, BYTE_BIT    ; ymm5=Cb(13579BDFHJLNPRTV)=CbO
+    vpand       ymm0, ymm0, ymm1        ; ymm0=Cr(02468ACEGIKMOQSU)=CrE
+    vpsrlw      ymm1, ymm1, BYTE_BIT    ; ymm1=Cr(13579BDFHJLNPRTV)=CrO
+
+    vpaddw      ymm2, ymm4, ymm7
+    vpaddw      ymm3, ymm5, ymm7
+    vpaddw      ymm6, ymm0, ymm7
+    vpaddw      ymm7, ymm1, ymm7
+
+    ; (Original)
+    ; R = Y                + 1.40200 * Cr
+    ; G = Y - 0.34414 * Cb - 0.71414 * Cr
+    ; B = Y + 1.77200 * Cb
+    ;
+    ; (This implementation)
+    ; R = Y                + 0.40200 * Cr + Cr
+    ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+    ; B = Y - 0.22800 * Cb + Cb + Cb
+
+    vpaddw      ymm4, ymm2, ymm2                     ; ymm4=2*CbE
+    vpaddw      ymm5, ymm3, ymm3                     ; ymm5=2*CbO
+    vpaddw      ymm0, ymm6, ymm6                     ; ymm0=2*CrE
+    vpaddw      ymm1, ymm7, ymm7                     ; ymm1=2*CrO
+
+    vpmulhw     ymm4, ymm4, [GOTOFF(eax,PW_MF0228)]  ; ymm4=(2*CbE * -FIX(0.22800))
+    vpmulhw     ymm5, ymm5, [GOTOFF(eax,PW_MF0228)]  ; ymm5=(2*CbO * -FIX(0.22800))
+    vpmulhw     ymm0, ymm0, [GOTOFF(eax,PW_F0402)]   ; ymm0=(2*CrE * FIX(0.40200))
+    vpmulhw     ymm1, ymm1, [GOTOFF(eax,PW_F0402)]   ; ymm1=(2*CrO * FIX(0.40200))
+
+    vpaddw      ymm4, ymm4, [GOTOFF(eax,PW_ONE)]
+    vpaddw      ymm5, ymm5, [GOTOFF(eax,PW_ONE)]
+    vpsraw      ymm4, ymm4, 1                        ; ymm4=(CbE * -FIX(0.22800))
+    vpsraw      ymm5, ymm5, 1                        ; ymm5=(CbO * -FIX(0.22800))
+    vpaddw      ymm0, ymm0, [GOTOFF(eax,PW_ONE)]
+    vpaddw      ymm1, ymm1, [GOTOFF(eax,PW_ONE)]
+    vpsraw      ymm0, ymm0, 1                        ; ymm0=(CrE * FIX(0.40200))
+    vpsraw      ymm1, ymm1, 1                        ; ymm1=(CrO * FIX(0.40200))
+
+    vpaddw      ymm4, ymm4, ymm2
+    vpaddw      ymm5, ymm5, ymm3
+    vpaddw      ymm4, ymm4, ymm2                     ; ymm4=(CbE * FIX(1.77200))=(B-Y)E
+    vpaddw      ymm5, ymm5, ymm3                     ; ymm5=(CbO * FIX(1.77200))=(B-Y)O
+    vpaddw      ymm0, ymm0, ymm6                     ; ymm0=(CrE * FIX(1.40200))=(R-Y)E
+    vpaddw      ymm1, ymm1, ymm7                     ; ymm1=(CrO * FIX(1.40200))=(R-Y)O
+
+    vmovdqa     YMMWORD [wk(0)], ymm4                ; wk(0)=(B-Y)E
+    vmovdqa     YMMWORD [wk(1)], ymm5                ; wk(1)=(B-Y)O
+
+    vpunpckhwd  ymm4, ymm2, ymm6
+    vpunpcklwd  ymm2, ymm2, ymm6
+    vpmaddwd    ymm2, ymm2, [GOTOFF(eax,PW_MF0344_F0285)]
+    vpmaddwd    ymm4, ymm4, [GOTOFF(eax,PW_MF0344_F0285)]
+    vpunpckhwd  ymm5, ymm3, ymm7
+    vpunpcklwd  ymm3, ymm3, ymm7
+    vpmaddwd    ymm3, ymm3, [GOTOFF(eax,PW_MF0344_F0285)]
+    vpmaddwd    ymm5, ymm5, [GOTOFF(eax,PW_MF0344_F0285)]
+
+    vpaddd      ymm2, ymm2, [GOTOFF(eax,PD_ONEHALF)]
+    vpaddd      ymm4, ymm4, [GOTOFF(eax,PD_ONEHALF)]
+    vpsrad      ymm2, ymm2, SCALEBITS
+    vpsrad      ymm4, ymm4, SCALEBITS
+    vpaddd      ymm3, ymm3, [GOTOFF(eax,PD_ONEHALF)]
+    vpaddd      ymm5, ymm5, [GOTOFF(eax,PD_ONEHALF)]
+    vpsrad      ymm3, ymm3, SCALEBITS
+    vpsrad      ymm5, ymm5, SCALEBITS
+
+    vpackssdw   ymm2, ymm2, ymm4             ; ymm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
+    vpackssdw   ymm3, ymm3, ymm5             ; ymm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
+    vpsubw      ymm2, ymm2, ymm6             ; ymm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
+    vpsubw      ymm3, ymm3, ymm7             ; ymm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O
+
+    vmovdqu     ymm5, YMMWORD [esi]          ; ymm5=Y(0123456789ABCDEFGHIJKLMNOPQRSTUV)
+
+    vpcmpeqw    ymm4, ymm4, ymm4
+    vpsrlw      ymm4, ymm4, BYTE_BIT         ; ymm4={0xFF 0x00 0xFF 0x00 ..}
+    vpand       ymm4, ymm4, ymm5             ; ymm4=Y(02468ACEGIKMOQSU)=YE
+    vpsrlw      ymm5, ymm5, BYTE_BIT         ; ymm5=Y(13579BDFHJLNPRTV)=YO
+
+    vpaddw      ymm0, ymm0, ymm4             ; ymm0=((R-Y)E+YE)=RE=R(02468ACEGIKMOQSU)
+    vpaddw      ymm1, ymm1, ymm5             ; ymm1=((R-Y)O+YO)=RO=R(13579BDFHJLNPRTV)
+    vpackuswb   ymm0, ymm0, ymm0             ; ymm0=R(02468ACE********GIKMOQSU********)
+    vpackuswb   ymm1, ymm1, ymm1             ; ymm1=R(13579BDF********HJLNPRTV********)
+
+    vpaddw      ymm2, ymm2, ymm4             ; ymm2=((G-Y)E+YE)=GE=G(02468ACEGIKMOQSU)
+    vpaddw      ymm3, ymm3, ymm5             ; ymm3=((G-Y)O+YO)=GO=G(13579BDFHJLNPRTV)
+    vpackuswb   ymm2, ymm2, ymm2             ; ymm2=G(02468ACE********GIKMOQSU********)
+    vpackuswb   ymm3, ymm3, ymm3             ; ymm3=G(13579BDF********HJLNPRTV********)
+
+    vpaddw      ymm4, ymm4, YMMWORD [wk(0)]  ; ymm4=(YE+(B-Y)E)=BE=B(02468ACEGIKMOQSU)
+    vpaddw      ymm5, ymm5, YMMWORD [wk(1)]  ; ymm5=(YO+(B-Y)O)=BO=B(13579BDFHJLNPRTV)
+    vpackuswb   ymm4, ymm4, ymm4             ; ymm4=B(02468ACE********GIKMOQSU********)
+    vpackuswb   ymm5, ymm5, ymm5             ; ymm5=B(13579BDF********HJLNPRTV********)
+
+%if RGB_PIXELSIZE == 3  ; ---------------
+
+    ; ymmA=(00 02 04 06 08 0A 0C 0E ** 0G 0I 0K 0M 0O 0Q 0S 0U **)
+    ; ymmB=(01 03 05 07 09 0B 0D 0F ** 0H 0J 0L 0N 0P 0R 0T 0V **)
+    ; ymmC=(10 12 14 16 18 1A 1C 1E ** 1G 1I 1K 1M 1O 1Q 1S 1U **)
+    ; ymmD=(11 13 15 17 19 1B 1D 1F ** 1H 1J 1L 1N 1P 1R 1T 1V **)
+    ; ymmE=(20 22 24 26 28 2A 2C 2E ** 2G 2I 2K 2M 2O 2Q 2S 2U **)
+    ; ymmF=(21 23 25 27 29 2B 2D 2F ** 2H 2J 2L 2N 2P 2R 2T 2V **)
+    ; ymmG=(** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **)
+    ; ymmH=(** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **)
+
+    vpunpcklbw  ymmA, ymmA, ymmC        ; ymmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E
+                                        ;       0G 1G 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U)
+    vpunpcklbw  ymmE, ymmE, ymmB        ; ymmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F
+                                        ;       2G 0H 2I 0J 2K 0L 2M 0N 2O 0P 2Q 0R 2S 0T 2U 0V)
+    vpunpcklbw  ymmD, ymmD, ymmF        ; ymmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F
+                                        ;       1H 2H 1J 2J 1L 2L 1N 2N 1P 2P 1R 2R 1T 2T 1V 2V)
+
+    vpsrldq     ymmH, ymmA, 2           ; ymmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E 0G 1G
+                                        ;       0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U -- --)
+    vpunpckhwd  ymmG, ymmA, ymmE        ; ymmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F
+                                        ;       0O 1O 2O 0P 0Q 1Q 2Q 0R 0S 1S 2S 0T 0U 1U 2U 0V)
+    vpunpcklwd  ymmA, ymmA, ymmE        ; ymmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07
+                                        ;       0G 1G 2G 0H 0I 1I 2I 0J 0K 1K 2K 0L 0M 1M 2M 0N)
+
+    vpsrldq     ymmE, ymmE, 2           ; ymmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F 2G 0H
+                                        ;       2I 0J 2K 0L 2M 0N 2O 0P 2Q 0R 2S 0T 2U 0V -- --)
+
+    vpsrldq     ymmB, ymmD, 2           ; ymmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F 1H 2H
+                                        ;       1J 2J 1L 2L 1N 2N 1P 2P 1R 2R 1T 2T 1V 2V -- --)
+    vpunpckhwd  ymmC, ymmD, ymmH        ; ymmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F 0G 1G
+                                        ;       1P 2P 0Q 1Q 1R 2R 0S 1S 1T 2T 0U 1U 1V 2V -- --)
+    vpunpcklwd  ymmD, ymmD, ymmH        ; ymmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18
+                                        ;       1H 2H 0I 1I 1J 2J 0K 1K 1L 2L 0M 1M 1N 2N 0O 1O)
+
+    vpunpckhwd  ymmF, ymmE, ymmB        ; ymmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F 2G 0H 1H 2H
+                                        ;       2Q 0R 1R 2R 2S 0T 1T 2T 2U 0V 1V 2V -- -- -- --)
+    vpunpcklwd  ymmE, ymmE, ymmB        ; ymmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29
+                                        ;       2I 0J 1J 2J 2K 0L 1L 2L 2M 0N 1N 2N 2O 0P 1P 2P)
+
+    vpshufd     ymmH, ymmA, 0x4E        ; ymmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03
+                                        ;       0K 1K 2K 0L 0M 1M 2M 0N 0G 1G 2G 0H 0I 1I 2I 0J)
+    vpunpckldq  ymmA, ymmA, ymmD        ; ymmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14
+                                        ;       0G 1G 2G 0H 1H 2H 0I 1I 0I 1I 2I 0J 1J 2J 0K 1K)
+    vpunpckhdq  ymmD, ymmD, ymmE        ; ymmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29
+                                        ;       1L 2L 0M 1M 2M 0N 1N 2N 1N 2N 0O 1O 2O 0P 1P 2P)
+    vpunpckldq  ymmE, ymmE, ymmH        ; ymmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07
+                                        ;       2I 0J 1J 2J 0K 1K 2K 0L 2K 0L 1L 2L 0M 1M 2M 0N)
+
+    vpshufd     ymmH, ymmG, 0x4E        ; ymmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B
+                                        ;       0S 1S 2S 0T 0U 1U 2U 0V 0O 1O 2O 0P 0Q 1Q 2Q 0R)
+    vpunpckldq  ymmG, ymmG, ymmC        ; ymmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C
+                                        ;       0O 1O 2O 0P 1P 2P 0Q 1Q 0Q 1Q 2Q 0R 1R 2R 0S 1S)
+    vpunpckhdq  ymmC, ymmC, ymmF        ; ymmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F 0G 1G 2G 0H 1H 2H
+                                        ;       1T 2T 0U 1U 2U 0V 1V 2V 1V 2V -- -- -- -- -- --)
+    vpunpckldq  ymmF, ymmF, ymmH        ; ymmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F
+                                        ;       2Q 0R 1R 2R 0S 1S 2S 0T 2S 0T 1T 2T 0U 1U 2U 0V)
+
+    vpunpcklqdq ymmH, ymmA, ymmE        ; ymmH=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+                                        ;       0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+    vpunpcklqdq ymmG, ymmD, ymmG        ; ymmG=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A
+                                        ;       1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q)
+    vpunpcklqdq ymmC, ymmF, ymmC        ; ymmC=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+                                        ;       2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+
+    vperm2i128  ymmA, ymmH, ymmG, 0x20  ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+                                        ;       15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+    vperm2i128  ymmD, ymmC, ymmH, 0x30  ; ymmD=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+                                        ;       0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+    vperm2i128  ymmF, ymmG, ymmC, 0x31  ; ymmF=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
+                                        ;       2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+
+    cmp         ecx, byte SIZEOF_YMMWORD
+    jb          short .column_st64
+
+    test        edi, SIZEOF_YMMWORD-1
+    jnz         short .out1
+    ; --(aligned)-------------------
+    vmovntdq    YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
+    vmovntdq    YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
+    vmovntdq    YMMWORD [edi+2*SIZEOF_YMMWORD], ymmF
+    jmp         short .out0
+.out1:  ; --(unaligned)-----------------
+    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
+    vmovdqu     YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
+    vmovdqu     YMMWORD [edi+2*SIZEOF_YMMWORD], ymmF
+.out0:
+    add         edi, byte RGB_PIXELSIZE*SIZEOF_YMMWORD  ; outptr
+    sub         ecx, byte SIZEOF_YMMWORD
+    jz          near .nextrow
+
+    add         esi, byte SIZEOF_YMMWORD  ; inptr0
+    add         ebx, byte SIZEOF_YMMWORD  ; inptr1
+    add         edx, byte SIZEOF_YMMWORD  ; inptr2
+    jmp         near .columnloop
+    alignx      16, 7
+
+.column_st64:
+    lea         ecx, [ecx+ecx*2]            ; imul ecx, RGB_PIXELSIZE
+    cmp         ecx, byte 2*SIZEOF_YMMWORD
+    jb          short .column_st32
+    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
+    vmovdqu     YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
+    add         edi, byte 2*SIZEOF_YMMWORD  ; outptr
+    vmovdqa     ymmA, ymmF
+    sub         ecx, byte 2*SIZEOF_YMMWORD
+    jmp         short .column_st31
+.column_st32:
+    cmp         ecx, byte SIZEOF_YMMWORD
+    jb          short .column_st31
+    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
+    add         edi, byte SIZEOF_YMMWORD    ; outptr
+    vmovdqa     ymmA, ymmD
+    sub         ecx, byte SIZEOF_YMMWORD
+    jmp         short .column_st31
+.column_st31:
+    cmp         ecx, byte SIZEOF_XMMWORD
+    jb          short .column_st15
+    vmovdqu     XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+    add         edi, byte SIZEOF_XMMWORD    ; outptr
+    vperm2i128  ymmA, ymmA, ymmA, 1
+    sub         ecx, byte SIZEOF_XMMWORD
+.column_st15:
+    ; Store the lower 8 bytes of xmmA to the output when it has enough
+    ; space.
+    cmp         ecx, byte SIZEOF_MMWORD
+    jb          short .column_st7
+    vmovq       XMM_MMWORD [edi], xmmA
+    add         edi, byte SIZEOF_MMWORD
+    sub         ecx, byte SIZEOF_MMWORD
+    vpsrldq     xmmA, xmmA, SIZEOF_MMWORD
+.column_st7:
+    ; Store the lower 4 bytes of xmmA to the output when it has enough
+    ; space.
+    cmp         ecx, byte SIZEOF_DWORD
+    jb          short .column_st3
+    vmovd       XMM_DWORD [edi], xmmA
+    add         edi, byte SIZEOF_DWORD
+    sub         ecx, byte SIZEOF_DWORD
+    vpsrldq     xmmA, xmmA, SIZEOF_DWORD
+.column_st3:
+    ; Store the lower 2 bytes of eax to the output when it has enough
+    ; space.
+    vmovd       eax, xmmA
+    cmp         ecx, byte SIZEOF_WORD
+    jb          short .column_st1
+    mov         word [edi], ax
+    add         edi, byte SIZEOF_WORD
+    sub         ecx, byte SIZEOF_WORD
+    shr         eax, 16
+.column_st1:
+    ; Store the lower 1 byte of eax to the output when it has enough
+    ; space.
+    test        ecx, ecx
+    jz          short .nextrow
+    mov         byte [edi], al
+
+%else  ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+    vpcmpeqb    ymm6, ymm6, ymm6        ; ymm6=XE=X(02468ACE********GIKMOQSU********)
+    vpcmpeqb    ymm7, ymm7, ymm7        ; ymm7=XO=X(13579BDF********HJLNPRTV********)
+%else
+    vpxor       ymm6, ymm6, ymm6        ; ymm6=XE=X(02468ACE********GIKMOQSU********)
+    vpxor       ymm7, ymm7, ymm7        ; ymm7=XO=X(13579BDF********HJLNPRTV********)
+%endif
+    ; ymmA=(00 02 04 06 08 0A 0C 0E ** 0G 0I 0K 0M 0O 0Q 0S 0U **)
+    ; ymmB=(01 03 05 07 09 0B 0D 0F ** 0H 0J 0L 0N 0P 0R 0T 0V **)
+    ; ymmC=(10 12 14 16 18 1A 1C 1E ** 1G 1I 1K 1M 1O 1Q 1S 1U **)
+    ; ymmD=(11 13 15 17 19 1B 1D 1F ** 1H 1J 1L 1N 1P 1R 1T 1V **)
+    ; ymmE=(20 22 24 26 28 2A 2C 2E ** 2G 2I 2K 2M 2O 2Q 2S 2U **)
+    ; ymmF=(21 23 25 27 29 2B 2D 2F ** 2H 2J 2L 2N 2P 2R 2T 2V **)
+    ; ymmG=(30 32 34 36 38 3A 3C 3E ** 3G 3I 3K 3M 3O 3Q 3S 3U **)
+    ; ymmH=(31 33 35 37 39 3B 3D 3F ** 3H 3J 3L 3N 3P 3R 3T 3V **)
+
+    vpunpcklbw  ymmA, ymmA, ymmC        ; ymmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E
+                                        ;       0G 1G 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U)
+    vpunpcklbw  ymmE, ymmE, ymmG        ; ymmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E
+                                        ;       2G 3G 2I 3I 2K 3K 2M 3M 2O 3O 2Q 3Q 2S 3S 2U 3U)
+    vpunpcklbw  ymmB, ymmB, ymmD        ; ymmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F
+                                        ;       0H 1H 0J 1J 0L 1L 0N 1N 0P 1P 0R 1R 0T 1T 0V 1V)
+    vpunpcklbw  ymmF, ymmF, ymmH        ; ymmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F
+                                        ;       2H 3H 2J 3J 2L 3L 2N 3N 2P 3P 2R 3R 2T 3T 2V 3V)
+
+    vpunpckhwd  ymmC, ymmA, ymmE        ; ymmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E
+                                        ;       0O 1O 2O 3O 0Q 1Q 2Q 3Q 0S 1S 2S 3S 0U 1U 2U 3U)
+    vpunpcklwd  ymmA, ymmA, ymmE        ; ymmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36
+                                        ;       0G 1G 2G 3G 0I 1I 2I 3I 0K 1K 2K 3K 0M 1M 2M 3M)
+    vpunpckhwd  ymmG, ymmB, ymmF        ; ymmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F
+                                        ;       0P 1P 2P 3P 0R 1R 2R 3R 0T 1T 2T 3T 0V 1V 2V 3V)
+    vpunpcklwd  ymmB, ymmB, ymmF        ; ymmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37
+                                        ;       0H 1H 2H 3H 0J 1J 2J 3J 0L 1L 2L 3L 0N 1N 2N 3N)
+
+    vpunpckhdq  ymmE, ymmA, ymmB        ; ymmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+                                        ;       0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+    vpunpckldq  ymmB, ymmA, ymmB        ; ymmB=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+                                        ;       0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J)
+    vpunpckhdq  ymmF, ymmC, ymmG        ; ymmF=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F
+                                        ;       0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+    vpunpckldq  ymmG, ymmC, ymmG        ; ymmG=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+                                        ;       0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R)
+
+    vperm2i128  ymmA, ymmB, ymmE, 0x20  ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+                                        ;       04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+    vperm2i128  ymmD, ymmG, ymmF, 0x20  ; ymmD=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+                                        ;       0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+    vperm2i128  ymmC, ymmB, ymmE, 0x31  ; ymmC=(0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J
+                                        ;       0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+    vperm2i128  ymmH, ymmG, ymmF, 0x31  ; ymmH=(0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R
+                                        ;       0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+
+    cmp         ecx, byte SIZEOF_YMMWORD
+    jb          short .column_st64
+
+    test        edi, SIZEOF_YMMWORD-1
+    jnz         short .out1
+    ; --(aligned)-------------------
+    vmovntdq    YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
+    vmovntdq    YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
+    vmovntdq    YMMWORD [edi+2*SIZEOF_YMMWORD], ymmC
+    vmovntdq    YMMWORD [edi+3*SIZEOF_YMMWORD], ymmH
+    jmp         short .out0
+.out1:  ; --(unaligned)-----------------
+    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
+    vmovdqu     YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
+    vmovdqu     YMMWORD [edi+2*SIZEOF_YMMWORD], ymmC
+    vmovdqu     YMMWORD [edi+3*SIZEOF_YMMWORD], ymmH
+.out0:
+    add         edi, RGB_PIXELSIZE*SIZEOF_YMMWORD  ; outptr
+    sub         ecx, byte SIZEOF_YMMWORD
+    jz          near .nextrow
+
+    add         esi, byte SIZEOF_YMMWORD  ; inptr0
+    add         ebx, byte SIZEOF_YMMWORD  ; inptr1
+    add         edx, byte SIZEOF_YMMWORD  ; inptr2
+    jmp         near .columnloop
+    alignx      16, 7
+
+.column_st64:
+    cmp         ecx, byte SIZEOF_YMMWORD/2
+    jb          short .column_st32
+    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
+    vmovdqu     YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
+    add         edi, byte 2*SIZEOF_YMMWORD  ; outptr
+    vmovdqa     ymmA, ymmC
+    vmovdqa     ymmD, ymmH
+    sub         ecx, byte SIZEOF_YMMWORD/2
+.column_st32:
+    cmp         ecx, byte SIZEOF_YMMWORD/4
+    jb          short .column_st16
+    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
+    add         edi, byte SIZEOF_YMMWORD    ; outptr
+    vmovdqa     ymmA, ymmD
+    sub         ecx, byte SIZEOF_YMMWORD/4
+.column_st16:
+    cmp         ecx, byte SIZEOF_YMMWORD/8
+    jb          short .column_st15
+    vmovdqu     XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+    vperm2i128  ymmA, ymmA, ymmA, 1
+    add         edi, byte SIZEOF_XMMWORD    ; outptr
+    sub         ecx, byte SIZEOF_YMMWORD/8
+.column_st15:
+    ; Store two pixels (8 bytes) of ymmA to the output when it has enough
+    ; space.
+    cmp         ecx, byte SIZEOF_YMMWORD/16
+    jb          short .column_st7
+    vmovq       MMWORD [edi], xmmA
+    add         edi, byte SIZEOF_YMMWORD/16*4
+    sub         ecx, byte SIZEOF_YMMWORD/16
+    vpsrldq     xmmA, SIZEOF_YMMWORD/16*4
+.column_st7:
+    ; Store one pixel (4 bytes) of ymmA to the output when it has enough
+    ; space.
+    test        ecx, ecx
+    jz          short .nextrow
+    vmovd       XMM_DWORD [edi], xmmA
+
+%endif  ; RGB_PIXELSIZE ; ---------------
+
+    alignx      16, 7
+
+.nextrow:
+    pop         ecx
+    pop         esi
+    pop         ebx
+    pop         edx
+    pop         edi
+    pop         eax
+
+    add         esi, byte SIZEOF_JSAMPROW
+    add         ebx, byte SIZEOF_JSAMPROW
+    add         edx, byte SIZEOF_JSAMPROW
+    add         edi, byte SIZEOF_JSAMPROW  ; output_buf
+    dec         eax                        ; num_rows
+    jg          near .rowloop
+
+    sfence                              ; flush the write buffer
+
+.return:
+    vzeroupper
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/i386/jdcolext-mmx.asm b/media/libjpeg/simd/i386/jdcolext-mmx.asm
new file mode 100644
index 0000000000..5813cfcb66
--- /dev/null
+++ b/media/libjpeg/simd/i386/jdcolext-mmx.asm
@@ -0,0 +1,404 @@
+;
+; jdcolext.asm - colorspace conversion (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_ycc_rgb_convert_mmx(JDIMENSION out_width, JSAMPIMAGE input_buf,
+;                           JDIMENSION input_row, JSAMPARRAY output_buf,
+;                           int num_rows)
+;
+
+%define out_width(b)   (b) + 8          ; JDIMENSION out_width
+%define input_buf(b)   (b) + 12         ; JSAMPIMAGE input_buf
+%define input_row(b)   (b) + 16         ; JDIMENSION input_row
+%define output_buf(b)  (b) + 20         ; JSAMPARRAY output_buf
+%define num_rows(b)    (b) + 24         ; int num_rows
+
+%define original_ebp   ebp + 0
+%define wk(i)          ebp - (WK_NUM - (i)) * SIZEOF_MMWORD
+                                        ; mmword wk[WK_NUM]
+%define WK_NUM         2
+%define gotptr         wk(0) - SIZEOF_POINTER  ; void * gotptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_ycc_rgb_convert_mmx)
+
+EXTN(jsimd_ycc_rgb_convert_mmx):
+    push        ebp
+    mov         eax, esp                    ; eax = original ebp
+    sub         esp, byte 4
+    and         esp, byte (-SIZEOF_MMWORD)  ; align to 64 bits
+    mov         [esp], eax
+    mov         ebp, esp                    ; ebp = aligned ebp
+    lea         esp, [wk(0)]
+    pushpic     eax                     ; make a room for GOT address
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+    movpic      POINTER [gotptr], ebx   ; save GOT address
+
+    mov         ecx, JDIMENSION [out_width(eax)]  ; num_cols
+    test        ecx, ecx
+    jz          near .return
+
+    push        ecx
+
+    mov         edi, JSAMPIMAGE [input_buf(eax)]
+    mov         ecx, JDIMENSION [input_row(eax)]
+    mov         esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
+    mov         ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
+    mov         edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
+    lea         esi, [esi+ecx*SIZEOF_JSAMPROW]
+    lea         ebx, [ebx+ecx*SIZEOF_JSAMPROW]
+    lea         edx, [edx+ecx*SIZEOF_JSAMPROW]
+
+    pop         ecx
+
+    mov         edi, JSAMPARRAY [output_buf(eax)]
+    mov         eax, INT [num_rows(eax)]
+    test        eax, eax
+    jle         near .return
+    alignx      16, 7
+.rowloop:
+    push        eax
+    push        edi
+    push        edx
+    push        ebx
+    push        esi
+    push        ecx                     ; col
+
+    mov         esi, JSAMPROW [esi]     ; inptr0
+    mov         ebx, JSAMPROW [ebx]     ; inptr1
+    mov         edx, JSAMPROW [edx]     ; inptr2
+    mov         edi, JSAMPROW [edi]     ; outptr
+    movpic      eax, POINTER [gotptr]   ; load GOT address (eax)
+    alignx      16, 7
+.columnloop:
+
+    movq        mm5, MMWORD [ebx]       ; mm5=Cb(01234567)
+    movq        mm1, MMWORD [edx]       ; mm1=Cr(01234567)
+
+    pcmpeqw     mm4, mm4
+    pcmpeqw     mm7, mm7
+    psrlw       mm4, BYTE_BIT
+    psllw       mm7, 7                  ; mm7={0xFF80 0xFF80 0xFF80 0xFF80}
+    movq        mm0, mm4                ; mm0=mm4={0xFF 0x00 0xFF 0x00 ..}
+
+    pand        mm4, mm5                ; mm4=Cb(0246)=CbE
+    psrlw       mm5, BYTE_BIT           ; mm5=Cb(1357)=CbO
+    pand        mm0, mm1                ; mm0=Cr(0246)=CrE
+    psrlw       mm1, BYTE_BIT           ; mm1=Cr(1357)=CrO
+
+    paddw       mm4, mm7
+    paddw       mm5, mm7
+    paddw       mm0, mm7
+    paddw       mm1, mm7
+
+    ; (Original)
+    ; R = Y                + 1.40200 * Cr
+    ; G = Y - 0.34414 * Cb - 0.71414 * Cr
+    ; B = Y + 1.77200 * Cb
+    ;
+    ; (This implementation)
+    ; R = Y                + 0.40200 * Cr + Cr
+    ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+    ; B = Y - 0.22800 * Cb + Cb + Cb
+
+    movq        mm2, mm4                ; mm2=CbE
+    movq        mm3, mm5                ; mm3=CbO
+    paddw       mm4, mm4                ; mm4=2*CbE
+    paddw       mm5, mm5                ; mm5=2*CbO
+    movq        mm6, mm0                ; mm6=CrE
+    movq        mm7, mm1                ; mm7=CrO
+    paddw       mm0, mm0                ; mm0=2*CrE
+    paddw       mm1, mm1                ; mm1=2*CrO
+
+    pmulhw      mm4, [GOTOFF(eax,PW_MF0228)]  ; mm4=(2*CbE * -FIX(0.22800))
+    pmulhw      mm5, [GOTOFF(eax,PW_MF0228)]  ; mm5=(2*CbO * -FIX(0.22800))
+    pmulhw      mm0, [GOTOFF(eax,PW_F0402)]   ; mm0=(2*CrE * FIX(0.40200))
+    pmulhw      mm1, [GOTOFF(eax,PW_F0402)]   ; mm1=(2*CrO * FIX(0.40200))
+
+    paddw       mm4, [GOTOFF(eax,PW_ONE)]
+    paddw       mm5, [GOTOFF(eax,PW_ONE)]
+    psraw       mm4, 1                  ; mm4=(CbE * -FIX(0.22800))
+    psraw       mm5, 1                  ; mm5=(CbO * -FIX(0.22800))
+    paddw       mm0, [GOTOFF(eax,PW_ONE)]
+    paddw       mm1, [GOTOFF(eax,PW_ONE)]
+    psraw       mm0, 1                  ; mm0=(CrE * FIX(0.40200))
+    psraw       mm1, 1                  ; mm1=(CrO * FIX(0.40200))
+
+    paddw       mm4, mm2
+    paddw       mm5, mm3
+    paddw       mm4, mm2                ; mm4=(CbE * FIX(1.77200))=(B-Y)E
+    paddw       mm5, mm3                ; mm5=(CbO * FIX(1.77200))=(B-Y)O
+    paddw       mm0, mm6                ; mm0=(CrE * FIX(1.40200))=(R-Y)E
+    paddw       mm1, mm7                ; mm1=(CrO * FIX(1.40200))=(R-Y)O
+
+    movq        MMWORD [wk(0)], mm4     ; wk(0)=(B-Y)E
+    movq        MMWORD [wk(1)], mm5     ; wk(1)=(B-Y)O
+
+    movq        mm4, mm2
+    movq        mm5, mm3
+    punpcklwd   mm2, mm6
+    punpckhwd   mm4, mm6
+    pmaddwd     mm2, [GOTOFF(eax,PW_MF0344_F0285)]
+    pmaddwd     mm4, [GOTOFF(eax,PW_MF0344_F0285)]
+    punpcklwd   mm3, mm7
+    punpckhwd   mm5, mm7
+    pmaddwd     mm3, [GOTOFF(eax,PW_MF0344_F0285)]
+    pmaddwd     mm5, [GOTOFF(eax,PW_MF0344_F0285)]
+
+    paddd       mm2, [GOTOFF(eax,PD_ONEHALF)]
+    paddd       mm4, [GOTOFF(eax,PD_ONEHALF)]
+    psrad       mm2, SCALEBITS
+    psrad       mm4, SCALEBITS
+    paddd       mm3, [GOTOFF(eax,PD_ONEHALF)]
+    paddd       mm5, [GOTOFF(eax,PD_ONEHALF)]
+    psrad       mm3, SCALEBITS
+    psrad       mm5, SCALEBITS
+
+    packssdw    mm2, mm4                ; mm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
+    packssdw    mm3, mm5                ; mm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
+    psubw       mm2, mm6                ; mm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
+    psubw       mm3, mm7                ; mm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O
+
+    movq        mm5, MMWORD [esi]       ; mm5=Y(01234567)
+
+    pcmpeqw     mm4, mm4
+    psrlw       mm4, BYTE_BIT           ; mm4={0xFF 0x00 0xFF 0x00 ..}
+    pand        mm4, mm5                ; mm4=Y(0246)=YE
+    psrlw       mm5, BYTE_BIT           ; mm5=Y(1357)=YO
+
+    paddw       mm0, mm4                ; mm0=((R-Y)E+YE)=RE=(R0 R2 R4 R6)
+    paddw       mm1, mm5                ; mm1=((R-Y)O+YO)=RO=(R1 R3 R5 R7)
+    packuswb    mm0, mm0                ; mm0=(R0 R2 R4 R6 ** ** ** **)
+    packuswb    mm1, mm1                ; mm1=(R1 R3 R5 R7 ** ** ** **)
+
+    paddw       mm2, mm4                ; mm2=((G-Y)E+YE)=GE=(G0 G2 G4 G6)
+    paddw       mm3, mm5                ; mm3=((G-Y)O+YO)=GO=(G1 G3 G5 G7)
+    packuswb    mm2, mm2                ; mm2=(G0 G2 G4 G6 ** ** ** **)
+    packuswb    mm3, mm3                ; mm3=(G1 G3 G5 G7 ** ** ** **)
+
+    paddw       mm4,  MMWORD [wk(0)]    ; mm4=(YE+(B-Y)E)=BE=(B0 B2 B4 B6)
+    paddw       mm5,  MMWORD [wk(1)]    ; mm5=(YO+(B-Y)O)=BO=(B1 B3 B5 B7)
+    packuswb    mm4, mm4                ; mm4=(B0 B2 B4 B6 ** ** ** **)
+    packuswb    mm5, mm5                ; mm5=(B1 B3 B5 B7 ** ** ** **)
+
+%if RGB_PIXELSIZE == 3  ; ---------------
+
+    ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
+    ; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
+    ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
+    ; mmG=(** ** ** ** ** ** ** **), mmH=(** ** ** ** ** ** ** **)
+
+    punpcklbw   mmA, mmC                ; mmA=(00 10 02 12 04 14 06 16)
+    punpcklbw   mmE, mmB                ; mmE=(20 01 22 03 24 05 26 07)
+    punpcklbw   mmD, mmF                ; mmD=(11 21 13 23 15 25 17 27)
+
+    movq        mmG, mmA
+    movq        mmH, mmA
+    punpcklwd   mmA, mmE                ; mmA=(00 10 20 01 02 12 22 03)
+    punpckhwd   mmG, mmE                ; mmG=(04 14 24 05 06 16 26 07)
+
+    psrlq       mmH, 2*BYTE_BIT         ; mmH=(02 12 04 14 06 16 -- --)
+    psrlq       mmE, 2*BYTE_BIT         ; mmE=(22 03 24 05 26 07 -- --)
+
+    movq        mmC, mmD
+    movq        mmB, mmD
+    punpcklwd   mmD, mmH                ; mmD=(11 21 02 12 13 23 04 14)
+    punpckhwd   mmC, mmH                ; mmC=(15 25 06 16 17 27 -- --)
+
+    psrlq       mmB, 2*BYTE_BIT         ; mmB=(13 23 15 25 17 27 -- --)
+
+    movq        mmF, mmE
+    punpcklwd   mmE, mmB                ; mmE=(22 03 13 23 24 05 15 25)
+    punpckhwd   mmF, mmB                ; mmF=(26 07 17 27 -- -- -- --)
+
+    punpckldq   mmA, mmD                ; mmA=(00 10 20 01 11 21 02 12)
+    punpckldq   mmE, mmG                ; mmE=(22 03 13 23 04 14 24 05)
+    punpckldq   mmC, mmF                ; mmC=(15 25 06 16 26 07 17 27)
+
+    cmp         ecx, byte SIZEOF_MMWORD
+    jb          short .column_st16
+
+    movq        MMWORD [edi+0*SIZEOF_MMWORD], mmA
+    movq        MMWORD [edi+1*SIZEOF_MMWORD], mmE
+    movq        MMWORD [edi+2*SIZEOF_MMWORD], mmC
+
+    sub         ecx, byte SIZEOF_MMWORD
+    jz          short .nextrow
+
+    add         esi, byte SIZEOF_MMWORD                ; inptr0
+    add         ebx, byte SIZEOF_MMWORD                ; inptr1
+    add         edx, byte SIZEOF_MMWORD                ; inptr2
+    add         edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD  ; outptr
+    jmp         near .columnloop
+    alignx      16, 7
+
+.column_st16:
+    lea         ecx, [ecx+ecx*2]        ; imul ecx, RGB_PIXELSIZE
+    cmp         ecx, byte 2*SIZEOF_MMWORD
+    jb          short .column_st8
+    movq        MMWORD [edi+0*SIZEOF_MMWORD], mmA
+    movq        MMWORD [edi+1*SIZEOF_MMWORD], mmE
+    movq        mmA, mmC
+    sub         ecx, byte 2*SIZEOF_MMWORD
+    add         edi, byte 2*SIZEOF_MMWORD
+    jmp         short .column_st4
+.column_st8:
+    cmp         ecx, byte SIZEOF_MMWORD
+    jb          short .column_st4
+    movq        MMWORD [edi+0*SIZEOF_MMWORD], mmA
+    movq        mmA, mmE
+    sub         ecx, byte SIZEOF_MMWORD
+    add         edi, byte SIZEOF_MMWORD
+.column_st4:
+    movd        eax, mmA
+    cmp         ecx, byte SIZEOF_DWORD
+    jb          short .column_st2
+    mov         dword [edi+0*SIZEOF_DWORD], eax
+    psrlq       mmA, DWORD_BIT
+    movd        eax, mmA
+    sub         ecx, byte SIZEOF_DWORD
+    add         edi, byte SIZEOF_DWORD
+.column_st2:
+    cmp         ecx, byte SIZEOF_WORD
+    jb          short .column_st1
+    mov         word [edi+0*SIZEOF_WORD], ax
+    shr         eax, WORD_BIT
+    sub         ecx, byte SIZEOF_WORD
+    add         edi, byte SIZEOF_WORD
+.column_st1:
+    cmp         ecx, byte SIZEOF_BYTE
+    jb          short .nextrow
+    mov         byte [edi+0*SIZEOF_BYTE], al
+
+%else  ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+    pcmpeqb     mm6, mm6                ; mm6=(X0 X2 X4 X6 ** ** ** **)
+    pcmpeqb     mm7, mm7                ; mm7=(X1 X3 X5 X7 ** ** ** **)
+%else
+    pxor        mm6, mm6                ; mm6=(X0 X2 X4 X6 ** ** ** **)
+    pxor        mm7, mm7                ; mm7=(X1 X3 X5 X7 ** ** ** **)
+%endif
+    ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
+    ; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
+    ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
+    ; mmG=(30 32 34 36 ** ** ** **), mmH=(31 33 35 37 ** ** ** **)
+
+    punpcklbw   mmA, mmC                ; mmA=(00 10 02 12 04 14 06 16)
+    punpcklbw   mmE, mmG                ; mmE=(20 30 22 32 24 34 26 36)
+    punpcklbw   mmB, mmD                ; mmB=(01 11 03 13 05 15 07 17)
+    punpcklbw   mmF, mmH                ; mmF=(21 31 23 33 25 35 27 37)
+
+    movq        mmC, mmA
+    punpcklwd   mmA, mmE                ; mmA=(00 10 20 30 02 12 22 32)
+    punpckhwd   mmC, mmE                ; mmC=(04 14 24 34 06 16 26 36)
+    movq        mmG, mmB
+    punpcklwd   mmB, mmF                ; mmB=(01 11 21 31 03 13 23 33)
+    punpckhwd   mmG, mmF                ; mmG=(05 15 25 35 07 17 27 37)
+
+    movq        mmD, mmA
+    punpckldq   mmA, mmB                ; mmA=(00 10 20 30 01 11 21 31)
+    punpckhdq   mmD, mmB                ; mmD=(02 12 22 32 03 13 23 33)
+    movq        mmH, mmC
+    punpckldq   mmC, mmG                ; mmC=(04 14 24 34 05 15 25 35)
+    punpckhdq   mmH, mmG                ; mmH=(06 16 26 36 07 17 27 37)
+
+    cmp         ecx, byte SIZEOF_MMWORD
+    jb          short .column_st16
+
+    movq        MMWORD [edi+0*SIZEOF_MMWORD], mmA
+    movq        MMWORD [edi+1*SIZEOF_MMWORD], mmD
+    movq        MMWORD [edi+2*SIZEOF_MMWORD], mmC
+    movq        MMWORD [edi+3*SIZEOF_MMWORD], mmH
+
+    sub         ecx, byte SIZEOF_MMWORD
+    jz          short .nextrow
+
+    add         esi, byte SIZEOF_MMWORD                ; inptr0
+    add         ebx, byte SIZEOF_MMWORD                ; inptr1
+    add         edx, byte SIZEOF_MMWORD                ; inptr2
+    add         edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD  ; outptr
+    jmp         near .columnloop
+    alignx      16, 7
+
+.column_st16:
+    cmp         ecx, byte SIZEOF_MMWORD/2
+    jb          short .column_st8
+    movq        MMWORD [edi+0*SIZEOF_MMWORD], mmA
+    movq        MMWORD [edi+1*SIZEOF_MMWORD], mmD
+    movq        mmA, mmC
+    movq        mmD, mmH
+    sub         ecx, byte SIZEOF_MMWORD/2
+    add         edi, byte 2*SIZEOF_MMWORD
+.column_st8:
+    cmp         ecx, byte SIZEOF_MMWORD/4
+    jb          short .column_st4
+    movq        MMWORD [edi+0*SIZEOF_MMWORD], mmA
+    movq        mmA, mmD
+    sub         ecx, byte SIZEOF_MMWORD/4
+    add         edi, byte 1*SIZEOF_MMWORD
+.column_st4:
+    cmp         ecx, byte SIZEOF_MMWORD/8
+    jb          short .nextrow
+    movd        dword [edi+0*SIZEOF_DWORD], mmA
+
+%endif  ; RGB_PIXELSIZE ; ---------------
+
+    alignx      16, 7
+
+.nextrow:
+    pop         ecx
+    pop         esi
+    pop         ebx
+    pop         edx
+    pop         edi
+    pop         eax
+
+    add         esi, byte SIZEOF_JSAMPROW
+    add         ebx, byte SIZEOF_JSAMPROW
+    add         edx, byte SIZEOF_JSAMPROW
+    add         edi, byte SIZEOF_JSAMPROW  ; output_buf
+    dec         eax                        ; num_rows
+    jg          near .rowloop
+
+    emms                                ; empty MMX state
+
+.return:
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/i386/jdcolext-sse2.asm b/media/libjpeg/simd/i386/jdcolext-sse2.asm
new file mode 100644
index 0000000000..d5572b3294
--- /dev/null
+++ b/media/libjpeg/simd/i386/jdcolext-sse2.asm
@@ -0,0 +1,458 @@
+;
+; jdcolext.asm - colorspace conversion (SSE2)
+;
+; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2012, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_ycc_rgb_convert_sse2(JDIMENSION out_width, JSAMPIMAGE input_buf,
+;                            JDIMENSION input_row, JSAMPARRAY output_buf,
+;                            int num_rows)
+;
+
+%define out_width(b)   (b) + 8          ; JDIMENSION out_width
+%define input_buf(b)   (b) + 12         ; JSAMPIMAGE input_buf
+%define input_row(b)   (b) + 16         ; JDIMENSION input_row
+%define output_buf(b)  (b) + 20         ; JSAMPARRAY output_buf
+%define num_rows(b)    (b) + 24         ; int num_rows
+
+%define original_ebp   ebp + 0
+%define wk(i)          ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+                                        ; xmmword wk[WK_NUM]
+%define WK_NUM         2
+%define gotptr         wk(0) - SIZEOF_POINTER  ; void * gotptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_ycc_rgb_convert_sse2)
+
+EXTN(jsimd_ycc_rgb_convert_sse2):
+    push        ebp
+    mov         eax, esp                     ; eax = original ebp
+    sub         esp, byte 4
+    and         esp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [esp], eax
+    mov         ebp, esp                     ; ebp = aligned ebp
+    lea         esp, [wk(0)]
+    pushpic     eax                     ; make a room for GOT address
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+    movpic      POINTER [gotptr], ebx   ; save GOT address
+
+    mov         ecx, JDIMENSION [out_width(eax)]  ; num_cols
+    test        ecx, ecx
+    jz          near .return
+
+    push        ecx
+
+    mov         edi, JSAMPIMAGE [input_buf(eax)]
+    mov         ecx, JDIMENSION [input_row(eax)]
+    mov         esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
+    mov         ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
+    mov         edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
+    lea         esi, [esi+ecx*SIZEOF_JSAMPROW]
+    lea         ebx, [ebx+ecx*SIZEOF_JSAMPROW]
+    lea         edx, [edx+ecx*SIZEOF_JSAMPROW]
+
+    pop         ecx
+
+    mov         edi, JSAMPARRAY [output_buf(eax)]
+    mov         eax, INT [num_rows(eax)]
+    test        eax, eax
+    jle         near .return
+    alignx      16, 7
+.rowloop:
+    push        eax
+    push        edi
+    push        edx
+    push        ebx
+    push        esi
+    push        ecx                     ; col
+
+    mov         esi, JSAMPROW [esi]     ; inptr0
+    mov         ebx, JSAMPROW [ebx]     ; inptr1
+    mov         edx, JSAMPROW [edx]     ; inptr2
+    mov         edi, JSAMPROW [edi]     ; outptr
+    movpic      eax, POINTER [gotptr]   ; load GOT address (eax)
+    alignx      16, 7
+.columnloop:
+
+    movdqa      xmm5, XMMWORD [ebx]     ; xmm5=Cb(0123456789ABCDEF)
+    movdqa      xmm1, XMMWORD [edx]     ; xmm1=Cr(0123456789ABCDEF)
+
+    pcmpeqw     xmm4, xmm4
+    pcmpeqw     xmm7, xmm7
+    psrlw       xmm4, BYTE_BIT
+    psllw       xmm7, 7                 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+    movdqa      xmm0, xmm4              ; xmm0=xmm4={0xFF 0x00 0xFF 0x00 ..}
+
+    pand        xmm4, xmm5              ; xmm4=Cb(02468ACE)=CbE
+    psrlw       xmm5, BYTE_BIT          ; xmm5=Cb(13579BDF)=CbO
+    pand        xmm0, xmm1              ; xmm0=Cr(02468ACE)=CrE
+    psrlw       xmm1, BYTE_BIT          ; xmm1=Cr(13579BDF)=CrO
+
+    paddw       xmm4, xmm7
+    paddw       xmm5, xmm7
+    paddw       xmm0, xmm7
+    paddw       xmm1, xmm7
+
+    ; (Original)
+    ; R = Y                + 1.40200 * Cr
+    ; G = Y - 0.34414 * Cb - 0.71414 * Cr
+    ; B = Y + 1.77200 * Cb
+    ;
+    ; (This implementation)
+    ; R = Y                + 0.40200 * Cr + Cr
+    ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+    ; B = Y - 0.22800 * Cb + Cb + Cb
+
+    movdqa      xmm2, xmm4              ; xmm2=CbE
+    movdqa      xmm3, xmm5              ; xmm3=CbO
+    paddw       xmm4, xmm4              ; xmm4=2*CbE
+    paddw       xmm5, xmm5              ; xmm5=2*CbO
+    movdqa      xmm6, xmm0              ; xmm6=CrE
+    movdqa      xmm7, xmm1              ; xmm7=CrO
+    paddw       xmm0, xmm0              ; xmm0=2*CrE
+    paddw       xmm1, xmm1              ; xmm1=2*CrO
+
+    pmulhw      xmm4, [GOTOFF(eax,PW_MF0228)]  ; xmm4=(2*CbE * -FIX(0.22800))
+    pmulhw      xmm5, [GOTOFF(eax,PW_MF0228)]  ; xmm5=(2*CbO * -FIX(0.22800))
+    pmulhw      xmm0, [GOTOFF(eax,PW_F0402)]   ; xmm0=(2*CrE * FIX(0.40200))
+    pmulhw      xmm1, [GOTOFF(eax,PW_F0402)]   ; xmm1=(2*CrO * FIX(0.40200))
+
+    paddw       xmm4, [GOTOFF(eax,PW_ONE)]
+    paddw       xmm5, [GOTOFF(eax,PW_ONE)]
+    psraw       xmm4, 1                 ; xmm4=(CbE * -FIX(0.22800))
+    psraw       xmm5, 1                 ; xmm5=(CbO * -FIX(0.22800))
+    paddw       xmm0, [GOTOFF(eax,PW_ONE)]
+    paddw       xmm1, [GOTOFF(eax,PW_ONE)]
+    psraw       xmm0, 1                 ; xmm0=(CrE * FIX(0.40200))
+    psraw       xmm1, 1                 ; xmm1=(CrO * FIX(0.40200))
+
+    paddw       xmm4, xmm2
+    paddw       xmm5, xmm3
+    paddw       xmm4, xmm2              ; xmm4=(CbE * FIX(1.77200))=(B-Y)E
+    paddw       xmm5, xmm3              ; xmm5=(CbO * FIX(1.77200))=(B-Y)O
+    paddw       xmm0, xmm6              ; xmm0=(CrE * FIX(1.40200))=(R-Y)E
+    paddw       xmm1, xmm7              ; xmm1=(CrO * FIX(1.40200))=(R-Y)O
+
+    movdqa      XMMWORD [wk(0)], xmm4   ; wk(0)=(B-Y)E
+    movdqa      XMMWORD [wk(1)], xmm5   ; wk(1)=(B-Y)O
+
+    movdqa      xmm4, xmm2
+    movdqa      xmm5, xmm3
+    punpcklwd   xmm2, xmm6
+    punpckhwd   xmm4, xmm6
+    pmaddwd     xmm2, [GOTOFF(eax,PW_MF0344_F0285)]
+    pmaddwd     xmm4, [GOTOFF(eax,PW_MF0344_F0285)]
+    punpcklwd   xmm3, xmm7
+    punpckhwd   xmm5, xmm7
+    pmaddwd     xmm3, [GOTOFF(eax,PW_MF0344_F0285)]
+    pmaddwd     xmm5, [GOTOFF(eax,PW_MF0344_F0285)]
+
+    paddd       xmm2, [GOTOFF(eax,PD_ONEHALF)]
+    paddd       xmm4, [GOTOFF(eax,PD_ONEHALF)]
+    psrad       xmm2, SCALEBITS
+    psrad       xmm4, SCALEBITS
+    paddd       xmm3, [GOTOFF(eax,PD_ONEHALF)]
+    paddd       xmm5, [GOTOFF(eax,PD_ONEHALF)]
+    psrad       xmm3, SCALEBITS
+    psrad       xmm5, SCALEBITS
+
+    packssdw    xmm2, xmm4              ; xmm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
+    packssdw    xmm3, xmm5              ; xmm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
+    psubw       xmm2, xmm6              ; xmm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
+    psubw       xmm3, xmm7              ; xmm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O
+
+    movdqa      xmm5, XMMWORD [esi]     ; xmm5=Y(0123456789ABCDEF)
+
+    pcmpeqw     xmm4, xmm4
+    psrlw       xmm4, BYTE_BIT          ; xmm4={0xFF 0x00 0xFF 0x00 ..}
+    pand        xmm4, xmm5              ; xmm4=Y(02468ACE)=YE
+    psrlw       xmm5, BYTE_BIT          ; xmm5=Y(13579BDF)=YO
+
+    paddw       xmm0, xmm4              ; xmm0=((R-Y)E+YE)=RE=R(02468ACE)
+    paddw       xmm1, xmm5              ; xmm1=((R-Y)O+YO)=RO=R(13579BDF)
+    packuswb    xmm0, xmm0              ; xmm0=R(02468ACE********)
+    packuswb    xmm1, xmm1              ; xmm1=R(13579BDF********)
+
+    paddw       xmm2, xmm4              ; xmm2=((G-Y)E+YE)=GE=G(02468ACE)
+    paddw       xmm3, xmm5              ; xmm3=((G-Y)O+YO)=GO=G(13579BDF)
+    packuswb    xmm2, xmm2              ; xmm2=G(02468ACE********)
+    packuswb    xmm3, xmm3              ; xmm3=G(13579BDF********)
+
+    paddw       xmm4, XMMWORD [wk(0)]   ; xmm4=(YE+(B-Y)E)=BE=B(02468ACE)
+    paddw       xmm5, XMMWORD [wk(1)]   ; xmm5=(YO+(B-Y)O)=BO=B(13579BDF)
+    packuswb    xmm4, xmm4              ; xmm4=B(02468ACE********)
+    packuswb    xmm5, xmm5              ; xmm5=B(13579BDF********)
+
+%if RGB_PIXELSIZE == 3  ; ---------------
+
+    ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+    ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+    ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+    ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
+
+    punpcklbw   xmmA, xmmC        ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+    punpcklbw   xmmE, xmmB        ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
+    punpcklbw   xmmD, xmmF        ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
+
+    movdqa      xmmG, xmmA
+    movdqa      xmmH, xmmA
+    punpcklwd   xmmA, xmmE        ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
+    punpckhwd   xmmG, xmmE        ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
+
+    psrldq      xmmH, 2           ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
+    psrldq      xmmE, 2           ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
+
+    movdqa      xmmC, xmmD
+    movdqa      xmmB, xmmD
+    punpcklwd   xmmD, xmmH        ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
+    punpckhwd   xmmC, xmmH        ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
+
+    psrldq      xmmB, 2           ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
+
+    movdqa      xmmF, xmmE
+    punpcklwd   xmmE, xmmB        ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
+    punpckhwd   xmmF, xmmB        ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
+
+    pshufd      xmmH, xmmA, 0x4E  ; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
+    movdqa      xmmB, xmmE
+    punpckldq   xmmA, xmmD        ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
+    punpckldq   xmmE, xmmH        ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
+    punpckhdq   xmmD, xmmB        ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
+
+    pshufd      xmmH, xmmG, 0x4E  ; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
+    movdqa      xmmB, xmmF
+    punpckldq   xmmG, xmmC        ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
+    punpckldq   xmmF, xmmH        ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
+    punpckhdq   xmmC, xmmB        ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
+
+    punpcklqdq  xmmA, xmmE        ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
+    punpcklqdq  xmmD, xmmG        ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+    punpcklqdq  xmmF, xmmC        ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
+
+    cmp         ecx, byte SIZEOF_XMMWORD
+    jb          short .column_st32
+
+    test        edi, SIZEOF_XMMWORD-1
+    jnz         short .out1
+    ; --(aligned)-------------------
+    movntdq     XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+    movntdq     XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+    movntdq     XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
+    jmp         short .out0
+.out1:  ; --(unaligned)-----------------
+    movdqu      XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+    movdqu      XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+    movdqu      XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
+.out0:
+    add         edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
+    sub         ecx, byte SIZEOF_XMMWORD
+    jz          near .nextrow
+
+    add         esi, byte SIZEOF_XMMWORD  ; inptr0
+    add         ebx, byte SIZEOF_XMMWORD  ; inptr1
+    add         edx, byte SIZEOF_XMMWORD  ; inptr2
+    jmp         near .columnloop
+    alignx      16, 7
+
+.column_st32:
+    lea         ecx, [ecx+ecx*2]        ; imul ecx, RGB_PIXELSIZE
+    cmp         ecx, byte 2*SIZEOF_XMMWORD
+    jb          short .column_st16
+    movdqu      XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+    movdqu      XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+    add         edi, byte 2*SIZEOF_XMMWORD  ; outptr
+    movdqa      xmmA, xmmF
+    sub         ecx, byte 2*SIZEOF_XMMWORD
+    jmp         short .column_st15
+.column_st16:
+    cmp         ecx, byte SIZEOF_XMMWORD
+    jb          short .column_st15
+    movdqu      XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+    add         edi, byte SIZEOF_XMMWORD    ; outptr
+    movdqa      xmmA, xmmD
+    sub         ecx, byte SIZEOF_XMMWORD
+.column_st15:
+    ; Store the lower 8 bytes of xmmA to the output when it has enough
+    ; space.
+    cmp         ecx, byte SIZEOF_MMWORD
+    jb          short .column_st7
+    movq        XMM_MMWORD [edi], xmmA
+    add         edi, byte SIZEOF_MMWORD
+    sub         ecx, byte SIZEOF_MMWORD
+    psrldq      xmmA, SIZEOF_MMWORD
+.column_st7:
+    ; Store the lower 4 bytes of xmmA to the output when it has enough
+    ; space.
+    cmp         ecx, byte SIZEOF_DWORD
+    jb          short .column_st3
+    movd        XMM_DWORD [edi], xmmA
+    add         edi, byte SIZEOF_DWORD
+    sub         ecx, byte SIZEOF_DWORD
+    psrldq      xmmA, SIZEOF_DWORD
+.column_st3:
+    ; Store the lower 2 bytes of eax to the output when it has enough
+    ; space.
+    movd        eax, xmmA
+    cmp         ecx, byte SIZEOF_WORD
+    jb          short .column_st1
+    mov         word [edi], ax
+    add         edi, byte SIZEOF_WORD
+    sub         ecx, byte SIZEOF_WORD
+    shr         eax, 16
+.column_st1:
+    ; Store the lower 1 byte of eax to the output when it has enough
+    ; space.
+    test        ecx, ecx
+    jz          short .nextrow
+    mov         byte [edi], al
+
+%else  ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+    pcmpeqb     xmm6, xmm6              ; xmm6=XE=X(02468ACE********)
+    pcmpeqb     xmm7, xmm7              ; xmm7=XO=X(13579BDF********)
+%else
+    pxor        xmm6, xmm6              ; xmm6=XE=X(02468ACE********)
+    pxor        xmm7, xmm7              ; xmm7=XO=X(13579BDF********)
+%endif
+    ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+    ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+    ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+    ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
+
+    punpcklbw   xmmA, xmmC  ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+    punpcklbw   xmmE, xmmG  ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
+    punpcklbw   xmmB, xmmD  ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
+    punpcklbw   xmmF, xmmH  ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
+
+    movdqa      xmmC, xmmA
+    punpcklwd   xmmA, xmmE  ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
+    punpckhwd   xmmC, xmmE  ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
+    movdqa      xmmG, xmmB
+    punpcklwd   xmmB, xmmF  ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
+    punpckhwd   xmmG, xmmF  ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
+
+    movdqa      xmmD, xmmA
+    punpckldq   xmmA, xmmB  ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
+    punpckhdq   xmmD, xmmB  ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+    movdqa      xmmH, xmmC
+    punpckldq   xmmC, xmmG  ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
+    punpckhdq   xmmH, xmmG  ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+
+    cmp         ecx, byte SIZEOF_XMMWORD
+    jb          short .column_st32
+
+    test        edi, SIZEOF_XMMWORD-1
+    jnz         short .out1
+    ; --(aligned)-------------------
+    movntdq     XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+    movntdq     XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+    movntdq     XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
+    movntdq     XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
+    jmp         short .out0
+.out1:  ; --(unaligned)-----------------
+    movdqu      XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+    movdqu      XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+    movdqu      XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
+    movdqu      XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
+.out0:
+    add         edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
+    sub         ecx, byte SIZEOF_XMMWORD
+    jz          near .nextrow
+
+    add         esi, byte SIZEOF_XMMWORD  ; inptr0
+    add         ebx, byte SIZEOF_XMMWORD  ; inptr1
+    add         edx, byte SIZEOF_XMMWORD  ; inptr2
+    jmp         near .columnloop
+    alignx      16, 7
+
+.column_st32:
+    cmp         ecx, byte SIZEOF_XMMWORD/2
+    jb          short .column_st16
+    movdqu      XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+    movdqu      XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+    add         edi, byte 2*SIZEOF_XMMWORD  ; outptr
+    movdqa      xmmA, xmmC
+    movdqa      xmmD, xmmH
+    sub         ecx, byte SIZEOF_XMMWORD/2
+.column_st16:
+    cmp         ecx, byte SIZEOF_XMMWORD/4
+    jb          short .column_st15
+    movdqu      XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+    add         edi, byte SIZEOF_XMMWORD    ; outptr
+    movdqa      xmmA, xmmD
+    sub         ecx, byte SIZEOF_XMMWORD/4
+.column_st15:
+    ; Store two pixels (8 bytes) of xmmA to the output when it has enough
+    ; space.
+    cmp         ecx, byte SIZEOF_XMMWORD/8
+    jb          short .column_st7
+    movq        XMM_MMWORD [edi], xmmA
+    add         edi, byte SIZEOF_XMMWORD/8*4
+    sub         ecx, byte SIZEOF_XMMWORD/8
+    psrldq      xmmA, SIZEOF_XMMWORD/8*4
+.column_st7:
+    ; Store one pixel (4 bytes) of xmmA to the output when it has enough
+    ; space.
+    test        ecx, ecx
+    jz          short .nextrow
+    movd        XMM_DWORD [edi], xmmA
+
+%endif  ; RGB_PIXELSIZE ; ---------------
+
+    alignx      16, 7
+
+.nextrow:
+    pop         ecx
+    pop         esi
+    pop         ebx
+    pop         edx
+    pop         edi
+    pop         eax
+
+    add         esi, byte SIZEOF_JSAMPROW
+    add         ebx, byte SIZEOF_JSAMPROW
+    add         edx, byte SIZEOF_JSAMPROW
+    add         edi, byte SIZEOF_JSAMPROW  ; output_buf
+    dec         eax                        ; num_rows
+    jg          near .rowloop
+
+    sfence                              ; flush the write buffer
+
+.return:
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/i386/jdcolor-avx2.asm b/media/libjpeg/simd/i386/jdcolor-avx2.asm
new file mode 100644
index 0000000000..e05b60d001
--- /dev/null
+++ b/media/libjpeg/simd/i386/jdcolor-avx2.asm
@@ -0,0 +1,118 @@
+;
+; jdcolor.asm - colorspace conversion (AVX2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2015, Intel Corporation.
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS  16
+
+F_0_344 equ  22554              ; FIX(0.34414)
+F_0_714 equ  46802              ; FIX(0.71414)
+F_1_402 equ  91881              ; FIX(1.40200)
+F_1_772 equ 116130              ; FIX(1.77200)
+F_0_402 equ (F_1_402 - 65536)   ; FIX(1.40200) - FIX(1)
+F_0_285 equ ( 65536 - F_0_714)  ; FIX(1) - FIX(0.71414)
+F_0_228 equ (131072 - F_1_772)  ; FIX(2) - FIX(1.77200)
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_ycc_rgb_convert_avx2)
+
+EXTN(jconst_ycc_rgb_convert_avx2):
+
+PW_F0402        times 16 dw  F_0_402
+PW_MF0228       times 16 dw -F_0_228
+PW_MF0344_F0285 times 8  dw -F_0_344, F_0_285
+PW_ONE          times 16 dw  1
+PD_ONEHALF      times 8  dd  1 << (SCALEBITS - 1)
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+
+%include "jdcolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGB_RED
+%define RGB_GREEN  EXT_RGB_GREEN
+%define RGB_BLUE  EXT_RGB_BLUE
+%define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+%define jsimd_ycc_rgb_convert_avx2  jsimd_ycc_extrgb_convert_avx2
+%include "jdcolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGBX_RED
+%define RGB_GREEN  EXT_RGBX_GREEN
+%define RGB_BLUE  EXT_RGBX_BLUE
+%define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+%define jsimd_ycc_rgb_convert_avx2  jsimd_ycc_extrgbx_convert_avx2
+%include "jdcolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGR_RED
+%define RGB_GREEN  EXT_BGR_GREEN
+%define RGB_BLUE  EXT_BGR_BLUE
+%define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+%define jsimd_ycc_rgb_convert_avx2  jsimd_ycc_extbgr_convert_avx2
+%include "jdcolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGRX_RED
+%define RGB_GREEN  EXT_BGRX_GREEN
+%define RGB_BLUE  EXT_BGRX_BLUE
+%define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+%define jsimd_ycc_rgb_convert_avx2  jsimd_ycc_extbgrx_convert_avx2
+%include "jdcolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XBGR_RED
+%define RGB_GREEN  EXT_XBGR_GREEN
+%define RGB_BLUE  EXT_XBGR_BLUE
+%define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+%define jsimd_ycc_rgb_convert_avx2  jsimd_ycc_extxbgr_convert_avx2
+%include "jdcolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XRGB_RED
+%define RGB_GREEN  EXT_XRGB_GREEN
+%define RGB_BLUE  EXT_XRGB_BLUE
+%define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+%define jsimd_ycc_rgb_convert_avx2  jsimd_ycc_extxrgb_convert_avx2
+%include "jdcolext-avx2.asm"
diff --git a/media/libjpeg/simd/i386/jdcolor-mmx.asm b/media/libjpeg/simd/i386/jdcolor-mmx.asm
new file mode 100644
index 0000000000..fb7e7bcce4
--- /dev/null
+++ b/media/libjpeg/simd/i386/jdcolor-mmx.asm
@@ -0,0 +1,117 @@
+;
+; jdcolor.asm - colorspace conversion (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS  16
+
+F_0_344 equ  22554              ; FIX(0.34414)
+F_0_714 equ  46802              ; FIX(0.71414)
+F_1_402 equ  91881              ; FIX(1.40200)
+F_1_772 equ 116130              ; FIX(1.77200)
+F_0_402 equ (F_1_402 - 65536)   ; FIX(1.40200) - FIX(1)
+F_0_285 equ ( 65536 - F_0_714)  ; FIX(1) - FIX(0.71414)
+F_0_228 equ (131072 - F_1_772)  ; FIX(2) - FIX(1.77200)
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_ycc_rgb_convert_mmx)
+
+EXTN(jconst_ycc_rgb_convert_mmx):
+
+PW_F0402        times 4 dw  F_0_402
+PW_MF0228       times 4 dw -F_0_228
+PW_MF0344_F0285 times 2 dw -F_0_344, F_0_285
+PW_ONE          times 4 dw  1
+PD_ONEHALF      times 2 dd  1 << (SCALEBITS - 1)
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+
+%include "jdcolext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGB_RED
+%define RGB_GREEN  EXT_RGB_GREEN
+%define RGB_BLUE  EXT_RGB_BLUE
+%define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+%define jsimd_ycc_rgb_convert_mmx  jsimd_ycc_extrgb_convert_mmx
+%include "jdcolext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGBX_RED
+%define RGB_GREEN  EXT_RGBX_GREEN
+%define RGB_BLUE  EXT_RGBX_BLUE
+%define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+%define jsimd_ycc_rgb_convert_mmx  jsimd_ycc_extrgbx_convert_mmx
+%include "jdcolext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGR_RED
+%define RGB_GREEN  EXT_BGR_GREEN
+%define RGB_BLUE  EXT_BGR_BLUE
+%define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+%define jsimd_ycc_rgb_convert_mmx  jsimd_ycc_extbgr_convert_mmx
+%include "jdcolext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGRX_RED
+%define RGB_GREEN  EXT_BGRX_GREEN
+%define RGB_BLUE  EXT_BGRX_BLUE
+%define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+%define jsimd_ycc_rgb_convert_mmx  jsimd_ycc_extbgrx_convert_mmx
+%include "jdcolext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XBGR_RED
+%define RGB_GREEN  EXT_XBGR_GREEN
+%define RGB_BLUE  EXT_XBGR_BLUE
+%define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+%define jsimd_ycc_rgb_convert_mmx  jsimd_ycc_extxbgr_convert_mmx
+%include "jdcolext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XRGB_RED
+%define RGB_GREEN  EXT_XRGB_GREEN
+%define RGB_BLUE  EXT_XRGB_BLUE
+%define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+%define jsimd_ycc_rgb_convert_mmx  jsimd_ycc_extxrgb_convert_mmx
+%include "jdcolext-mmx.asm"
diff --git a/media/libjpeg/simd/i386/jdcolor-sse2.asm b/media/libjpeg/simd/i386/jdcolor-sse2.asm
new file mode 100644
index 0000000000..b736255317
--- /dev/null
+++ b/media/libjpeg/simd/i386/jdcolor-sse2.asm
@@ -0,0 +1,117 @@
+;
+; jdcolor.asm - colorspace conversion (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS  16
+
+F_0_344 equ  22554              ; FIX(0.34414)
+F_0_714 equ  46802              ; FIX(0.71414)
+F_1_402 equ  91881              ; FIX(1.40200)
+F_1_772 equ 116130              ; FIX(1.77200)
+F_0_402 equ (F_1_402 - 65536)   ; FIX(1.40200) - FIX(1)
+F_0_285 equ ( 65536 - F_0_714)  ; FIX(1) - FIX(0.71414)
+F_0_228 equ (131072 - F_1_772)  ; FIX(2) - FIX(1.77200)
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_ycc_rgb_convert_sse2)
+
+EXTN(jconst_ycc_rgb_convert_sse2):
+
+PW_F0402        times 8 dw  F_0_402
+PW_MF0228       times 8 dw -F_0_228
+PW_MF0344_F0285 times 4 dw -F_0_344, F_0_285
+PW_ONE          times 8 dw  1
+PD_ONEHALF      times 4 dd  1 << (SCALEBITS - 1)
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+
+%include "jdcolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGB_RED
+%define RGB_GREEN  EXT_RGB_GREEN
+%define RGB_BLUE  EXT_RGB_BLUE
+%define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+%define jsimd_ycc_rgb_convert_sse2  jsimd_ycc_extrgb_convert_sse2
+%include "jdcolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGBX_RED
+%define RGB_GREEN  EXT_RGBX_GREEN
+%define RGB_BLUE  EXT_RGBX_BLUE
+%define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+%define jsimd_ycc_rgb_convert_sse2  jsimd_ycc_extrgbx_convert_sse2
+%include "jdcolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGR_RED
+%define RGB_GREEN  EXT_BGR_GREEN
+%define RGB_BLUE  EXT_BGR_BLUE
+%define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+%define jsimd_ycc_rgb_convert_sse2  jsimd_ycc_extbgr_convert_sse2
+%include "jdcolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGRX_RED
+%define RGB_GREEN  EXT_BGRX_GREEN
+%define RGB_BLUE  EXT_BGRX_BLUE
+%define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+%define jsimd_ycc_rgb_convert_sse2  jsimd_ycc_extbgrx_convert_sse2
+%include "jdcolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XBGR_RED
+%define RGB_GREEN  EXT_XBGR_GREEN
+%define RGB_BLUE  EXT_XBGR_BLUE
+%define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+%define jsimd_ycc_rgb_convert_sse2  jsimd_ycc_extxbgr_convert_sse2
+%include "jdcolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XRGB_RED
+%define RGB_GREEN  EXT_XRGB_GREEN
+%define RGB_BLUE  EXT_XRGB_BLUE
+%define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+%define jsimd_ycc_rgb_convert_sse2  jsimd_ycc_extxrgb_convert_sse2
+%include "jdcolext-sse2.asm"
diff --git a/media/libjpeg/simd/i386/jdmerge-avx2.asm b/media/libjpeg/simd/i386/jdmerge-avx2.asm
new file mode 100644
index 0000000000..711e6792d0
--- /dev/null
+++ b/media/libjpeg/simd/i386/jdmerge-avx2.asm
@@ -0,0 +1,136 @@
+;
+; jdmerge.asm - merged upsampling/color conversion (AVX2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS  16
+
+F_0_344 equ  22554              ; FIX(0.34414)
+F_0_714 equ  46802              ; FIX(0.71414)
+F_1_402 equ  91881              ; FIX(1.40200)
+F_1_772 equ 116130              ; FIX(1.77200)
+F_0_402 equ (F_1_402 - 65536)   ; FIX(1.40200) - FIX(1)
+F_0_285 equ ( 65536 - F_0_714)  ; FIX(1) - FIX(0.71414)
+F_0_228 equ (131072 - F_1_772)  ; FIX(2) - FIX(1.77200)
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_merged_upsample_avx2)
+
+EXTN(jconst_merged_upsample_avx2):
+
+PW_F0402        times 16 dw  F_0_402
+PW_MF0228       times 16 dw -F_0_228
+PW_MF0344_F0285 times 8  dw -F_0_344, F_0_285
+PW_ONE          times 16 dw  1
+PD_ONEHALF      times 8  dd  1 << (SCALEBITS - 1)
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+
+%include "jdmrgext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGB_RED
+%define RGB_GREEN  EXT_RGB_GREEN
+%define RGB_BLUE  EXT_RGB_BLUE
+%define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_avx2 \
+  jsimd_h2v1_extrgb_merged_upsample_avx2
+%define jsimd_h2v2_merged_upsample_avx2 \
+  jsimd_h2v2_extrgb_merged_upsample_avx2
+%include "jdmrgext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGBX_RED
+%define RGB_GREEN  EXT_RGBX_GREEN
+%define RGB_BLUE  EXT_RGBX_BLUE
+%define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_avx2 \
+  jsimd_h2v1_extrgbx_merged_upsample_avx2
+%define jsimd_h2v2_merged_upsample_avx2 \
+  jsimd_h2v2_extrgbx_merged_upsample_avx2
+%include "jdmrgext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGR_RED
+%define RGB_GREEN  EXT_BGR_GREEN
+%define RGB_BLUE  EXT_BGR_BLUE
+%define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_avx2 \
+  jsimd_h2v1_extbgr_merged_upsample_avx2
+%define jsimd_h2v2_merged_upsample_avx2 \
+  jsimd_h2v2_extbgr_merged_upsample_avx2
+%include "jdmrgext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGRX_RED
+%define RGB_GREEN  EXT_BGRX_GREEN
+%define RGB_BLUE  EXT_BGRX_BLUE
+%define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_avx2 \
+  jsimd_h2v1_extbgrx_merged_upsample_avx2
+%define jsimd_h2v2_merged_upsample_avx2 \
+  jsimd_h2v2_extbgrx_merged_upsample_avx2
+%include "jdmrgext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XBGR_RED
+%define RGB_GREEN  EXT_XBGR_GREEN
+%define RGB_BLUE  EXT_XBGR_BLUE
+%define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_avx2 \
+  jsimd_h2v1_extxbgr_merged_upsample_avx2
+%define jsimd_h2v2_merged_upsample_avx2 \
+  jsimd_h2v2_extxbgr_merged_upsample_avx2
+%include "jdmrgext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XRGB_RED
+%define RGB_GREEN  EXT_XRGB_GREEN
+%define RGB_BLUE  EXT_XRGB_BLUE
+%define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_avx2 \
+  jsimd_h2v1_extxrgb_merged_upsample_avx2
+%define jsimd_h2v2_merged_upsample_avx2 \
+  jsimd_h2v2_extxrgb_merged_upsample_avx2
+%include "jdmrgext-avx2.asm"
diff --git a/media/libjpeg/simd/i386/jdmerge-mmx.asm b/media/libjpeg/simd/i386/jdmerge-mmx.asm
new file mode 100644
index 0000000000..6e8311d408
--- /dev/null
+++ b/media/libjpeg/simd/i386/jdmerge-mmx.asm
@@ -0,0 +1,123 @@
+;
+; jdmerge.asm - merged upsampling/color conversion (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS  16
+
+F_0_344 equ  22554              ; FIX(0.34414)
+F_0_714 equ  46802              ; FIX(0.71414)
+F_1_402 equ  91881              ; FIX(1.40200)
+F_1_772 equ 116130              ; FIX(1.77200)
+F_0_402 equ (F_1_402 - 65536)   ; FIX(1.40200) - FIX(1)
+F_0_285 equ ( 65536 - F_0_714)  ; FIX(1) - FIX(0.71414)
+F_0_228 equ (131072 - F_1_772)  ; FIX(2) - FIX(1.77200)
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_merged_upsample_mmx)
+
+EXTN(jconst_merged_upsample_mmx):
+
+PW_F0402        times 4 dw  F_0_402
+PW_MF0228       times 4 dw -F_0_228
+PW_MF0344_F0285 times 2 dw -F_0_344, F_0_285
+PW_ONE          times 4 dw  1
+PD_ONEHALF      times 2 dd  1 << (SCALEBITS - 1)
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+
+%include "jdmrgext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGB_RED
+%define RGB_GREEN  EXT_RGB_GREEN
+%define RGB_BLUE  EXT_RGB_BLUE
+%define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_mmx  jsimd_h2v1_extrgb_merged_upsample_mmx
+%define jsimd_h2v2_merged_upsample_mmx  jsimd_h2v2_extrgb_merged_upsample_mmx
+%include "jdmrgext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGBX_RED
+%define RGB_GREEN  EXT_RGBX_GREEN
+%define RGB_BLUE  EXT_RGBX_BLUE
+%define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_mmx  jsimd_h2v1_extrgbx_merged_upsample_mmx
+%define jsimd_h2v2_merged_upsample_mmx  jsimd_h2v2_extrgbx_merged_upsample_mmx
+%include "jdmrgext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGR_RED
+%define RGB_GREEN  EXT_BGR_GREEN
+%define RGB_BLUE  EXT_BGR_BLUE
+%define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_mmx  jsimd_h2v1_extbgr_merged_upsample_mmx
+%define jsimd_h2v2_merged_upsample_mmx  jsimd_h2v2_extbgr_merged_upsample_mmx
+%include "jdmrgext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGRX_RED
+%define RGB_GREEN  EXT_BGRX_GREEN
+%define RGB_BLUE  EXT_BGRX_BLUE
+%define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_mmx  jsimd_h2v1_extbgrx_merged_upsample_mmx
+%define jsimd_h2v2_merged_upsample_mmx  jsimd_h2v2_extbgrx_merged_upsample_mmx
+%include "jdmrgext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XBGR_RED
+%define RGB_GREEN  EXT_XBGR_GREEN
+%define RGB_BLUE  EXT_XBGR_BLUE
+%define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_mmx  jsimd_h2v1_extxbgr_merged_upsample_mmx
+%define jsimd_h2v2_merged_upsample_mmx  jsimd_h2v2_extxbgr_merged_upsample_mmx
+%include "jdmrgext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XRGB_RED
+%define RGB_GREEN  EXT_XRGB_GREEN
+%define RGB_BLUE  EXT_XRGB_BLUE
+%define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_mmx  jsimd_h2v1_extxrgb_merged_upsample_mmx
+%define jsimd_h2v2_merged_upsample_mmx  jsimd_h2v2_extxrgb_merged_upsample_mmx
+%include "jdmrgext-mmx.asm"
diff --git a/media/libjpeg/simd/i386/jdmerge-sse2.asm b/media/libjpeg/simd/i386/jdmerge-sse2.asm
new file mode 100644
index 0000000000..e32f90aa17
--- /dev/null
+++ b/media/libjpeg/simd/i386/jdmerge-sse2.asm
@@ -0,0 +1,135 @@
+;
+; jdmerge.asm - merged upsampling/color conversion (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS  16
+
+F_0_344 equ  22554              ; FIX(0.34414)
+F_0_714 equ  46802              ; FIX(0.71414)
+F_1_402 equ  91881              ; FIX(1.40200)
+F_1_772 equ 116130              ; FIX(1.77200)
+F_0_402 equ (F_1_402 - 65536)   ; FIX(1.40200) - FIX(1)
+F_0_285 equ ( 65536 - F_0_714)  ; FIX(1) - FIX(0.71414)
+F_0_228 equ (131072 - F_1_772)  ; FIX(2) - FIX(1.77200)
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_merged_upsample_sse2)
+
+EXTN(jconst_merged_upsample_sse2):
+
+PW_F0402        times 8 dw  F_0_402
+PW_MF0228       times 8 dw -F_0_228
+PW_MF0344_F0285 times 4 dw -F_0_344, F_0_285
+PW_ONE          times 8 dw  1
+PD_ONEHALF      times 4 dd  1 << (SCALEBITS - 1)
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+
+%include "jdmrgext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGB_RED
+%define RGB_GREEN  EXT_RGB_GREEN
+%define RGB_BLUE  EXT_RGB_BLUE
+%define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_sse2 \
+  jsimd_h2v1_extrgb_merged_upsample_sse2
+%define jsimd_h2v2_merged_upsample_sse2 \
+  jsimd_h2v2_extrgb_merged_upsample_sse2
+%include "jdmrgext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGBX_RED
+%define RGB_GREEN  EXT_RGBX_GREEN
+%define RGB_BLUE  EXT_RGBX_BLUE
+%define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_sse2 \
+  jsimd_h2v1_extrgbx_merged_upsample_sse2
+%define jsimd_h2v2_merged_upsample_sse2 \
+  jsimd_h2v2_extrgbx_merged_upsample_sse2
+%include "jdmrgext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGR_RED
+%define RGB_GREEN  EXT_BGR_GREEN
+%define RGB_BLUE  EXT_BGR_BLUE
+%define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_sse2 \
+  jsimd_h2v1_extbgr_merged_upsample_sse2
+%define jsimd_h2v2_merged_upsample_sse2 \
+  jsimd_h2v2_extbgr_merged_upsample_sse2
+%include "jdmrgext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGRX_RED
+%define RGB_GREEN  EXT_BGRX_GREEN
+%define RGB_BLUE  EXT_BGRX_BLUE
+%define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_sse2 \
+  jsimd_h2v1_extbgrx_merged_upsample_sse2
+%define jsimd_h2v2_merged_upsample_sse2 \
+  jsimd_h2v2_extbgrx_merged_upsample_sse2
+%include "jdmrgext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XBGR_RED
+%define RGB_GREEN  EXT_XBGR_GREEN
+%define RGB_BLUE  EXT_XBGR_BLUE
+%define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_sse2 \
+  jsimd_h2v1_extxbgr_merged_upsample_sse2
+%define jsimd_h2v2_merged_upsample_sse2 \
+  jsimd_h2v2_extxbgr_merged_upsample_sse2
+%include "jdmrgext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XRGB_RED
+%define RGB_GREEN  EXT_XRGB_GREEN
+%define RGB_BLUE  EXT_XRGB_BLUE
+%define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_sse2 \
+  jsimd_h2v1_extxrgb_merged_upsample_sse2
+%define jsimd_h2v2_merged_upsample_sse2 \
+  jsimd_h2v2_extxrgb_merged_upsample_sse2
+%include "jdmrgext-sse2.asm"
diff --git a/media/libjpeg/simd/i386/jdmrgext-avx2.asm b/media/libjpeg/simd/i386/jdmrgext-avx2.asm
new file mode 100644
index 0000000000..e35f7282bc
--- /dev/null
+++ b/media/libjpeg/simd/i386/jdmrgext-avx2.asm
@@ -0,0 +1,575 @@
+;
+; jdmrgext.asm - merged upsampling/color conversion (AVX2)
+;
+; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2012, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
+;
+; GLOBAL(void)
+; jsimd_h2v1_merged_upsample_avx2(JDIMENSION output_width,
+;                                 JSAMPIMAGE input_buf,
+;                                 JDIMENSION in_row_group_ctr,
+;                                 JSAMPARRAY output_buf);
+;
+
+%define output_width(b)      (b) + 8    ; JDIMENSION output_width
+%define input_buf(b)         (b) + 12   ; JSAMPIMAGE input_buf
+%define in_row_group_ctr(b)  (b) + 16   ; JDIMENSION in_row_group_ctr
+%define output_buf(b)        (b) + 20   ; JSAMPARRAY output_buf
+
+%define original_ebp  ebp + 0
+%define wk(i)         ebp - (WK_NUM - (i)) * SIZEOF_YMMWORD
+                                        ; ymmword wk[WK_NUM]
+%define WK_NUM        3
+%define gotptr        wk(0) - SIZEOF_POINTER  ; void * gotptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v1_merged_upsample_avx2)
+
+EXTN(jsimd_h2v1_merged_upsample_avx2):
+    push        ebp
+    mov         eax, esp                     ; eax = original ebp
+    sub         esp, byte 4
+    and         esp, byte (-SIZEOF_YMMWORD)  ; align to 256 bits
+    mov         [esp], eax
+    mov         ebp, esp                     ; ebp = aligned ebp
+    lea         esp, [wk(0)]
+    pushpic     eax                     ; make a room for GOT address
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+    movpic      POINTER [gotptr], ebx   ; save GOT address
+
+    mov         ecx, JDIMENSION [output_width(eax)]  ; col
+    test        ecx, ecx
+    jz          near .return
+
+    push        ecx
+
+    mov         edi, JSAMPIMAGE [input_buf(eax)]
+    mov         ecx, JDIMENSION [in_row_group_ctr(eax)]
+    mov         esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
+    mov         ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
+    mov         edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
+    mov         edi, JSAMPARRAY [output_buf(eax)]
+    mov         esi, JSAMPROW [esi+ecx*SIZEOF_JSAMPROW]  ; inptr0
+    mov         ebx, JSAMPROW [ebx+ecx*SIZEOF_JSAMPROW]  ; inptr1
+    mov         edx, JSAMPROW [edx+ecx*SIZEOF_JSAMPROW]  ; inptr2
+    mov         edi, JSAMPROW [edi]                      ; outptr
+
+    pop         ecx                     ; col
+
+    alignx      16, 7
+.columnloop:
+    movpic      eax, POINTER [gotptr]   ; load GOT address (eax)
+
+    vmovdqu     ymm6, YMMWORD [ebx]     ; ymm6=Cb(0123456789ABCDEFGHIJKLMNOPQRSTUV)
+    vmovdqu     ymm7, YMMWORD [edx]     ; ymm7=Cr(0123456789ABCDEFGHIJKLMNOPQRSTUV)
+
+    vpxor       ymm1, ymm1, ymm1        ; ymm1=(all 0's)
+    vpcmpeqw    ymm3, ymm3, ymm3
+    vpsllw      ymm3, ymm3, 7           ; ymm3={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+
+    vpermq      ymm6, ymm6, 0xd8        ; ymm6=Cb(01234567GHIJKLMN89ABCDEFOPQRSTUV)
+    vpermq      ymm7, ymm7, 0xd8        ; ymm7=Cr(01234567GHIJKLMN89ABCDEFOPQRSTUV)
+    vpunpcklbw  ymm4, ymm6, ymm1        ; ymm4=Cb(0123456789ABCDEF)=CbL
+    vpunpckhbw  ymm6, ymm6, ymm1        ; ymm6=Cb(GHIJKLMNOPQRSTUV)=CbH
+    vpunpcklbw  ymm0, ymm7, ymm1        ; ymm0=Cr(0123456789ABCDEF)=CrL
+    vpunpckhbw  ymm7, ymm7, ymm1        ; ymm7=Cr(GHIJKLMNOPQRSTUV)=CrH
+
+    vpaddw      ymm5, ymm6, ymm3
+    vpaddw      ymm2, ymm4, ymm3
+    vpaddw      ymm1, ymm7, ymm3
+    vpaddw      ymm3, ymm0, ymm3
+
+    ; (Original)
+    ; R = Y                + 1.40200 * Cr
+    ; G = Y - 0.34414 * Cb - 0.71414 * Cr
+    ; B = Y + 1.77200 * Cb
+    ;
+    ; (This implementation)
+    ; R = Y                + 0.40200 * Cr + Cr
+    ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+    ; B = Y - 0.22800 * Cb + Cb + Cb
+
+    vpaddw      ymm6, ymm5, ymm5             ; ymm6=2*CbH
+    vpaddw      ymm4, ymm2, ymm2             ; ymm4=2*CbL
+    vpaddw      ymm7, ymm1, ymm1             ; ymm7=2*CrH
+    vpaddw      ymm0, ymm3, ymm3             ; ymm0=2*CrL
+
+    vpmulhw     ymm6, ymm6, [GOTOFF(eax,PW_MF0228)]  ; ymm6=(2*CbH * -FIX(0.22800))
+    vpmulhw     ymm4, ymm4, [GOTOFF(eax,PW_MF0228)]  ; ymm4=(2*CbL * -FIX(0.22800))
+    vpmulhw     ymm7, ymm7, [GOTOFF(eax,PW_F0402)]   ; ymm7=(2*CrH * FIX(0.40200))
+    vpmulhw     ymm0, ymm0, [GOTOFF(eax,PW_F0402)]   ; ymm0=(2*CrL * FIX(0.40200))
+
+    vpaddw      ymm6, ymm6, [GOTOFF(eax,PW_ONE)]
+    vpaddw      ymm4, ymm4, [GOTOFF(eax,PW_ONE)]
+    vpsraw      ymm6, ymm6, 1                     ; ymm6=(CbH * -FIX(0.22800))
+    vpsraw      ymm4, ymm4, 1                     ; ymm4=(CbL * -FIX(0.22800))
+    vpaddw      ymm7, ymm7, [GOTOFF(eax,PW_ONE)]
+    vpaddw      ymm0, ymm0, [GOTOFF(eax,PW_ONE)]
+    vpsraw      ymm7, ymm7, 1                     ; ymm7=(CrH * FIX(0.40200))
+    vpsraw      ymm0, ymm0, 1                     ; ymm0=(CrL * FIX(0.40200))
+
+    vpaddw      ymm6, ymm6, ymm5
+    vpaddw      ymm4, ymm4, ymm2
+    vpaddw      ymm6, ymm6, ymm5                  ; ymm6=(CbH * FIX(1.77200))=(B-Y)H
+    vpaddw      ymm4, ymm4, ymm2                  ; ymm4=(CbL * FIX(1.77200))=(B-Y)L
+    vpaddw      ymm7, ymm7, ymm1                  ; ymm7=(CrH * FIX(1.40200))=(R-Y)H
+    vpaddw      ymm0, ymm0, ymm3                  ; ymm0=(CrL * FIX(1.40200))=(R-Y)L
+
+    vmovdqa     YMMWORD [wk(0)], ymm6             ; wk(0)=(B-Y)H
+    vmovdqa     YMMWORD [wk(1)], ymm7             ; wk(1)=(R-Y)H
+
+    vpunpckhwd  ymm6, ymm5, ymm1
+    vpunpcklwd  ymm5, ymm5, ymm1
+    vpmaddwd    ymm5, ymm5, [GOTOFF(eax,PW_MF0344_F0285)]
+    vpmaddwd    ymm6, ymm6, [GOTOFF(eax,PW_MF0344_F0285)]
+    vpunpckhwd  ymm7, ymm2, ymm3
+    vpunpcklwd  ymm2, ymm2, ymm3
+    vpmaddwd    ymm2, ymm2, [GOTOFF(eax,PW_MF0344_F0285)]
+    vpmaddwd    ymm7, ymm7, [GOTOFF(eax,PW_MF0344_F0285)]
+
+    vpaddd      ymm5, ymm5, [GOTOFF(eax,PD_ONEHALF)]
+    vpaddd      ymm6, ymm6, [GOTOFF(eax,PD_ONEHALF)]
+    vpsrad      ymm5, ymm5, SCALEBITS
+    vpsrad      ymm6, ymm6, SCALEBITS
+    vpaddd      ymm2, ymm2, [GOTOFF(eax,PD_ONEHALF)]
+    vpaddd      ymm7, ymm7, [GOTOFF(eax,PD_ONEHALF)]
+    vpsrad      ymm2, ymm2, SCALEBITS
+    vpsrad      ymm7, ymm7, SCALEBITS
+
+    vpackssdw   ymm5, ymm5, ymm6        ; ymm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
+    vpackssdw   ymm2, ymm2, ymm7        ; ymm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
+    vpsubw      ymm5, ymm5, ymm1        ; ymm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
+    vpsubw      ymm2, ymm2, ymm3        ; ymm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L
+
+    vmovdqa     YMMWORD [wk(2)], ymm5   ; wk(2)=(G-Y)H
+
+    mov         al, 2                   ; Yctr
+    jmp         short .Yloop_1st
+    alignx      16, 7
+
+.Yloop_2nd:
+    vmovdqa     ymm0, YMMWORD [wk(1)]   ; ymm0=(R-Y)H
+    vmovdqa     ymm2, YMMWORD [wk(2)]   ; ymm2=(G-Y)H
+    vmovdqa     ymm4, YMMWORD [wk(0)]   ; ymm4=(B-Y)H
+    alignx      16, 7
+
+.Yloop_1st:
+    vmovdqu     ymm7, YMMWORD [esi]     ; ymm7=Y(0123456789ABCDEFGHIJKLMNOPQRSTUV)
+
+    vpcmpeqw    ymm6, ymm6, ymm6
+    vpsrlw      ymm6, ymm6, BYTE_BIT    ; ymm6={0xFF 0x00 0xFF 0x00 ..}
+    vpand       ymm6, ymm6, ymm7        ; ymm6=Y(02468ACEGIKMOQSU)=YE
+    vpsrlw      ymm7, ymm7, BYTE_BIT    ; ymm7=Y(13579BDFHJLNPRTV)=YO
+
+    vmovdqa     ymm1, ymm0              ; ymm1=ymm0=(R-Y)(L/H)
+    vmovdqa     ymm3, ymm2              ; ymm3=ymm2=(G-Y)(L/H)
+    vmovdqa     ymm5, ymm4              ; ymm5=ymm4=(B-Y)(L/H)
+
+    vpaddw      ymm0, ymm0, ymm6        ; ymm0=((R-Y)+YE)=RE=R(02468ACEGIKMOQSU)
+    vpaddw      ymm1, ymm1, ymm7        ; ymm1=((R-Y)+YO)=RO=R(13579BDFHJLNPRTV)
+    vpackuswb   ymm0, ymm0, ymm0        ; ymm0=R(02468ACE********GIKMOQSU********)
+    vpackuswb   ymm1, ymm1, ymm1        ; ymm1=R(13579BDF********HJLNPRTV********)
+
+    vpaddw      ymm2, ymm2, ymm6        ; ymm2=((G-Y)+YE)=GE=G(02468ACEGIKMOQSU)
+    vpaddw      ymm3, ymm3, ymm7        ; ymm3=((G-Y)+YO)=GO=G(13579BDFHJLNPRTV)
+    vpackuswb   ymm2, ymm2, ymm2        ; ymm2=G(02468ACE********GIKMOQSU********)
+    vpackuswb   ymm3, ymm3, ymm3        ; ymm3=G(13579BDF********HJLNPRTV********)
+
+    vpaddw      ymm4, ymm4, ymm6        ; ymm4=((B-Y)+YE)=BE=B(02468ACEGIKMOQSU)
+    vpaddw      ymm5, ymm5, ymm7        ; ymm5=((B-Y)+YO)=BO=B(13579BDFHJLNPRTV)
+    vpackuswb   ymm4, ymm4, ymm4        ; ymm4=B(02468ACE********GIKMOQSU********)
+    vpackuswb   ymm5, ymm5, ymm5        ; ymm5=B(13579BDF********HJLNPRTV********)
+
+%if RGB_PIXELSIZE == 3  ; ---------------
+
+    ; ymmA=(00 02 04 06 08 0A 0C 0E ** 0G 0I 0K 0M 0O 0Q 0S 0U **)
+    ; ymmB=(01 03 05 07 09 0B 0D 0F ** 0H 0J 0L 0N 0P 0R 0T 0V **)
+    ; ymmC=(10 12 14 16 18 1A 1C 1E ** 1G 1I 1K 1M 1O 1Q 1S 1U **)
+    ; ymmD=(11 13 15 17 19 1B 1D 1F ** 1H 1J 1L 1N 1P 1R 1T 1V **)
+    ; ymmE=(20 22 24 26 28 2A 2C 2E ** 2G 2I 2K 2M 2O 2Q 2S 2U **)
+    ; ymmF=(21 23 25 27 29 2B 2D 2F ** 2H 2J 2L 2N 2P 2R 2T 2V **)
+    ; ymmG=(** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **)
+    ; ymmH=(** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **)
+
+    vpunpcklbw  ymmA, ymmA, ymmC        ; ymmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E
+                                        ;       0G 1G 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U)
+    vpunpcklbw  ymmE, ymmE, ymmB        ; ymmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F
+                                        ;       2G 0H 2I 0J 2K 0L 2M 0N 2O 0P 2Q 0R 2S 0T 2U 0V)
+    vpunpcklbw  ymmD, ymmD, ymmF        ; ymmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F
+                                        ;       1H 2H 1J 2J 1L 2L 1N 2N 1P 2P 1R 2R 1T 2T 1V 2V)
+
+    vpsrldq     ymmH, ymmA, 2           ; ymmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E 0G 1G
+                                        ;       0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U -- --)
+    vpunpckhwd  ymmG, ymmA, ymmE        ; ymmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F
+                                        ;       0O 1O 2O 0P 0Q 1Q 2Q 0R 0S 1S 2S 0T 0U 1U 2U 0V)
+    vpunpcklwd  ymmA, ymmA, ymmE        ; ymmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07
+                                        ;       0G 1G 2G 0H 0I 1I 2I 0J 0K 1K 2K 0L 0M 1M 2M 0N)
+
+    vpsrldq     ymmE, ymmE, 2           ; ymmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F 2G 0H
+                                        ;       2I 0J 2K 0L 2M 0N 2O 0P 2Q 0R 2S 0T 2U 0V -- --)
+
+    vpsrldq     ymmB, ymmD, 2           ; ymmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F 1H 2H
+                                        ;       1J 2J 1L 2L 1N 2N 1P 2P 1R 2R 1T 2T 1V 2V -- --)
+    vpunpckhwd  ymmC, ymmD, ymmH        ; ymmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F 0G 1G
+                                        ;       1P 2P 0Q 1Q 1R 2R 0S 1S 1T 2T 0U 1U 1V 2V -- --)
+    vpunpcklwd  ymmD, ymmD, ymmH        ; ymmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18
+                                        ;       1H 2H 0I 1I 1J 2J 0K 1K 1L 2L 0M 1M 1N 2N 0O 1O)
+
+    vpunpckhwd  ymmF, ymmE, ymmB        ; ymmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F 2G 0H 1H 2H
+                                        ;       2Q 0R 1R 2R 2S 0T 1T 2T 2U 0V 1V 2V -- -- -- --)
+    vpunpcklwd  ymmE, ymmE, ymmB        ; ymmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29
+                                        ;       2I 0J 1J 2J 2K 0L 1L 2L 2M 0N 1N 2N 2O 0P 1P 2P)
+
+    vpshufd     ymmH, ymmA, 0x4E        ; ymmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03
+                                        ;       0K 1K 2K 0L 0M 1M 2M 0N 0G 1G 2G 0H 0I 1I 2I 0J)
+    vpunpckldq  ymmA, ymmA, ymmD        ; ymmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14
+                                        ;       0G 1G 2G 0H 1H 2H 0I 1I 0I 1I 2I 0J 1J 2J 0K 1K)
+    vpunpckhdq  ymmD, ymmD, ymmE        ; ymmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29
+                                        ;       1L 2L 0M 1M 2M 0N 1N 2N 1N 2N 0O 1O 2O 0P 1P 2P)
+    vpunpckldq  ymmE, ymmE, ymmH        ; ymmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07
+                                        ;       2I 0J 1J 2J 0K 1K 2K 0L 2K 0L 1L 2L 0M 1M 2M 0N)
+
+    vpshufd     ymmH, ymmG, 0x4E        ; ymmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B
+                                        ;       0S 1S 2S 0T 0U 1U 2U 0V 0O 1O 2O 0P 0Q 1Q 2Q 0R)
+    vpunpckldq  ymmG, ymmG, ymmC        ; ymmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C
+                                        ;       0O 1O 2O 0P 1P 2P 0Q 1Q 0Q 1Q 2Q 0R 1R 2R 0S 1S)
+    vpunpckhdq  ymmC, ymmC, ymmF        ; ymmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F 0G 1G 2G 0H 1H 2H
+                                        ;       1T 2T 0U 1U 2U 0V 1V 2V 1V 2V -- -- -- -- -- --)
+    vpunpckldq  ymmF, ymmF, ymmH        ; ymmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F
+                                        ;       2Q 0R 1R 2R 0S 1S 2S 0T 2S 0T 1T 2T 0U 1U 2U 0V)
+
+    vpunpcklqdq ymmH, ymmA, ymmE        ; ymmH=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+                                        ;       0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+    vpunpcklqdq ymmG, ymmD, ymmG        ; ymmG=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A
+                                        ;       1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q)
+    vpunpcklqdq ymmC, ymmF, ymmC        ; ymmC=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+                                        ;       2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+
+    vperm2i128  ymmA, ymmH, ymmG, 0x20  ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+                                        ;       15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+    vperm2i128  ymmD, ymmC, ymmH, 0x30  ; ymmD=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+                                        ;       0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+    vperm2i128  ymmF, ymmG, ymmC, 0x31  ; ymmF=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
+                                        ;       2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+
+    cmp         ecx, byte SIZEOF_YMMWORD
+    jb          short .column_st64
+
+    test        edi, SIZEOF_YMMWORD-1
+    jnz         short .out1
+    ; --(aligned)-------------------
+    vmovntdq    YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
+    vmovntdq    YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
+    vmovntdq    YMMWORD [edi+2*SIZEOF_YMMWORD], ymmF
+    jmp         short .out0
+.out1:  ; --(unaligned)-----------------
+    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
+    vmovdqu     YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
+    vmovdqu     YMMWORD [edi+2*SIZEOF_YMMWORD], ymmF
+.out0:
+    add         edi, byte RGB_PIXELSIZE*SIZEOF_YMMWORD  ; outptr
+    sub         ecx, byte SIZEOF_YMMWORD
+    jz          near .endcolumn
+
+    add         esi, byte SIZEOF_YMMWORD  ; inptr0
+    dec         al                        ; Yctr
+    jnz         near .Yloop_2nd
+
+    add         ebx, byte SIZEOF_YMMWORD  ; inptr1
+    add         edx, byte SIZEOF_YMMWORD  ; inptr2
+    jmp         near .columnloop
+    alignx      16, 7
+
+.column_st64:
+    lea         ecx, [ecx+ecx*2]            ; imul ecx, RGB_PIXELSIZE
+    cmp         ecx, byte 2*SIZEOF_YMMWORD
+    jb          short .column_st32
+    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
+    vmovdqu     YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
+    add         edi, byte 2*SIZEOF_YMMWORD  ; outptr
+    vmovdqa     ymmA, ymmF
+    sub         ecx, byte 2*SIZEOF_YMMWORD
+    jmp         short .column_st31
+.column_st32:
+    cmp         ecx, byte SIZEOF_YMMWORD
+    jb          short .column_st31
+    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
+    add         edi, byte SIZEOF_YMMWORD    ; outptr
+    vmovdqa     ymmA, ymmD
+    sub         ecx, byte SIZEOF_YMMWORD
+    jmp         short .column_st31
+.column_st31:
+    cmp         ecx, byte SIZEOF_XMMWORD
+    jb          short .column_st15
+    vmovdqu     XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+    add         edi, byte SIZEOF_XMMWORD    ; outptr
+    vperm2i128  ymmA, ymmA, ymmA, 1
+    sub         ecx, byte SIZEOF_XMMWORD
+.column_st15:
+    ; Store the lower 8 bytes of xmmA to the output when it has enough
+    ; space.
+    cmp         ecx, byte SIZEOF_MMWORD
+    jb          short .column_st7
+    vmovq       XMM_MMWORD [edi], xmmA
+    add         edi, byte SIZEOF_MMWORD
+    sub         ecx, byte SIZEOF_MMWORD
+    vpsrldq     xmmA, xmmA, SIZEOF_MMWORD
+.column_st7:
+    ; Store the lower 4 bytes of xmmA to the output when it has enough
+    ; space.
+    cmp         ecx, byte SIZEOF_DWORD
+    jb          short .column_st3
+    vmovd       XMM_DWORD [edi], xmmA
+    add         edi, byte SIZEOF_DWORD
+    sub         ecx, byte SIZEOF_DWORD
+    vpsrldq     xmmA, xmmA, SIZEOF_DWORD
+.column_st3:
+    ; Store the lower 2 bytes of eax to the output when it has enough
+    ; space.
+    vmovd       eax, xmmA
+    cmp         ecx, byte SIZEOF_WORD
+    jb          short .column_st1
+    mov         word [edi], ax
+    add         edi, byte SIZEOF_WORD
+    sub         ecx, byte SIZEOF_WORD
+    shr         eax, 16
+.column_st1:
+    ; Store the lower 1 byte of eax to the output when it has enough
+    ; space.
+    test        ecx, ecx
+    jz          short .endcolumn
+    mov         byte [edi], al
+
+%else  ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+    vpcmpeqb    ymm6, ymm6, ymm6        ; ymm6=XE=X(02468ACE********GIKMOQSU********)
+    vpcmpeqb    ymm7, ymm7, ymm7        ; ymm7=XO=X(13579BDF********HJLNPRTV********)
+%else
+    vpxor       ymm6, ymm6, ymm6        ; ymm6=XE=X(02468ACE********GIKMOQSU********)
+    vpxor       ymm7, ymm7, ymm7        ; ymm7=XO=X(13579BDF********HJLNPRTV********)
+%endif
+    ; ymmA=(00 02 04 06 08 0A 0C 0E ** 0G 0I 0K 0M 0O 0Q 0S 0U **)
+    ; ymmB=(01 03 05 07 09 0B 0D 0F ** 0H 0J 0L 0N 0P 0R 0T 0V **)
+    ; ymmC=(10 12 14 16 18 1A 1C 1E ** 1G 1I 1K 1M 1O 1Q 1S 1U **)
+    ; ymmD=(11 13 15 17 19 1B 1D 1F ** 1H 1J 1L 1N 1P 1R 1T 1V **)
+    ; ymmE=(20 22 24 26 28 2A 2C 2E ** 2G 2I 2K 2M 2O 2Q 2S 2U **)
+    ; ymmF=(21 23 25 27 29 2B 2D 2F ** 2H 2J 2L 2N 2P 2R 2T 2V **)
+    ; ymmG=(30 32 34 36 38 3A 3C 3E ** 3G 3I 3K 3M 3O 3Q 3S 3U **)
+    ; ymmH=(31 33 35 37 39 3B 3D 3F ** 3H 3J 3L 3N 3P 3R 3T 3V **)
+
+    vpunpcklbw  ymmA, ymmA, ymmC        ; ymmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E
+                                        ;       0G 1G 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U)
+    vpunpcklbw  ymmE, ymmE, ymmG        ; ymmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E
+                                        ;       2G 3G 2I 3I 2K 3K 2M 3M 2O 3O 2Q 3Q 2S 3S 2U 3U)
+    vpunpcklbw  ymmB, ymmB, ymmD        ; ymmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F
+                                        ;       0H 1H 0J 1J 0L 1L 0N 1N 0P 1P 0R 1R 0T 1T 0V 1V)
+    vpunpcklbw  ymmF, ymmF, ymmH        ; ymmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F
+                                        ;       2H 3H 2J 3J 2L 3L 2N 3N 2P 3P 2R 3R 2T 3T 2V 3V)
+
+    vpunpckhwd  ymmC, ymmA, ymmE        ; ymmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E
+                                        ;       0O 1O 2O 3O 0Q 1Q 2Q 3Q 0S 1S 2S 3S 0U 1U 2U 3U)
+    vpunpcklwd  ymmA, ymmA, ymmE        ; ymmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36
+                                        ;       0G 1G 2G 3G 0I 1I 2I 3I 0K 1K 2K 3K 0M 1M 2M 3M)
+    vpunpckhwd  ymmG, ymmB, ymmF        ; ymmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F
+                                        ;       0P 1P 2P 3P 0R 1R 2R 3R 0T 1T 2T 3T 0V 1V 2V 3V)
+    vpunpcklwd  ymmB, ymmB, ymmF        ; ymmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37
+                                        ;       0H 1H 2H 3H 0J 1J 2J 3J 0L 1L 2L 3L 0N 1N 2N 3N)
+
+    vpunpckhdq  ymmE, ymmA, ymmB        ; ymmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+                                        ;       0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+    vpunpckldq  ymmB, ymmA, ymmB        ; ymmB=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+                                        ;       0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J)
+    vpunpckhdq  ymmF, ymmC, ymmG        ; ymmF=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F
+                                        ;       0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+    vpunpckldq  ymmG, ymmC, ymmG        ; ymmG=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+                                        ;       0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R)
+
+    vperm2i128  ymmA, ymmB, ymmE, 0x20  ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+                                        ;       04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+    vperm2i128  ymmD, ymmG, ymmF, 0x20  ; ymmD=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+                                        ;       0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+    vperm2i128  ymmC, ymmB, ymmE, 0x31  ; ymmC=(0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J
+                                        ;       0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+    vperm2i128  ymmH, ymmG, ymmF, 0x31  ; ymmH=(0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R
+                                        ;       0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+
+    cmp         ecx, byte SIZEOF_YMMWORD
+    jb          short .column_st64
+
+    test        edi, SIZEOF_YMMWORD-1
+    jnz         short .out1
+    ; --(aligned)-------------------
+    vmovntdq    YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
+    vmovntdq    YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
+    vmovntdq    YMMWORD [edi+2*SIZEOF_YMMWORD], ymmC
+    vmovntdq    YMMWORD [edi+3*SIZEOF_YMMWORD], ymmH
+    jmp         short .out0
+.out1:  ; --(unaligned)-----------------
+    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
+    vmovdqu     YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
+    vmovdqu     YMMWORD [edi+2*SIZEOF_YMMWORD], ymmC
+    vmovdqu     YMMWORD [edi+3*SIZEOF_YMMWORD], ymmH
+.out0:
+    add         edi, RGB_PIXELSIZE*SIZEOF_YMMWORD  ; outptr
+    sub         ecx, byte SIZEOF_YMMWORD
+    jz          near .endcolumn
+
+    add         esi, byte SIZEOF_YMMWORD  ; inptr0
+    dec         al
+    jnz         near .Yloop_2nd
+
+    add         ebx, byte SIZEOF_YMMWORD  ; inptr1
+    add         edx, byte SIZEOF_YMMWORD  ; inptr2
+    jmp         near .columnloop
+    alignx      16, 7
+
+.column_st64:
+    cmp         ecx, byte SIZEOF_YMMWORD/2
+    jb          short .column_st32
+    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
+    vmovdqu     YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
+    add         edi, byte 2*SIZEOF_YMMWORD  ; outptr
+    vmovdqa     ymmA, ymmC
+    vmovdqa     ymmD, ymmH
+    sub         ecx, byte SIZEOF_YMMWORD/2
+.column_st32:
+    cmp         ecx, byte SIZEOF_YMMWORD/4
+    jb          short .column_st16
+    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
+    add         edi, byte SIZEOF_YMMWORD    ; outptr
+    vmovdqa     ymmA, ymmD
+    sub         ecx, byte SIZEOF_YMMWORD/4
+.column_st16:
+    cmp         ecx, byte SIZEOF_YMMWORD/8
+    jb          short .column_st15
+    vmovdqu     XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+    add         edi, byte SIZEOF_XMMWORD    ; outptr
+    vperm2i128  ymmA, ymmA, ymmA, 1
+    sub         ecx, byte SIZEOF_YMMWORD/8
+.column_st15:
+    ; Store two pixels (8 bytes) of ymmA to the output when it has enough
+    ; space.
+    cmp         ecx, byte SIZEOF_YMMWORD/16
+    jb          short .column_st7
+    vmovq       MMWORD [edi], xmmA
+    add         edi, byte SIZEOF_YMMWORD/16*4
+    sub         ecx, byte SIZEOF_YMMWORD/16
+    vpsrldq     xmmA, SIZEOF_YMMWORD/16*4
+.column_st7:
+    ; Store one pixel (4 bytes) of ymmA to the output when it has enough
+    ; space.
+    test        ecx, ecx
+    jz          short .endcolumn
+    vmovd       XMM_DWORD [edi], xmmA
+
+%endif  ; RGB_PIXELSIZE ; ---------------
+
+.endcolumn:
+    sfence                              ; flush the write buffer
+
+.return:
+    vzeroupper
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
+;
+; GLOBAL(void)
+; jsimd_h2v2_merged_upsample_avx2(JDIMENSION output_width,
+;                                 JSAMPIMAGE input_buf,
+;                                 JDIMENSION in_row_group_ctr,
+;                                 JSAMPARRAY output_buf);
+;
+
+%define output_width(b)      (b) + 8    ; JDIMENSION output_width
+%define input_buf(b)         (b) + 12   ; JSAMPIMAGE input_buf
+%define in_row_group_ctr(b)  (b) + 16   ; JDIMENSION in_row_group_ctr
+%define output_buf(b)        (b) + 20   ; JSAMPARRAY output_buf
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v2_merged_upsample_avx2)
+
+EXTN(jsimd_h2v2_merged_upsample_avx2):
+    push        ebp
+    mov         ebp, esp
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    mov         eax, POINTER [output_width(ebp)]
+
+    mov         edi, JSAMPIMAGE [input_buf(ebp)]
+    mov         ecx, JDIMENSION [in_row_group_ctr(ebp)]
+    mov         esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
+    mov         ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
+    mov         edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
+    mov         edi, JSAMPARRAY [output_buf(ebp)]
+    lea         esi, [esi+ecx*SIZEOF_JSAMPROW]
+
+    push        edx                     ; inptr2
+    push        ebx                     ; inptr1
+    push        esi                     ; inptr00
+    mov         ebx, esp
+
+    push        edi                     ; output_buf (outptr0)
+    push        ecx                     ; in_row_group_ctr
+    push        ebx                     ; input_buf
+    push        eax                     ; output_width
+
+    call        near EXTN(jsimd_h2v1_merged_upsample_avx2)
+
+    add         esi, byte SIZEOF_JSAMPROW  ; inptr01
+    add         edi, byte SIZEOF_JSAMPROW  ; outptr1
+    mov         POINTER [ebx+0*SIZEOF_POINTER], esi
+    mov         POINTER [ebx-1*SIZEOF_POINTER], edi
+
+    call        near EXTN(jsimd_h2v1_merged_upsample_avx2)
+
+    add         esp, byte 7*SIZEOF_DWORD
+
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/i386/jdmrgext-mmx.asm b/media/libjpeg/simd/i386/jdmrgext-mmx.asm
new file mode 100644
index 0000000000..eb3e36b475
--- /dev/null
+++ b/media/libjpeg/simd/i386/jdmrgext-mmx.asm
@@ -0,0 +1,460 @@
+;
+; jdmrgext.asm - merged upsampling/color conversion (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
+;
+; GLOBAL(void)
+; jsimd_h2v1_merged_upsample_mmx(JDIMENSION output_width, JSAMPIMAGE input_buf,
+;                                JDIMENSION in_row_group_ctr,
+;                                JSAMPARRAY output_buf);
+;
+
+%define output_width(b)      (b) + 8    ; JDIMENSION output_width
+%define input_buf(b)         (b) + 12   ; JSAMPIMAGE input_buf
+%define in_row_group_ctr(b)  (b) + 16   ; JDIMENSION in_row_group_ctr
+%define output_buf(b)        (b) + 20   ; JSAMPARRAY output_buf
+
+%define original_ebp  ebp + 0
+%define wk(i)         ebp - (WK_NUM - (i)) * SIZEOF_MMWORD  ; mmword wk[WK_NUM]
+%define WK_NUM        3
+%define gotptr        wk(0) - SIZEOF_POINTER  ; void * gotptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v1_merged_upsample_mmx)
+
+EXTN(jsimd_h2v1_merged_upsample_mmx):
+    push        ebp
+    mov         eax, esp                    ; eax = original ebp
+    sub         esp, byte 4
+    and         esp, byte (-SIZEOF_MMWORD)  ; align to 64 bits
+    mov         [esp], eax
+    mov         ebp, esp                    ; ebp = aligned ebp
+    lea         esp, [wk(0)]
+    pushpic     eax                     ; make a room for GOT address
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+    movpic      POINTER [gotptr], ebx   ; save GOT address
+
+    mov         ecx, JDIMENSION [output_width(eax)]  ; col
+    test        ecx, ecx
+    jz          near .return
+
+    push        ecx
+
+    mov         edi, JSAMPIMAGE [input_buf(eax)]
+    mov         ecx, JDIMENSION [in_row_group_ctr(eax)]
+    mov         esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
+    mov         ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
+    mov         edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
+    mov         edi, JSAMPARRAY [output_buf(eax)]
+    mov         esi, JSAMPROW [esi+ecx*SIZEOF_JSAMPROW]  ; inptr0
+    mov         ebx, JSAMPROW [ebx+ecx*SIZEOF_JSAMPROW]  ; inptr1
+    mov         edx, JSAMPROW [edx+ecx*SIZEOF_JSAMPROW]  ; inptr2
+    mov         edi, JSAMPROW [edi]                      ; outptr
+
+    pop         ecx                     ; col
+
+    alignx      16, 7
+.columnloop:
+    movpic      eax, POINTER [gotptr]   ; load GOT address (eax)
+
+    movq        mm6, MMWORD [ebx]       ; mm6=Cb(01234567)
+    movq        mm7, MMWORD [edx]       ; mm7=Cr(01234567)
+
+    pxor        mm1, mm1                ; mm1=(all 0's)
+    pcmpeqw     mm3, mm3
+    psllw       mm3, 7                  ; mm3={0xFF80 0xFF80 0xFF80 0xFF80}
+
+    movq        mm4, mm6
+    punpckhbw   mm6, mm1                ; mm6=Cb(4567)=CbH
+    punpcklbw   mm4, mm1                ; mm4=Cb(0123)=CbL
+    movq        mm0, mm7
+    punpckhbw   mm7, mm1                ; mm7=Cr(4567)=CrH
+    punpcklbw   mm0, mm1                ; mm0=Cr(0123)=CrL
+
+    paddw       mm6, mm3
+    paddw       mm4, mm3
+    paddw       mm7, mm3
+    paddw       mm0, mm3
+
+    ; (Original)
+    ; R = Y                + 1.40200 * Cr
+    ; G = Y - 0.34414 * Cb - 0.71414 * Cr
+    ; B = Y + 1.77200 * Cb
+    ;
+    ; (This implementation)
+    ; R = Y                + 0.40200 * Cr + Cr
+    ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+    ; B = Y - 0.22800 * Cb + Cb + Cb
+
+    movq        mm5, mm6                ; mm5=CbH
+    movq        mm2, mm4                ; mm2=CbL
+    paddw       mm6, mm6                ; mm6=2*CbH
+    paddw       mm4, mm4                ; mm4=2*CbL
+    movq        mm1, mm7                ; mm1=CrH
+    movq        mm3, mm0                ; mm3=CrL
+    paddw       mm7, mm7                ; mm7=2*CrH
+    paddw       mm0, mm0                ; mm0=2*CrL
+
+    pmulhw      mm6, [GOTOFF(eax,PW_MF0228)]  ; mm6=(2*CbH * -FIX(0.22800))
+    pmulhw      mm4, [GOTOFF(eax,PW_MF0228)]  ; mm4=(2*CbL * -FIX(0.22800))
+    pmulhw      mm7, [GOTOFF(eax,PW_F0402)]   ; mm7=(2*CrH * FIX(0.40200))
+    pmulhw      mm0, [GOTOFF(eax,PW_F0402)]   ; mm0=(2*CrL * FIX(0.40200))
+
+    paddw       mm6, [GOTOFF(eax,PW_ONE)]
+    paddw       mm4, [GOTOFF(eax,PW_ONE)]
+    psraw       mm6, 1                  ; mm6=(CbH * -FIX(0.22800))
+    psraw       mm4, 1                  ; mm4=(CbL * -FIX(0.22800))
+    paddw       mm7, [GOTOFF(eax,PW_ONE)]
+    paddw       mm0, [GOTOFF(eax,PW_ONE)]
+    psraw       mm7, 1                  ; mm7=(CrH * FIX(0.40200))
+    psraw       mm0, 1                  ; mm0=(CrL * FIX(0.40200))
+
+    paddw       mm6, mm5
+    paddw       mm4, mm2
+    paddw       mm6, mm5                ; mm6=(CbH * FIX(1.77200))=(B-Y)H
+    paddw       mm4, mm2                ; mm4=(CbL * FIX(1.77200))=(B-Y)L
+    paddw       mm7, mm1                ; mm7=(CrH * FIX(1.40200))=(R-Y)H
+    paddw       mm0, mm3                ; mm0=(CrL * FIX(1.40200))=(R-Y)L
+
+    movq        MMWORD [wk(0)], mm6     ; wk(0)=(B-Y)H
+    movq        MMWORD [wk(1)], mm7     ; wk(1)=(R-Y)H
+
+    movq        mm6, mm5
+    movq        mm7, mm2
+    punpcklwd   mm5, mm1
+    punpckhwd   mm6, mm1
+    pmaddwd     mm5, [GOTOFF(eax,PW_MF0344_F0285)]
+    pmaddwd     mm6, [GOTOFF(eax,PW_MF0344_F0285)]
+    punpcklwd   mm2, mm3
+    punpckhwd   mm7, mm3
+    pmaddwd     mm2, [GOTOFF(eax,PW_MF0344_F0285)]
+    pmaddwd     mm7, [GOTOFF(eax,PW_MF0344_F0285)]
+
+    paddd       mm5, [GOTOFF(eax,PD_ONEHALF)]
+    paddd       mm6, [GOTOFF(eax,PD_ONEHALF)]
+    psrad       mm5, SCALEBITS
+    psrad       mm6, SCALEBITS
+    paddd       mm2, [GOTOFF(eax,PD_ONEHALF)]
+    paddd       mm7, [GOTOFF(eax,PD_ONEHALF)]
+    psrad       mm2, SCALEBITS
+    psrad       mm7, SCALEBITS
+
+    packssdw    mm5, mm6                ; mm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
+    packssdw    mm2, mm7                ; mm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
+    psubw       mm5, mm1                ; mm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
+    psubw       mm2, mm3                ; mm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L
+
+    movq        MMWORD [wk(2)], mm5     ; wk(2)=(G-Y)H
+
+    mov         al, 2                   ; Yctr
+    jmp         short .Yloop_1st
+    alignx      16, 7
+
+.Yloop_2nd:
+    movq        mm0, MMWORD [wk(1)]     ; mm0=(R-Y)H
+    movq        mm2, MMWORD [wk(2)]     ; mm2=(G-Y)H
+    movq        mm4, MMWORD [wk(0)]     ; mm4=(B-Y)H
+    alignx      16, 7
+
+.Yloop_1st:
+    movq        mm7, MMWORD [esi]       ; mm7=Y(01234567)
+
+    pcmpeqw     mm6, mm6
+    psrlw       mm6, BYTE_BIT           ; mm6={0xFF 0x00 0xFF 0x00 ..}
+    pand        mm6, mm7                ; mm6=Y(0246)=YE
+    psrlw       mm7, BYTE_BIT           ; mm7=Y(1357)=YO
+
+    movq        mm1, mm0                ; mm1=mm0=(R-Y)(L/H)
+    movq        mm3, mm2                ; mm3=mm2=(G-Y)(L/H)
+    movq        mm5, mm4                ; mm5=mm4=(B-Y)(L/H)
+
+    paddw       mm0, mm6                ; mm0=((R-Y)+YE)=RE=(R0 R2 R4 R6)
+    paddw       mm1, mm7                ; mm1=((R-Y)+YO)=RO=(R1 R3 R5 R7)
+    packuswb    mm0, mm0                ; mm0=(R0 R2 R4 R6 ** ** ** **)
+    packuswb    mm1, mm1                ; mm1=(R1 R3 R5 R7 ** ** ** **)
+
+    paddw       mm2, mm6                ; mm2=((G-Y)+YE)=GE=(G0 G2 G4 G6)
+    paddw       mm3, mm7                ; mm3=((G-Y)+YO)=GO=(G1 G3 G5 G7)
+    packuswb    mm2, mm2                ; mm2=(G0 G2 G4 G6 ** ** ** **)
+    packuswb    mm3, mm3                ; mm3=(G1 G3 G5 G7 ** ** ** **)
+
+    paddw       mm4, mm6                ; mm4=((B-Y)+YE)=BE=(B0 B2 B4 B6)
+    paddw       mm5, mm7                ; mm5=((B-Y)+YO)=BO=(B1 B3 B5 B7)
+    packuswb    mm4, mm4                ; mm4=(B0 B2 B4 B6 ** ** ** **)
+    packuswb    mm5, mm5                ; mm5=(B1 B3 B5 B7 ** ** ** **)
+
+%if RGB_PIXELSIZE == 3  ; ---------------
+
+    ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
+    ; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
+    ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
+    ; mmG=(** ** ** ** ** ** ** **), mmH=(** ** ** ** ** ** ** **)
+
+    punpcklbw   mmA, mmC                ; mmA=(00 10 02 12 04 14 06 16)
+    punpcklbw   mmE, mmB                ; mmE=(20 01 22 03 24 05 26 07)
+    punpcklbw   mmD, mmF                ; mmD=(11 21 13 23 15 25 17 27)
+
+    movq        mmG, mmA
+    movq        mmH, mmA
+    punpcklwd   mmA, mmE                ; mmA=(00 10 20 01 02 12 22 03)
+    punpckhwd   mmG, mmE                ; mmG=(04 14 24 05 06 16 26 07)
+
+    psrlq       mmH, 2*BYTE_BIT         ; mmH=(02 12 04 14 06 16 -- --)
+    psrlq       mmE, 2*BYTE_BIT         ; mmE=(22 03 24 05 26 07 -- --)
+
+    movq        mmC, mmD
+    movq        mmB, mmD
+    punpcklwd   mmD, mmH                ; mmD=(11 21 02 12 13 23 04 14)
+    punpckhwd   mmC, mmH                ; mmC=(15 25 06 16 17 27 -- --)
+
+    psrlq       mmB, 2*BYTE_BIT         ; mmB=(13 23 15 25 17 27 -- --)
+
+    movq        mmF, mmE
+    punpcklwd   mmE, mmB                ; mmE=(22 03 13 23 24 05 15 25)
+    punpckhwd   mmF, mmB                ; mmF=(26 07 17 27 -- -- -- --)
+
+    punpckldq   mmA, mmD                ; mmA=(00 10 20 01 11 21 02 12)
+    punpckldq   mmE, mmG                ; mmE=(22 03 13 23 04 14 24 05)
+    punpckldq   mmC, mmF                ; mmC=(15 25 06 16 26 07 17 27)
+
+    cmp         ecx, byte SIZEOF_MMWORD
+    jb          short .column_st16
+
+    movq        MMWORD [edi+0*SIZEOF_MMWORD], mmA
+    movq        MMWORD [edi+1*SIZEOF_MMWORD], mmE
+    movq        MMWORD [edi+2*SIZEOF_MMWORD], mmC
+
+    sub         ecx, byte SIZEOF_MMWORD
+    jz          near .endcolumn
+
+    add         edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD  ; outptr
+    add         esi, byte SIZEOF_MMWORD                ; inptr0
+    dec         al                                     ; Yctr
+    jnz         near .Yloop_2nd
+
+    add         ebx, byte SIZEOF_MMWORD                ; inptr1
+    add         edx, byte SIZEOF_MMWORD                ; inptr2
+    jmp         near .columnloop
+    alignx      16, 7
+
+.column_st16:
+    lea         ecx, [ecx+ecx*2]        ; imul ecx, RGB_PIXELSIZE
+    cmp         ecx, byte 2*SIZEOF_MMWORD
+    jb          short .column_st8
+    movq        MMWORD [edi+0*SIZEOF_MMWORD], mmA
+    movq        MMWORD [edi+1*SIZEOF_MMWORD], mmE
+    movq        mmA, mmC
+    sub         ecx, byte 2*SIZEOF_MMWORD
+    add         edi, byte 2*SIZEOF_MMWORD
+    jmp         short .column_st4
+.column_st8:
+    cmp         ecx, byte SIZEOF_MMWORD
+    jb          short .column_st4
+    movq        MMWORD [edi+0*SIZEOF_MMWORD], mmA
+    movq        mmA, mmE
+    sub         ecx, byte SIZEOF_MMWORD
+    add         edi, byte SIZEOF_MMWORD
+.column_st4:
+    movd        eax, mmA
+    cmp         ecx, byte SIZEOF_DWORD
+    jb          short .column_st2
+    mov         dword [edi+0*SIZEOF_DWORD], eax
+    psrlq       mmA, DWORD_BIT
+    movd        eax, mmA
+    sub         ecx, byte SIZEOF_DWORD
+    add         edi, byte SIZEOF_DWORD
+.column_st2:
+    cmp         ecx, byte SIZEOF_WORD
+    jb          short .column_st1
+    mov         word [edi+0*SIZEOF_WORD], ax
+    shr         eax, WORD_BIT
+    sub         ecx, byte SIZEOF_WORD
+    add         edi, byte SIZEOF_WORD
+.column_st1:
+    cmp         ecx, byte SIZEOF_BYTE
+    jb          short .endcolumn
+    mov         byte [edi+0*SIZEOF_BYTE], al
+
+%else  ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+    pcmpeqb     mm6, mm6                ; mm6=(X0 X2 X4 X6 ** ** ** **)
+    pcmpeqb     mm7, mm7                ; mm7=(X1 X3 X5 X7 ** ** ** **)
+%else
+    pxor        mm6, mm6                ; mm6=(X0 X2 X4 X6 ** ** ** **)
+    pxor        mm7, mm7                ; mm7=(X1 X3 X5 X7 ** ** ** **)
+%endif
+    ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
+    ; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
+    ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
+    ; mmG=(30 32 34 36 ** ** ** **), mmH=(31 33 35 37 ** ** ** **)
+
+    punpcklbw   mmA, mmC                ; mmA=(00 10 02 12 04 14 06 16)
+    punpcklbw   mmE, mmG                ; mmE=(20 30 22 32 24 34 26 36)
+    punpcklbw   mmB, mmD                ; mmB=(01 11 03 13 05 15 07 17)
+    punpcklbw   mmF, mmH                ; mmF=(21 31 23 33 25 35 27 37)
+
+    movq        mmC, mmA
+    punpcklwd   mmA, mmE                ; mmA=(00 10 20 30 02 12 22 32)
+    punpckhwd   mmC, mmE                ; mmC=(04 14 24 34 06 16 26 36)
+    movq        mmG, mmB
+    punpcklwd   mmB, mmF                ; mmB=(01 11 21 31 03 13 23 33)
+    punpckhwd   mmG, mmF                ; mmG=(05 15 25 35 07 17 27 37)
+
+    movq        mmD, mmA
+    punpckldq   mmA, mmB                ; mmA=(00 10 20 30 01 11 21 31)
+    punpckhdq   mmD, mmB                ; mmD=(02 12 22 32 03 13 23 33)
+    movq        mmH, mmC
+    punpckldq   mmC, mmG                ; mmC=(04 14 24 34 05 15 25 35)
+    punpckhdq   mmH, mmG                ; mmH=(06 16 26 36 07 17 27 37)
+
+    cmp         ecx, byte SIZEOF_MMWORD
+    jb          short .column_st16
+
+    movq        MMWORD [edi+0*SIZEOF_MMWORD], mmA
+    movq        MMWORD [edi+1*SIZEOF_MMWORD], mmD
+    movq        MMWORD [edi+2*SIZEOF_MMWORD], mmC
+    movq        MMWORD [edi+3*SIZEOF_MMWORD], mmH
+
+    sub         ecx, byte SIZEOF_MMWORD
+    jz          short .endcolumn
+
+    add         edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD  ; outptr
+    add         esi, byte SIZEOF_MMWORD                ; inptr0
+    dec         al                                     ; Yctr
+    jnz         near .Yloop_2nd
+
+    add         ebx, byte SIZEOF_MMWORD                ; inptr1
+    add         edx, byte SIZEOF_MMWORD                ; inptr2
+    jmp         near .columnloop
+    alignx      16, 7
+
+.column_st16:
+    cmp         ecx, byte SIZEOF_MMWORD/2
+    jb          short .column_st8
+    movq        MMWORD [edi+0*SIZEOF_MMWORD], mmA
+    movq        MMWORD [edi+1*SIZEOF_MMWORD], mmD
+    movq        mmA, mmC
+    movq        mmD, mmH
+    sub         ecx, byte SIZEOF_MMWORD/2
+    add         edi, byte 2*SIZEOF_MMWORD
+.column_st8:
+    cmp         ecx, byte SIZEOF_MMWORD/4
+    jb          short .column_st4
+    movq        MMWORD [edi+0*SIZEOF_MMWORD], mmA
+    movq        mmA, mmD
+    sub         ecx, byte SIZEOF_MMWORD/4
+    add         edi, byte 1*SIZEOF_MMWORD
+.column_st4:
+    cmp         ecx, byte SIZEOF_MMWORD/8
+    jb          short .endcolumn
+    movd        dword [edi+0*SIZEOF_DWORD], mmA
+
+%endif  ; RGB_PIXELSIZE ; ---------------
+
+.endcolumn:
+    emms                                ; empty MMX state
+
+.return:
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
+;
+; GLOBAL(void)
+; jsimd_h2v2_merged_upsample_mmx(JDIMENSION output_width, JSAMPIMAGE input_buf,
+;                                JDIMENSION in_row_group_ctr,
+;                                JSAMPARRAY output_buf);
+;
+
+%define output_width(b)      (b) + 8    ; JDIMENSION output_width
+%define input_buf(b)         (b) + 12   ; JSAMPIMAGE input_buf
+%define in_row_group_ctr(b)  (b) + 16   ; JDIMENSION in_row_group_ctr
+%define output_buf(b)        (b) + 20   ; JSAMPARRAY output_buf
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v2_merged_upsample_mmx)
+
+EXTN(jsimd_h2v2_merged_upsample_mmx):
+    push        ebp
+    mov         ebp, esp
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    mov         eax, JDIMENSION [output_width(ebp)]
+
+    mov         edi, JSAMPIMAGE [input_buf(ebp)]
+    mov         ecx, JDIMENSION [in_row_group_ctr(ebp)]
+    mov         esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
+    mov         ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
+    mov         edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
+    mov         edi, JSAMPARRAY [output_buf(ebp)]
+    lea         esi, [esi+ecx*SIZEOF_JSAMPROW]
+
+    push        edx                     ; inptr2
+    push        ebx                     ; inptr1
+    push        esi                     ; inptr00
+    mov         ebx, esp
+
+    push        edi                     ; output_buf (outptr0)
+    push        ecx                     ; in_row_group_ctr
+    push        ebx                     ; input_buf
+    push        eax                     ; output_width
+
+    call        near EXTN(jsimd_h2v1_merged_upsample_mmx)
+
+    add         esi, byte SIZEOF_JSAMPROW  ; inptr01
+    add         edi, byte SIZEOF_JSAMPROW  ; outptr1
+    mov         POINTER [ebx+0*SIZEOF_POINTER], esi
+    mov         POINTER [ebx-1*SIZEOF_POINTER], edi
+
+    call        near EXTN(jsimd_h2v1_merged_upsample_mmx)
+
+    add         esp, byte 7*SIZEOF_DWORD
+
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/i386/jdmrgext-sse2.asm b/media/libjpeg/simd/i386/jdmrgext-sse2.asm
new file mode 100644
index 0000000000..c113dc4d27
--- /dev/null
+++ b/media/libjpeg/simd/i386/jdmrgext-sse2.asm
@@ -0,0 +1,517 @@
+;
+; jdmrgext.asm - merged upsampling/color conversion (SSE2)
+;
+; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2012, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
+;
+; GLOBAL(void)
+; jsimd_h2v1_merged_upsample_sse2(JDIMENSION output_width,
+;                                 JSAMPIMAGE input_buf,
+;                                 JDIMENSION in_row_group_ctr,
+;                                 JSAMPARRAY output_buf);
+;
+
+%define output_width(b)      (b) + 8    ; JDIMENSION output_width
+%define input_buf(b)         (b) + 12   ; JSAMPIMAGE input_buf
+%define in_row_group_ctr(b)  (b) + 16   ; JDIMENSION in_row_group_ctr
+%define output_buf(b)        (b) + 20   ; JSAMPARRAY output_buf
+
+%define original_ebp  ebp + 0
+%define wk(i)         ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+                                        ; xmmword wk[WK_NUM]
+%define WK_NUM        3
+%define gotptr        wk(0) - SIZEOF_POINTER  ; void * gotptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v1_merged_upsample_sse2)
+
+EXTN(jsimd_h2v1_merged_upsample_sse2):
+    push        ebp
+    mov         eax, esp                     ; eax = original ebp
+    sub         esp, byte 4
+    and         esp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [esp], eax
+    mov         ebp, esp                     ; ebp = aligned ebp
+    lea         esp, [wk(0)]
+    pushpic     eax                     ; make a room for GOT address
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+    movpic      POINTER [gotptr], ebx   ; save GOT address
+
+    mov         ecx, JDIMENSION [output_width(eax)]  ; col
+    test        ecx, ecx
+    jz          near .return
+
+    push        ecx
+
+    mov         edi, JSAMPIMAGE [input_buf(eax)]
+    mov         ecx, JDIMENSION [in_row_group_ctr(eax)]
+    mov         esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
+    mov         ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
+    mov         edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
+    mov         edi, JSAMPARRAY [output_buf(eax)]
+    mov         esi, JSAMPROW [esi+ecx*SIZEOF_JSAMPROW]  ; inptr0
+    mov         ebx, JSAMPROW [ebx+ecx*SIZEOF_JSAMPROW]  ; inptr1
+    mov         edx, JSAMPROW [edx+ecx*SIZEOF_JSAMPROW]  ; inptr2
+    mov         edi, JSAMPROW [edi]                      ; outptr
+
+    pop         ecx                     ; col
+
+    alignx      16, 7
+.columnloop:
+    movpic      eax, POINTER [gotptr]   ; load GOT address (eax)
+
+    movdqa      xmm6, XMMWORD [ebx]     ; xmm6=Cb(0123456789ABCDEF)
+    movdqa      xmm7, XMMWORD [edx]     ; xmm7=Cr(0123456789ABCDEF)
+
+    pxor        xmm1, xmm1              ; xmm1=(all 0's)
+    pcmpeqw     xmm3, xmm3
+    psllw       xmm3, 7                 ; xmm3={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+
+    movdqa      xmm4, xmm6
+    punpckhbw   xmm6, xmm1              ; xmm6=Cb(89ABCDEF)=CbH
+    punpcklbw   xmm4, xmm1              ; xmm4=Cb(01234567)=CbL
+    movdqa      xmm0, xmm7
+    punpckhbw   xmm7, xmm1              ; xmm7=Cr(89ABCDEF)=CrH
+    punpcklbw   xmm0, xmm1              ; xmm0=Cr(01234567)=CrL
+
+    paddw       xmm6, xmm3
+    paddw       xmm4, xmm3
+    paddw       xmm7, xmm3
+    paddw       xmm0, xmm3
+
+    ; (Original)
+    ; R = Y                + 1.40200 * Cr
+    ; G = Y - 0.34414 * Cb - 0.71414 * Cr
+    ; B = Y + 1.77200 * Cb
+    ;
+    ; (This implementation)
+    ; R = Y                + 0.40200 * Cr + Cr
+    ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+    ; B = Y - 0.22800 * Cb + Cb + Cb
+
+    movdqa      xmm5, xmm6              ; xmm5=CbH
+    movdqa      xmm2, xmm4              ; xmm2=CbL
+    paddw       xmm6, xmm6              ; xmm6=2*CbH
+    paddw       xmm4, xmm4              ; xmm4=2*CbL
+    movdqa      xmm1, xmm7              ; xmm1=CrH
+    movdqa      xmm3, xmm0              ; xmm3=CrL
+    paddw       xmm7, xmm7              ; xmm7=2*CrH
+    paddw       xmm0, xmm0              ; xmm0=2*CrL
+
+    pmulhw      xmm6, [GOTOFF(eax,PW_MF0228)]  ; xmm6=(2*CbH * -FIX(0.22800))
+    pmulhw      xmm4, [GOTOFF(eax,PW_MF0228)]  ; xmm4=(2*CbL * -FIX(0.22800))
+    pmulhw      xmm7, [GOTOFF(eax,PW_F0402)]   ; xmm7=(2*CrH * FIX(0.40200))
+    pmulhw      xmm0, [GOTOFF(eax,PW_F0402)]   ; xmm0=(2*CrL * FIX(0.40200))
+
+    paddw       xmm6, [GOTOFF(eax,PW_ONE)]
+    paddw       xmm4, [GOTOFF(eax,PW_ONE)]
+    psraw       xmm6, 1                 ; xmm6=(CbH * -FIX(0.22800))
+    psraw       xmm4, 1                 ; xmm4=(CbL * -FIX(0.22800))
+    paddw       xmm7, [GOTOFF(eax,PW_ONE)]
+    paddw       xmm0, [GOTOFF(eax,PW_ONE)]
+    psraw       xmm7, 1                 ; xmm7=(CrH * FIX(0.40200))
+    psraw       xmm0, 1                 ; xmm0=(CrL * FIX(0.40200))
+
+    paddw       xmm6, xmm5
+    paddw       xmm4, xmm2
+    paddw       xmm6, xmm5              ; xmm6=(CbH * FIX(1.77200))=(B-Y)H
+    paddw       xmm4, xmm2              ; xmm4=(CbL * FIX(1.77200))=(B-Y)L
+    paddw       xmm7, xmm1              ; xmm7=(CrH * FIX(1.40200))=(R-Y)H
+    paddw       xmm0, xmm3              ; xmm0=(CrL * FIX(1.40200))=(R-Y)L
+
+    movdqa      XMMWORD [wk(0)], xmm6   ; wk(0)=(B-Y)H
+    movdqa      XMMWORD [wk(1)], xmm7   ; wk(1)=(R-Y)H
+
+    movdqa      xmm6, xmm5
+    movdqa      xmm7, xmm2
+    punpcklwd   xmm5, xmm1
+    punpckhwd   xmm6, xmm1
+    pmaddwd     xmm5, [GOTOFF(eax,PW_MF0344_F0285)]
+    pmaddwd     xmm6, [GOTOFF(eax,PW_MF0344_F0285)]
+    punpcklwd   xmm2, xmm3
+    punpckhwd   xmm7, xmm3
+    pmaddwd     xmm2, [GOTOFF(eax,PW_MF0344_F0285)]
+    pmaddwd     xmm7, [GOTOFF(eax,PW_MF0344_F0285)]
+
+    paddd       xmm5, [GOTOFF(eax,PD_ONEHALF)]
+    paddd       xmm6, [GOTOFF(eax,PD_ONEHALF)]
+    psrad       xmm5, SCALEBITS
+    psrad       xmm6, SCALEBITS
+    paddd       xmm2, [GOTOFF(eax,PD_ONEHALF)]
+    paddd       xmm7, [GOTOFF(eax,PD_ONEHALF)]
+    psrad       xmm2, SCALEBITS
+    psrad       xmm7, SCALEBITS
+
+    packssdw    xmm5, xmm6              ; xmm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
+    packssdw    xmm2, xmm7              ; xmm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
+    psubw       xmm5, xmm1              ; xmm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
+    psubw       xmm2, xmm3              ; xmm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L
+
+    movdqa      XMMWORD [wk(2)], xmm5   ; wk(2)=(G-Y)H
+
+    mov         al, 2                   ; Yctr
+    jmp         short .Yloop_1st
+    alignx      16, 7
+
+.Yloop_2nd:
+    movdqa      xmm0, XMMWORD [wk(1)]   ; xmm0=(R-Y)H
+    movdqa      xmm2, XMMWORD [wk(2)]   ; xmm2=(G-Y)H
+    movdqa      xmm4, XMMWORD [wk(0)]   ; xmm4=(B-Y)H
+    alignx      16, 7
+
+.Yloop_1st:
+    movdqa      xmm7, XMMWORD [esi]     ; xmm7=Y(0123456789ABCDEF)
+
+    pcmpeqw     xmm6, xmm6
+    psrlw       xmm6, BYTE_BIT          ; xmm6={0xFF 0x00 0xFF 0x00 ..}
+    pand        xmm6, xmm7              ; xmm6=Y(02468ACE)=YE
+    psrlw       xmm7, BYTE_BIT          ; xmm7=Y(13579BDF)=YO
+
+    movdqa      xmm1, xmm0              ; xmm1=xmm0=(R-Y)(L/H)
+    movdqa      xmm3, xmm2              ; xmm3=xmm2=(G-Y)(L/H)
+    movdqa      xmm5, xmm4              ; xmm5=xmm4=(B-Y)(L/H)
+
+    paddw       xmm0, xmm6              ; xmm0=((R-Y)+YE)=RE=R(02468ACE)
+    paddw       xmm1, xmm7              ; xmm1=((R-Y)+YO)=RO=R(13579BDF)
+    packuswb    xmm0, xmm0              ; xmm0=R(02468ACE********)
+    packuswb    xmm1, xmm1              ; xmm1=R(13579BDF********)
+
+    paddw       xmm2, xmm6              ; xmm2=((G-Y)+YE)=GE=G(02468ACE)
+    paddw       xmm3, xmm7              ; xmm3=((G-Y)+YO)=GO=G(13579BDF)
+    packuswb    xmm2, xmm2              ; xmm2=G(02468ACE********)
+    packuswb    xmm3, xmm3              ; xmm3=G(13579BDF********)
+
+    paddw       xmm4, xmm6              ; xmm4=((B-Y)+YE)=BE=B(02468ACE)
+    paddw       xmm5, xmm7              ; xmm5=((B-Y)+YO)=BO=B(13579BDF)
+    packuswb    xmm4, xmm4              ; xmm4=B(02468ACE********)
+    packuswb    xmm5, xmm5              ; xmm5=B(13579BDF********)
+
+%if RGB_PIXELSIZE == 3  ; ---------------
+
+    ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+    ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+    ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+    ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
+
+    punpcklbw   xmmA, xmmC        ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+    punpcklbw   xmmE, xmmB        ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
+    punpcklbw   xmmD, xmmF        ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
+
+    movdqa      xmmG, xmmA
+    movdqa      xmmH, xmmA
+    punpcklwd   xmmA, xmmE        ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
+    punpckhwd   xmmG, xmmE        ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
+
+    psrldq      xmmH, 2           ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
+    psrldq      xmmE, 2           ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
+
+    movdqa      xmmC, xmmD
+    movdqa      xmmB, xmmD
+    punpcklwd   xmmD, xmmH        ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
+    punpckhwd   xmmC, xmmH        ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
+
+    psrldq      xmmB, 2           ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
+
+    movdqa      xmmF, xmmE
+    punpcklwd   xmmE, xmmB        ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
+    punpckhwd   xmmF, xmmB        ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
+
+    pshufd      xmmH, xmmA, 0x4E  ; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
+    movdqa      xmmB, xmmE
+    punpckldq   xmmA, xmmD        ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
+    punpckldq   xmmE, xmmH        ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
+    punpckhdq   xmmD, xmmB        ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
+
+    pshufd      xmmH, xmmG, 0x4E  ; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
+    movdqa      xmmB, xmmF
+    punpckldq   xmmG, xmmC        ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
+    punpckldq   xmmF, xmmH        ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
+    punpckhdq   xmmC, xmmB        ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
+
+    punpcklqdq  xmmA, xmmE        ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
+    punpcklqdq  xmmD, xmmG        ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+    punpcklqdq  xmmF, xmmC        ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
+
+    cmp         ecx, byte SIZEOF_XMMWORD
+    jb          short .column_st32
+
+    test        edi, SIZEOF_XMMWORD-1
+    jnz         short .out1
+    ; --(aligned)-------------------
+    movntdq     XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+    movntdq     XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+    movntdq     XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
+    jmp         short .out0
+.out1:  ; --(unaligned)-----------------
+    movdqu      XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+    movdqu      XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+    movdqu      XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
+.out0:
+    add         edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
+    sub         ecx, byte SIZEOF_XMMWORD
+    jz          near .endcolumn
+
+    add         esi, byte SIZEOF_XMMWORD  ; inptr0
+    dec         al                        ; Yctr
+    jnz         near .Yloop_2nd
+
+    add         ebx, byte SIZEOF_XMMWORD  ; inptr1
+    add         edx, byte SIZEOF_XMMWORD  ; inptr2
+    jmp         near .columnloop
+    alignx      16, 7
+
+.column_st32:
+    lea         ecx, [ecx+ecx*2]            ; imul ecx, RGB_PIXELSIZE
+    cmp         ecx, byte 2*SIZEOF_XMMWORD
+    jb          short .column_st16
+    movdqu      XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+    movdqu      XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+    add         edi, byte 2*SIZEOF_XMMWORD  ; outptr
+    movdqa      xmmA, xmmF
+    sub         ecx, byte 2*SIZEOF_XMMWORD
+    jmp         short .column_st15
+.column_st16:
+    cmp         ecx, byte SIZEOF_XMMWORD
+    jb          short .column_st15
+    movdqu      XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+    add         edi, byte SIZEOF_XMMWORD    ; outptr
+    movdqa      xmmA, xmmD
+    sub         ecx, byte SIZEOF_XMMWORD
+.column_st15:
+    ; Store the lower 8 bytes of xmmA to the output when it has enough
+    ; space.
+    cmp         ecx, byte SIZEOF_MMWORD
+    jb          short .column_st7
+    movq        XMM_MMWORD [edi], xmmA
+    add         edi, byte SIZEOF_MMWORD
+    sub         ecx, byte SIZEOF_MMWORD
+    psrldq      xmmA, SIZEOF_MMWORD
+.column_st7:
+    ; Store the lower 4 bytes of xmmA to the output when it has enough
+    ; space.
+    cmp         ecx, byte SIZEOF_DWORD
+    jb          short .column_st3
+    movd        XMM_DWORD [edi], xmmA
+    add         edi, byte SIZEOF_DWORD
+    sub         ecx, byte SIZEOF_DWORD
+    psrldq      xmmA, SIZEOF_DWORD
+.column_st3:
+    ; Store the lower 2 bytes of eax to the output when it has enough
+    ; space.
+    movd        eax, xmmA
+    cmp         ecx, byte SIZEOF_WORD
+    jb          short .column_st1
+    mov         word [edi], ax
+    add         edi, byte SIZEOF_WORD
+    sub         ecx, byte SIZEOF_WORD
+    shr         eax, 16
+.column_st1:
+    ; Store the lower 1 byte of eax to the output when it has enough
+    ; space.
+    test        ecx, ecx
+    jz          short .endcolumn
+    mov         byte [edi], al
+
+%else  ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+    pcmpeqb     xmm6, xmm6              ; xmm6=XE=X(02468ACE********)
+    pcmpeqb     xmm7, xmm7              ; xmm7=XO=X(13579BDF********)
+%else
+    pxor        xmm6, xmm6              ; xmm6=XE=X(02468ACE********)
+    pxor        xmm7, xmm7              ; xmm7=XO=X(13579BDF********)
+%endif
+    ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+    ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+    ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+    ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
+
+    punpcklbw   xmmA, xmmC  ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+    punpcklbw   xmmE, xmmG  ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
+    punpcklbw   xmmB, xmmD  ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
+    punpcklbw   xmmF, xmmH  ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
+
+    movdqa      xmmC, xmmA
+    punpcklwd   xmmA, xmmE  ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
+    punpckhwd   xmmC, xmmE  ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
+    movdqa      xmmG, xmmB
+    punpcklwd   xmmB, xmmF  ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
+    punpckhwd   xmmG, xmmF  ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
+
+    movdqa      xmmD, xmmA
+    punpckldq   xmmA, xmmB  ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
+    punpckhdq   xmmD, xmmB  ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+    movdqa      xmmH, xmmC
+    punpckldq   xmmC, xmmG  ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
+    punpckhdq   xmmH, xmmG  ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+
+    cmp         ecx, byte SIZEOF_XMMWORD
+    jb          short .column_st32
+
+    test        edi, SIZEOF_XMMWORD-1
+    jnz         short .out1
+    ; --(aligned)-------------------
+    movntdq     XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+    movntdq     XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+    movntdq     XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
+    movntdq     XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
+    jmp         short .out0
+.out1:  ; --(unaligned)-----------------
+    movdqu      XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+    movdqu      XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+    movdqu      XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
+    movdqu      XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
+.out0:
+    add         edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
+    sub         ecx, byte SIZEOF_XMMWORD
+    jz          near .endcolumn
+
+    add         esi, byte SIZEOF_XMMWORD  ; inptr0
+    dec         al                        ; Yctr
+    jnz         near .Yloop_2nd
+
+    add         ebx, byte SIZEOF_XMMWORD  ; inptr1
+    add         edx, byte SIZEOF_XMMWORD  ; inptr2
+    jmp         near .columnloop
+    alignx      16, 7
+
+.column_st32:
+    cmp         ecx, byte SIZEOF_XMMWORD/2
+    jb          short .column_st16
+    movdqu      XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+    movdqu      XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+    add         edi, byte 2*SIZEOF_XMMWORD  ; outptr
+    movdqa      xmmA, xmmC
+    movdqa      xmmD, xmmH
+    sub         ecx, byte SIZEOF_XMMWORD/2
+.column_st16:
+    cmp         ecx, byte SIZEOF_XMMWORD/4
+    jb          short .column_st15
+    movdqu      XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+    add         edi, byte SIZEOF_XMMWORD    ; outptr
+    movdqa      xmmA, xmmD
+    sub         ecx, byte SIZEOF_XMMWORD/4
+.column_st15:
+    ; Store two pixels (8 bytes) of xmmA to the output when it has enough
+    ; space.
+    cmp         ecx, byte SIZEOF_XMMWORD/8
+    jb          short .column_st7
+    movq        XMM_MMWORD [edi], xmmA
+    add         edi, byte SIZEOF_XMMWORD/8*4
+    sub         ecx, byte SIZEOF_XMMWORD/8
+    psrldq      xmmA, SIZEOF_XMMWORD/8*4
+.column_st7:
+    ; Store one pixel (4 bytes) of xmmA to the output when it has enough
+    ; space.
+    test        ecx, ecx
+    jz          short .endcolumn
+    movd        XMM_DWORD [edi], xmmA
+
+%endif  ; RGB_PIXELSIZE ; ---------------
+
+.endcolumn:
+    sfence                              ; flush the write buffer
+
+.return:
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
+;
+; GLOBAL(void)
+; jsimd_h2v2_merged_upsample_sse2(JDIMENSION output_width,
+;                                 JSAMPIMAGE input_buf,
+;                                 JDIMENSION in_row_group_ctr,
+;                                 JSAMPARRAY output_buf);
+;
+
+%define output_width(b)      (b) + 8    ; JDIMENSION output_width
+%define input_buf(b)         (b) + 12   ; JSAMPIMAGE input_buf
+%define in_row_group_ctr(b)  (b) + 16   ; JDIMENSION in_row_group_ctr
+%define output_buf(b)        (b) + 20   ; JSAMPARRAY output_buf
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v2_merged_upsample_sse2)
+
+EXTN(jsimd_h2v2_merged_upsample_sse2):
+    push        ebp
+    mov         ebp, esp
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    mov         eax, POINTER [output_width(ebp)]
+
+    mov         edi, JSAMPIMAGE [input_buf(ebp)]
+    mov         ecx, JDIMENSION [in_row_group_ctr(ebp)]
+    mov         esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
+    mov         ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
+    mov         edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
+    mov         edi, JSAMPARRAY [output_buf(ebp)]
+    lea         esi, [esi+ecx*SIZEOF_JSAMPROW]
+
+    push        edx                     ; inptr2
+    push        ebx                     ; inptr1
+    push        esi                     ; inptr00
+    mov         ebx, esp
+
+    push        edi                     ; output_buf (outptr0)
+    push        ecx                     ; in_row_group_ctr
+    push        ebx                     ; input_buf
+    push        eax                     ; output_width
+
+    call        near EXTN(jsimd_h2v1_merged_upsample_sse2)
+
+    add         esi, byte SIZEOF_JSAMPROW  ; inptr01
+    add         edi, byte SIZEOF_JSAMPROW  ; outptr1
+    mov         POINTER [ebx+0*SIZEOF_POINTER], esi
+    mov         POINTER [ebx-1*SIZEOF_POINTER], edi
+
+    call        near EXTN(jsimd_h2v1_merged_upsample_sse2)
+
+    add         esp, byte 7*SIZEOF_DWORD
+
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/i386/jdsample-avx2.asm b/media/libjpeg/simd/i386/jdsample-avx2.asm
new file mode 100644
index 0000000000..a800c35e08
--- /dev/null
+++ b/media/libjpeg/simd/i386/jdsample-avx2.asm
@@ -0,0 +1,760 @@
+;
+; jdsample.asm - upsampling (AVX2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2015, Intel Corporation.
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_fancy_upsample_avx2)
+
+EXTN(jconst_fancy_upsample_avx2):
+
+PW_ONE   times 16 dw 1
+PW_TWO   times 16 dw 2
+PW_THREE times 16 dw 3
+PW_SEVEN times 16 dw 7
+PW_EIGHT times 16 dw 8
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
+;
+; The upsampling algorithm is linear interpolation between pixel centers,
+; also known as a "triangle filter".  This is a good compromise between
+; speed and visual quality.  The centers of the output pixels are 1/4 and 3/4
+; of the way between input pixel centers.
+;
+; GLOBAL(void)
+; jsimd_h2v1_fancy_upsample_avx2(int max_v_samp_factor,
+;                                JDIMENSION downsampled_width,
+;                                JSAMPARRAY input_data,
+;                                JSAMPARRAY *output_data_ptr);
+;
+
+%define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
+%define downsamp_width(b)   (b) + 12    ; JDIMENSION downsampled_width
+%define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
+%define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v1_fancy_upsample_avx2)
+
+EXTN(jsimd_h2v1_fancy_upsample_avx2):
+    push        ebp
+    mov         ebp, esp
+    pushpic     ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+
+    mov         eax, JDIMENSION [downsamp_width(ebp)]  ; colctr
+    test        eax, eax
+    jz          near .return
+
+    mov         ecx, INT [max_v_samp(ebp)]  ; rowctr
+    test        ecx, ecx
+    jz          near .return
+
+    mov         esi, JSAMPARRAY [input_data(ebp)]    ; input_data
+    mov         edi, POINTER [output_data_ptr(ebp)]
+    mov         edi, JSAMPARRAY [edi]                ; output_data
+    alignx      16, 7
+.rowloop:
+    push        eax                     ; colctr
+    push        edi
+    push        esi
+
+    mov         esi, JSAMPROW [esi]     ; inptr
+    mov         edi, JSAMPROW [edi]     ; outptr
+
+    test        eax, SIZEOF_YMMWORD-1
+    jz          short .skip
+    mov         dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
+    mov         JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
+.skip:
+    vpxor       ymm0, ymm0, ymm0                ; ymm0=(all 0's)
+    vpcmpeqb    xmm7, xmm7, xmm7
+    vpsrldq     xmm7, xmm7, (SIZEOF_XMMWORD-1)  ; (ff -- -- -- ... -- --) LSB is ff
+    vpand       ymm7, ymm7, YMMWORD [esi+0*SIZEOF_YMMWORD]
+
+    add         eax, byte SIZEOF_YMMWORD-1
+    and         eax, byte -SIZEOF_YMMWORD
+    cmp         eax, byte SIZEOF_YMMWORD
+    ja          short .columnloop
+    alignx      16, 7
+
+.columnloop_last:
+    vpcmpeqb    xmm6, xmm6, xmm6
+    vpslldq     xmm6, xmm6, (SIZEOF_XMMWORD-1)
+    vperm2i128  ymm6, ymm6, ymm6, 1             ; (---- ---- ... ---- ---- ff) MSB is ff
+    vpand       ymm6, ymm6, YMMWORD [esi+0*SIZEOF_YMMWORD]
+    jmp         short .upsample
+    alignx      16, 7
+
+.columnloop:
+    vmovdqu     ymm6, YMMWORD [esi+1*SIZEOF_YMMWORD]
+    vperm2i128  ymm6, ymm0, ymm6, 0x20
+    vpslldq     ymm6, ymm6, 15
+
+.upsample:
+    vmovdqu     ymm1, YMMWORD [esi+0*SIZEOF_YMMWORD]  ; ymm1=( 0  1  2 ... 29 30 31)
+
+    vperm2i128  ymm2, ymm0, ymm1, 0x20
+    vpalignr    ymm2, ymm1, ymm2, 15            ; ymm2=(--  0  1 ... 28 29 30)
+    vperm2i128  ymm4, ymm0, ymm1, 0x03
+    vpalignr    ymm3, ymm4, ymm1, 1             ; ymm3=( 1  2  3 ... 30 31 --)
+
+    vpor        ymm2, ymm2, ymm7                ; ymm2=(-1  0  1 ... 28 29 30)
+    vpor        ymm3, ymm3, ymm6                ; ymm3=( 1  2  3 ... 30 31 32)
+
+    vpsrldq     ymm7, ymm4, (SIZEOF_XMMWORD-1)  ; ymm7=(31 -- -- ... -- -- --)
+
+    vpunpckhbw  ymm4, ymm1, ymm0                ; ymm4=( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
+    vpunpcklbw  ymm5, ymm1, ymm0                ; ymm5=( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
+    vperm2i128  ymm1, ymm5, ymm4, 0x20          ; ymm1=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
+    vperm2i128  ymm4, ymm5, ymm4, 0x31          ; ymm4=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+    vpunpckhbw  ymm5, ymm2, ymm0                ; ymm5=( 7  8  9 10 11 12 13 14 23 24 25 26 27 28 29 30)
+    vpunpcklbw  ymm6, ymm2, ymm0                ; ymm6=(-1  0  1  2  3  4  5  6 15 16 17 18 19 20 21 22)
+    vperm2i128  ymm2, ymm6, ymm5, 0x20          ; ymm2=(-1  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14)
+    vperm2i128  ymm5, ymm6, ymm5, 0x31          ; ymm5=(15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
+
+    vpunpckhbw  ymm6, ymm3, ymm0                ; ymm6=( 1  2  3  4  5  6  7  8 17 18 19 20 21 22 23 24)
+    vpunpcklbw  ymm0, ymm3, ymm0                ; ymm0=( 9 10 11 12 13 14 15 16 25 26 27 28 29 30 31 32)
+    vperm2i128  ymm3, ymm0, ymm6, 0x20          ; ymm3=( 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16)
+    vperm2i128  ymm6, ymm0, ymm6, 0x31          ; ymm6=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32)
+
+    vpxor       ymm0, ymm0, ymm0                ; ymm0=(all 0's)
+
+    vpmullw     ymm1, ymm1, [GOTOFF(ebx,PW_THREE)]
+    vpmullw     ymm4, ymm4, [GOTOFF(ebx,PW_THREE)]
+    vpaddw      ymm2, ymm2, [GOTOFF(ebx,PW_ONE)]
+    vpaddw      ymm5, ymm5, [GOTOFF(ebx,PW_ONE)]
+    vpaddw      ymm3, ymm3, [GOTOFF(ebx,PW_TWO)]
+    vpaddw      ymm6, ymm6, [GOTOFF(ebx,PW_TWO)]
+
+    vpaddw      ymm2, ymm2, ymm1
+    vpaddw      ymm5, ymm5, ymm4
+    vpsrlw      ymm2, ymm2, 2                   ; ymm2=OutLE=( 0  2  4  6  8 10 12 14 16 18 20 22 24 26 28 30)
+    vpsrlw      ymm5, ymm5, 2                   ; ymm5=OutHE=(32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62)
+    vpaddw      ymm3, ymm3, ymm1
+    vpaddw      ymm6, ymm6, ymm4
+    vpsrlw      ymm3, ymm3, 2                   ; ymm3=OutLO=( 1  3  5  7  9 11 13 15 17 19 21 23 25 27 29 31)
+    vpsrlw      ymm6, ymm6, 2                   ; ymm6=OutHO=(33 35 37 39 41 43 45 47 49 51 53 55 57 59 61 63)
+
+    vpsllw      ymm3, ymm3, BYTE_BIT
+    vpsllw      ymm6, ymm6, BYTE_BIT
+    vpor        ymm2, ymm2, ymm3                ; ymm2=OutL=( 0  1  2 ... 29 30 31)
+    vpor        ymm5, ymm5, ymm6                ; ymm5=OutH=(32 33 34 ... 61 62 63)
+
+    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymm2
+    vmovdqu     YMMWORD [edi+1*SIZEOF_YMMWORD], ymm5
+
+    sub         eax, byte SIZEOF_YMMWORD
+    add         esi, byte 1*SIZEOF_YMMWORD  ; inptr
+    add         edi, byte 2*SIZEOF_YMMWORD  ; outptr
+    cmp         eax, byte SIZEOF_YMMWORD
+    ja          near .columnloop
+    test        eax, eax
+    jnz         near .columnloop_last
+
+    pop         esi
+    pop         edi
+    pop         eax
+
+    add         esi, byte SIZEOF_JSAMPROW  ; input_data
+    add         edi, byte SIZEOF_JSAMPROW  ; output_data
+    dec         ecx                        ; rowctr
+    jg          near .rowloop
+
+.return:
+    vzeroupper
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    poppic      ebx
+    pop         ebp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
+; Again a triangle filter; see comments for h2v1 case, above.
+;
+; GLOBAL(void)
+; jsimd_h2v2_fancy_upsample_avx2(int max_v_samp_factor,
+;                                JDIMENSION downsampled_width,
+;                                JSAMPARRAY input_data,
+;                                JSAMPARRAY *output_data_ptr);
+;
+
+%define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
+%define downsamp_width(b)   (b) + 12    ; JDIMENSION downsampled_width
+%define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
+%define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr
+
+%define original_ebp  ebp + 0
+%define wk(i)         ebp - (WK_NUM - (i)) * SIZEOF_YMMWORD
+                                        ; ymmword wk[WK_NUM]
+%define WK_NUM        4
+%define gotptr        wk(0) - SIZEOF_POINTER  ; void *gotptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v2_fancy_upsample_avx2)
+
+EXTN(jsimd_h2v2_fancy_upsample_avx2):
+    push        ebp
+    mov         eax, esp                     ; eax = original ebp
+    sub         esp, byte 4
+    and         esp, byte (-SIZEOF_YMMWORD)  ; align to 256 bits
+    mov         [esp], eax
+    mov         ebp, esp                     ; ebp = aligned ebp
+    lea         esp, [wk(0)]
+    pushpic     eax                     ; make a room for GOT address
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+    movpic      POINTER [gotptr], ebx   ; save GOT address
+
+    mov         edx, eax                ; edx = original ebp
+    mov         eax, JDIMENSION [downsamp_width(edx)]  ; colctr
+    test        eax, eax
+    jz          near .return
+
+    mov         ecx, INT [max_v_samp(edx)]  ; rowctr
+    test        ecx, ecx
+    jz          near .return
+
+    mov         esi, JSAMPARRAY [input_data(edx)]    ; input_data
+    mov         edi, POINTER [output_data_ptr(edx)]
+    mov         edi, JSAMPARRAY [edi]                ; output_data
+    alignx      16, 7
+.rowloop:
+    push        eax                     ; colctr
+    push        ecx
+    push        edi
+    push        esi
+
+    mov         ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW]  ; inptr1(above)
+    mov         ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; inptr0
+    mov         esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; inptr1(below)
+    mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]  ; outptr0
+    mov         edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]  ; outptr1
+
+    test        eax, SIZEOF_YMMWORD-1
+    jz          short .skip
+    push        edx
+    mov         dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE]
+    mov         JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl
+    mov         dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE]
+    mov         JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl
+    mov         dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
+    mov         JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
+    pop         edx
+.skip:
+    ; -- process the first column block
+
+    vmovdqu     ymm0, YMMWORD [ebx+0*SIZEOF_YMMWORD]  ; ymm0=row[ 0][0]
+    vmovdqu     ymm1, YMMWORD [ecx+0*SIZEOF_YMMWORD]  ; ymm1=row[-1][0]
+    vmovdqu     ymm2, YMMWORD [esi+0*SIZEOF_YMMWORD]  ; ymm2=row[+1][0]
+
+    pushpic     ebx
+    movpic      ebx, POINTER [gotptr]   ; load GOT address
+
+    vpxor       ymm3, ymm3, ymm3        ; ymm3=(all 0's)
+
+    vpunpckhbw  ymm4, ymm0, ymm3        ; ymm4=row[ 0]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
+    vpunpcklbw  ymm5, ymm0, ymm3        ; ymm5=row[ 0]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
+    vperm2i128  ymm0, ymm5, ymm4, 0x20  ; ymm0=row[ 0]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
+    vperm2i128  ymm4, ymm5, ymm4, 0x31  ; ymm4=row[ 0](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+    vpunpckhbw  ymm5, ymm1, ymm3        ; ymm5=row[-1]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
+    vpunpcklbw  ymm6, ymm1, ymm3        ; ymm6=row[-1]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
+    vperm2i128  ymm1, ymm6, ymm5, 0x20  ; ymm1=row[-1]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
+    vperm2i128  ymm5, ymm6, ymm5, 0x31  ; ymm5=row[-1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+    vpunpckhbw  ymm6, ymm2, ymm3        ; ymm6=row[+1]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
+    vpunpcklbw  ymm3, ymm2, ymm3        ; ymm3=row[+1]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
+    vperm2i128  ymm2, ymm3, ymm6, 0x20  ; ymm2=row[+1]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
+    vperm2i128  ymm6, ymm3, ymm6, 0x31  ; ymm6=row[+1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+    vpmullw     ymm0, ymm0, [GOTOFF(ebx,PW_THREE)]
+    vpmullw     ymm4, ymm4, [GOTOFF(ebx,PW_THREE)]
+
+    vpcmpeqb    xmm7, xmm7, xmm7
+    vpsrldq     xmm7, xmm7, (SIZEOF_XMMWORD-2)  ; (ffff ---- ---- ... ---- ----) LSB is ffff
+
+    vpaddw      ymm1, ymm1, ymm0        ; ymm1=Int0L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
+    vpaddw      ymm5, ymm5, ymm4        ; ymm5=Int0H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+    vpaddw      ymm2, ymm2, ymm0        ; ymm2=Int1L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
+    vpaddw      ymm6, ymm6, ymm4        ; ymm6=Int1H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+    vmovdqu     YMMWORD [edx+0*SIZEOF_YMMWORD], ymm1  ; temporarily save
+    vmovdqu     YMMWORD [edx+1*SIZEOF_YMMWORD], ymm5  ; the intermediate data
+    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymm2
+    vmovdqu     YMMWORD [edi+1*SIZEOF_YMMWORD], ymm6
+
+    vpand       ymm1, ymm1, ymm7        ; ymm1=( 0 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
+    vpand       ymm2, ymm2, ymm7        ; ymm2=( 0 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
+
+    vmovdqa     YMMWORD [wk(0)], ymm1
+    vmovdqa     YMMWORD [wk(1)], ymm2
+
+    poppic      ebx
+
+    add         eax, byte SIZEOF_YMMWORD-1
+    and         eax, byte -SIZEOF_YMMWORD
+    cmp         eax, byte SIZEOF_YMMWORD
+    ja          short .columnloop
+    alignx      16, 7
+
+.columnloop_last:
+    ; -- process the last column block
+
+    pushpic     ebx
+    movpic      ebx, POINTER [gotptr]   ; load GOT address
+
+    vpcmpeqb    xmm1, xmm1, xmm1
+    vpslldq     xmm1, xmm1, (SIZEOF_XMMWORD-2)
+    vperm2i128  ymm1, ymm1, ymm1, 1             ; (---- ---- ... ---- ---- ffff) MSB is ffff
+
+    vpand       ymm2, ymm1, YMMWORD [edi+1*SIZEOF_YMMWORD]
+    vpand       ymm1, ymm1, YMMWORD [edx+1*SIZEOF_YMMWORD]
+
+    vmovdqa     YMMWORD [wk(2)], ymm1          ; ymm1=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 31)
+    vmovdqa     YMMWORD [wk(3)], ymm2          ; ymm2=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 31)
+
+    jmp         near .upsample
+    alignx      16, 7
+
+.columnloop:
+    ; -- process the next column block
+
+    vmovdqu     ymm0, YMMWORD [ebx+1*SIZEOF_YMMWORD]  ; ymm0=row[ 0][1]
+    vmovdqu     ymm1, YMMWORD [ecx+1*SIZEOF_YMMWORD]  ; ymm1=row[-1][1]
+    vmovdqu     ymm2, YMMWORD [esi+1*SIZEOF_YMMWORD]  ; ymm2=row[+1][1]
+
+    pushpic     ebx
+    movpic      ebx, POINTER [gotptr]   ; load GOT address
+
+    vpxor       ymm3, ymm3, ymm3        ; ymm3=(all 0's)
+
+    vpunpckhbw  ymm4, ymm0, ymm3        ; ymm4=row[ 0]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
+    vpunpcklbw  ymm5, ymm0, ymm3        ; ymm5=row[ 0]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
+    vperm2i128  ymm0, ymm5, ymm4, 0x20  ; ymm0=row[ 0]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
+    vperm2i128  ymm4, ymm5, ymm4, 0x31  ; ymm4=row[ 0](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+    vpunpckhbw  ymm5, ymm1, ymm3        ; ymm5=row[-1]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
+    vpunpcklbw  ymm6, ymm1, ymm3        ; ymm6=row[-1]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
+    vperm2i128  ymm1, ymm6, ymm5, 0x20  ; ymm1=row[-1]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
+    vperm2i128  ymm5, ymm6, ymm5, 0x31  ; ymm5=row[-1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+    vpunpckhbw  ymm6, ymm2, ymm3        ; ymm6=row[+1]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
+    vpunpcklbw  ymm7, ymm2, ymm3        ; ymm7=row[+1]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
+    vperm2i128  ymm2, ymm7, ymm6, 0x20  ; ymm2=row[+1]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
+    vperm2i128  ymm6, ymm7, ymm6, 0x31  ; ymm6=row[+1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+    vpmullw     ymm0, ymm0, [GOTOFF(ebx,PW_THREE)]
+    vpmullw     ymm4, ymm4, [GOTOFF(ebx,PW_THREE)]
+
+    vpaddw      ymm1, ymm1, ymm0        ; ymm1=Int0L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
+    vpaddw      ymm5, ymm5, ymm4        ; ymm5=Int0H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+    vpaddw      ymm2, ymm2, ymm0        ; ymm2=Int1L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
+    vpaddw      ymm6, ymm6, ymm4        ; ymm6=Int1H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+    vmovdqu     YMMWORD [edx+2*SIZEOF_YMMWORD], ymm1  ; temporarily save
+    vmovdqu     YMMWORD [edx+3*SIZEOF_YMMWORD], ymm5  ; the intermediate data
+    vmovdqu     YMMWORD [edi+2*SIZEOF_YMMWORD], ymm2
+    vmovdqu     YMMWORD [edi+3*SIZEOF_YMMWORD], ymm6
+
+    vperm2i128  ymm1, ymm3, ymm1, 0x20
+    vpslldq     ymm1, ymm1, 14          ; ymm1=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- --  0)
+    vperm2i128  ymm2, ymm3, ymm2, 0x20
+    vpslldq     ymm2, ymm2, 14          ; ymm2=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- --  0)
+
+    vmovdqa     YMMWORD [wk(2)], ymm1
+    vmovdqa     YMMWORD [wk(3)], ymm2
+
+.upsample:
+    ; -- process the upper row
+
+    vmovdqu     ymm7, YMMWORD [edx+0*SIZEOF_YMMWORD]  ; ymm7=Int0L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
+    vmovdqu     ymm3, YMMWORD [edx+1*SIZEOF_YMMWORD]  ; ymm3=Int0H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+    vpxor       ymm1, ymm1, ymm1        ; ymm1=(all 0's)
+
+    vperm2i128  ymm0, ymm1, ymm7, 0x03
+    vpalignr    ymm0, ymm0, ymm7, 2     ; ymm0=( 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 --)
+    vperm2i128  ymm4, ymm1, ymm3, 0x20
+    vpslldq     ymm4, ymm4, 14          ; ymm4=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 16)
+
+    vperm2i128  ymm5, ymm1, ymm7, 0x03
+    vpsrldq     ymm5, ymm5, 14          ; ymm5=(15 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
+    vperm2i128  ymm6, ymm1, ymm3, 0x20
+    vpalignr    ymm6, ymm3, ymm6, 14    ; ymm6=(-- 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
+
+    vpor        ymm0, ymm0, ymm4        ; ymm0=( 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16)
+    vpor        ymm5, ymm5, ymm6        ; ymm5=(15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
+
+    vperm2i128  ymm2, ymm1, ymm3, 0x03
+    vpalignr    ymm2, ymm2, ymm3, 2     ; ymm2=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 --)
+    vperm2i128  ymm4, ymm1, ymm3, 0x03
+    vpsrldq     ymm4, ymm4, 14          ; ymm4=(31 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
+    vperm2i128  ymm1, ymm1, ymm7, 0x20
+    vpalignr    ymm1, ymm7, ymm1, 14    ; ymm1=(--  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14)
+
+    vpor        ymm1, ymm1, YMMWORD [wk(0)]  ; ymm1=(-1  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14)
+    vpor        ymm2, ymm2, YMMWORD [wk(2)]  ; ymm2=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32)
+
+    vmovdqa     YMMWORD [wk(0)], ymm4
+
+    vpmullw     ymm7, ymm7, [GOTOFF(ebx,PW_THREE)]
+    vpmullw     ymm3, ymm3, [GOTOFF(ebx,PW_THREE)]
+    vpaddw      ymm1, ymm1, [GOTOFF(ebx,PW_EIGHT)]
+    vpaddw      ymm5, ymm5, [GOTOFF(ebx,PW_EIGHT)]
+    vpaddw      ymm0, ymm0, [GOTOFF(ebx,PW_SEVEN)]
+    vpaddw      ymm2, [GOTOFF(ebx,PW_SEVEN)]
+
+    vpaddw      ymm1, ymm1, ymm7
+    vpaddw      ymm5, ymm5, ymm3
+    vpsrlw      ymm1, ymm1, 4           ; ymm1=Out0LE=( 0  2  4  6  8 10 12 14 16 18 20 22 24 26 28 30)
+    vpsrlw      ymm5, ymm5, 4           ; ymm5=Out0HE=(32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62)
+    vpaddw      ymm0, ymm0, ymm7
+    vpaddw      ymm2, ymm2, ymm3
+    vpsrlw      ymm0, ymm0, 4           ; ymm0=Out0LO=( 1  3  5  7  9 11 13 15 17 19 21 23 25 27 29 31)
+    vpsrlw      ymm2, ymm2, 4           ; ymm2=Out0HO=(33 35 37 39 41 43 45 47 49 51 53 55 57 59 61 63)
+
+    vpsllw      ymm0, ymm0, BYTE_BIT
+    vpsllw      ymm2, ymm2, BYTE_BIT
+    vpor        ymm1, ymm1, ymm0        ; ymm1=Out0L=( 0  1  2 ... 29 30 31)
+    vpor        ymm5, ymm5, ymm2        ; ymm5=Out0H=(32 33 34 ... 61 62 63)
+
+    vmovdqu     YMMWORD [edx+0*SIZEOF_YMMWORD], ymm1
+    vmovdqu     YMMWORD [edx+1*SIZEOF_YMMWORD], ymm5
+
+    ; -- process the lower row
+
+    vmovdqu     ymm6, YMMWORD [edi+0*SIZEOF_YMMWORD]  ; ymm6=Int1L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
+    vmovdqu     ymm4, YMMWORD [edi+1*SIZEOF_YMMWORD]  ; ymm4=Int1H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+    vpxor       ymm1, ymm1, ymm1        ; ymm1=(all 0's)
+
+    vperm2i128  ymm7, ymm1, ymm6, 0x03
+    vpalignr    ymm7, ymm7, ymm6, 2     ; ymm7=( 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 --)
+    vperm2i128  ymm3, ymm1, ymm4, 0x20
+    vpslldq     ymm3, ymm3, 14          ; ymm3=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 16)
+
+    vperm2i128  ymm0, ymm1, ymm6, 0x03
+    vpsrldq     ymm0, ymm0, 14          ; ymm0=(15 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
+    vperm2i128  ymm2, ymm1, ymm4, 0x20
+    vpalignr    ymm2, ymm4, ymm2, 14    ; ymm2=(-- 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
+
+    vpor        ymm7, ymm7, ymm3        ; ymm7=( 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16)
+    vpor        ymm0, ymm0, ymm2        ; ymm0=(15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
+
+    vperm2i128  ymm5, ymm1, ymm4, 0x03
+    vpalignr    ymm5, ymm5, ymm4, 2     ; ymm5=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 --)
+    vperm2i128  ymm3, ymm1, ymm4, 0x03
+    vpsrldq     ymm3, ymm3, 14          ; ymm3=(31 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
+    vperm2i128  ymm1, ymm1, ymm6, 0x20
+    vpalignr    ymm1, ymm6, ymm1, 14    ; ymm1=(--  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14)
+
+    vpor        ymm1, ymm1, YMMWORD [wk(1)]  ; ymm1=(-1  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14)
+    vpor        ymm5, ymm5, YMMWORD [wk(3)]  ; ymm5=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32)
+
+    vmovdqa     YMMWORD [wk(1)], ymm3
+
+    vpmullw     ymm6, ymm6, [GOTOFF(ebx,PW_THREE)]
+    vpmullw     ymm4, ymm4, [GOTOFF(ebx,PW_THREE)]
+    vpaddw      ymm1, ymm1, [GOTOFF(ebx,PW_EIGHT)]
+    vpaddw      ymm0, ymm0, [GOTOFF(ebx,PW_EIGHT)]
+    vpaddw      ymm7, ymm7, [GOTOFF(ebx,PW_SEVEN)]
+    vpaddw      ymm5, ymm5, [GOTOFF(ebx,PW_SEVEN)]
+
+    vpaddw      ymm1, ymm1, ymm6
+    vpaddw      ymm0, ymm0, ymm4
+    vpsrlw      ymm1, ymm1, 4           ; ymm1=Out1LE=( 0  2  4  6  8 10 12 14 16 18 20 22 24 26 28 30)
+    vpsrlw      ymm0, ymm0, 4           ; ymm0=Out1HE=(32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62)
+    vpaddw      ymm7, ymm7, ymm6
+    vpaddw      ymm5, ymm5, ymm4
+    vpsrlw      ymm7, ymm7, 4           ; ymm7=Out1LO=( 1  3  5  7  9 11 13 15 17 19 21 23 25 27 29 31)
+    vpsrlw      ymm5, ymm5, 4           ; ymm5=Out1HO=(33 35 37 39 41 43 45 47 49 51 53 55 57 59 61 63)
+
+    vpsllw      ymm7, ymm7, BYTE_BIT
+    vpsllw      ymm5, ymm5, BYTE_BIT
+    vpor        ymm1, ymm1, ymm7        ; ymm1=Out1L=( 0  1  2 ... 29 30 31)
+    vpor        ymm0, ymm0, ymm5        ; ymm0=Out1H=(32 33 34 ... 61 62 63)
+
+    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymm1
+    vmovdqu     YMMWORD [edi+1*SIZEOF_YMMWORD], ymm0
+
+    poppic      ebx
+
+    sub         eax, byte SIZEOF_YMMWORD
+    add         ecx, byte 1*SIZEOF_YMMWORD  ; inptr1(above)
+    add         ebx, byte 1*SIZEOF_YMMWORD  ; inptr0
+    add         esi, byte 1*SIZEOF_YMMWORD  ; inptr1(below)
+    add         edx, byte 2*SIZEOF_YMMWORD  ; outptr0
+    add         edi, byte 2*SIZEOF_YMMWORD  ; outptr1
+    cmp         eax, byte SIZEOF_YMMWORD
+    ja          near .columnloop
+    test        eax, eax
+    jnz         near .columnloop_last
+
+    pop         esi
+    pop         edi
+    pop         ecx
+    pop         eax
+
+    add         esi, byte 1*SIZEOF_JSAMPROW  ; input_data
+    add         edi, byte 2*SIZEOF_JSAMPROW  ; output_data
+    sub         ecx, byte 2                  ; rowctr
+    jg          near .rowloop
+
+.return:
+    vzeroupper
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
+; It's still a box filter.
+;
+; GLOBAL(void)
+; jsimd_h2v1_upsample_avx2(int max_v_samp_factor, JDIMENSION output_width,
+;                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+;
+
+%define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
+%define output_width(b)     (b) + 12    ; JDIMENSION output_width
+%define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
+%define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v1_upsample_avx2)
+
+EXTN(jsimd_h2v1_upsample_avx2):
+    push        ebp
+    mov         ebp, esp
+;   push        ebx                     ; unused
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    mov         edx, JDIMENSION [output_width(ebp)]
+    add         edx, byte (SIZEOF_YMMWORD-1)
+    and         edx, -SIZEOF_YMMWORD
+    jz          short .return
+
+    mov         ecx, INT [max_v_samp(ebp)]  ; rowctr
+    test        ecx, ecx
+    jz          short .return
+
+    mov         esi, JSAMPARRAY [input_data(ebp)]    ; input_data
+    mov         edi, POINTER [output_data_ptr(ebp)]
+    mov         edi, JSAMPARRAY [edi]                ; output_data
+    alignx      16, 7
+.rowloop:
+    push        edi
+    push        esi
+
+    mov         esi, JSAMPROW [esi]     ; inptr
+    mov         edi, JSAMPROW [edi]     ; outptr
+    mov         eax, edx                ; colctr
+    alignx      16, 7
+.columnloop:
+
+    cmp         eax, byte SIZEOF_YMMWORD
+    ja          near .above_16
+
+    vmovdqu     xmm0, XMMWORD [esi+0*SIZEOF_YMMWORD]
+    vpunpckhbw  xmm1, xmm0, xmm0
+    vpunpcklbw  xmm0, xmm0, xmm0
+
+    vmovdqu     XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
+    vmovdqu     XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1
+
+    jmp         short .nextrow
+
+.above_16:
+    vmovdqu     ymm0, YMMWORD [esi+0*SIZEOF_YMMWORD]
+
+    vpermq      ymm0, ymm0, 0xd8
+    vpunpckhbw  ymm1, ymm0, ymm0
+    vpunpcklbw  ymm0, ymm0, ymm0
+
+    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymm0
+    vmovdqu     YMMWORD [edi+1*SIZEOF_YMMWORD], ymm1
+
+    sub         eax, byte 2*SIZEOF_YMMWORD
+    jz          short .nextrow
+
+    add         esi, byte SIZEOF_YMMWORD    ; inptr
+    add         edi, byte 2*SIZEOF_YMMWORD  ; outptr
+    jmp         short .columnloop
+    alignx      16, 7
+
+.nextrow:
+    pop         esi
+    pop         edi
+
+    add         esi, byte SIZEOF_JSAMPROW  ; input_data
+    add         edi, byte SIZEOF_JSAMPROW  ; output_data
+    dec         ecx                        ; rowctr
+    jg          short .rowloop
+
+.return:
+    vzeroupper
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+;   pop         ebx                     ; unused
+    pop         ebp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
+; It's still a box filter.
+;
+; GLOBAL(void)
+; jsimd_h2v2_upsample_avx2(int max_v_samp_factor, JDIMENSION output_width,
+;                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+;
+
+%define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
+%define output_width(b)     (b) + 12    ; JDIMENSION output_width
+%define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
+%define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v2_upsample_avx2)
+
+EXTN(jsimd_h2v2_upsample_avx2):
+    push        ebp
+    mov         ebp, esp
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    mov         edx, JDIMENSION [output_width(ebp)]
+    add         edx, byte (SIZEOF_YMMWORD-1)
+    and         edx, -SIZEOF_YMMWORD
+    jz          near .return
+
+    mov         ecx, INT [max_v_samp(ebp)]  ; rowctr
+    test        ecx, ecx
+    jz          near .return
+
+    mov         esi, JSAMPARRAY [input_data(ebp)]    ; input_data
+    mov         edi, POINTER [output_data_ptr(ebp)]
+    mov         edi, JSAMPARRAY [edi]                ; output_data
+    alignx      16, 7
+.rowloop:
+    push        edi
+    push        esi
+
+    mov         esi, JSAMPROW [esi]                    ; inptr
+    mov         ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]  ; outptr0
+    mov         edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]  ; outptr1
+    mov         eax, edx                               ; colctr
+    alignx      16, 7
+.columnloop:
+
+    cmp         eax, byte SIZEOF_YMMWORD
+    ja          short .above_16
+
+    vmovdqu     xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
+    vpunpckhbw  xmm1, xmm0, xmm0
+    vpunpcklbw  xmm0, xmm0, xmm0
+
+    vmovdqu     XMMWORD [ebx+0*SIZEOF_XMMWORD], xmm0
+    vmovdqu     XMMWORD [ebx+1*SIZEOF_XMMWORD], xmm1
+    vmovdqu     XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
+    vmovdqu     XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1
+
+    jmp         near .nextrow
+
+.above_16:
+    vmovdqu     ymm0, YMMWORD [esi+0*SIZEOF_YMMWORD]
+
+    vpermq      ymm0, ymm0, 0xd8
+    vpunpckhbw  ymm1, ymm0, ymm0
+    vpunpcklbw  ymm0, ymm0, ymm0
+
+    vmovdqu     YMMWORD [ebx+0*SIZEOF_YMMWORD], ymm0
+    vmovdqu     YMMWORD [ebx+1*SIZEOF_YMMWORD], ymm1
+    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymm0
+    vmovdqu     YMMWORD [edi+1*SIZEOF_YMMWORD], ymm1
+
+    sub         eax, byte 2*SIZEOF_YMMWORD
+    jz          short .nextrow
+
+    add         esi, byte SIZEOF_YMMWORD  ; inptr
+    add         ebx, 2*SIZEOF_YMMWORD     ; outptr0
+    add         edi, 2*SIZEOF_YMMWORD     ; outptr1
+    jmp         short .columnloop
+    alignx      16, 7
+
+.nextrow:
+    pop         esi
+    pop         edi
+
+    add         esi, byte 1*SIZEOF_JSAMPROW  ; input_data
+    add         edi, byte 2*SIZEOF_JSAMPROW  ; output_data
+    sub         ecx, byte 2                  ; rowctr
+    jg          near .rowloop
+
+.return:
+    vzeroupper
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/i386/jdsample-mmx.asm b/media/libjpeg/simd/i386/jdsample-mmx.asm
new file mode 100644
index 0000000000..12c49f0eab
--- /dev/null
+++ b/media/libjpeg/simd/i386/jdsample-mmx.asm
@@ -0,0 +1,731 @@
+;
+; jdsample.asm - upsampling (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_fancy_upsample_mmx)
+
+EXTN(jconst_fancy_upsample_mmx):
+
+PW_ONE   times 4 dw 1
+PW_TWO   times 4 dw 2
+PW_THREE times 4 dw 3
+PW_SEVEN times 4 dw 7
+PW_EIGHT times 4 dw 8
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
+;
+; The upsampling algorithm is linear interpolation between pixel centers,
+; also known as a "triangle filter".  This is a good compromise between
+; speed and visual quality.  The centers of the output pixels are 1/4 and 3/4
+; of the way between input pixel centers.
+;
+; GLOBAL(void)
+; jsimd_h2v1_fancy_upsample_mmx(int max_v_samp_factor,
+;                               JDIMENSION downsampled_width,
+;                               JSAMPARRAY input_data,
+;                               JSAMPARRAY *output_data_ptr);
+;
+
+%define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
+%define downsamp_width(b)   (b) + 12    ; JDIMENSION downsampled_width
+%define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
+%define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v1_fancy_upsample_mmx)
+
+EXTN(jsimd_h2v1_fancy_upsample_mmx):
+    push        ebp
+    mov         ebp, esp
+    pushpic     ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+
+    mov         eax, JDIMENSION [downsamp_width(ebp)]  ; colctr
+    test        eax, eax
+    jz          near .return
+
+    mov         ecx, INT [max_v_samp(ebp)]  ; rowctr
+    test        ecx, ecx
+    jz          near .return
+
+    mov         esi, JSAMPARRAY [input_data(ebp)]    ; input_data
+    mov         edi, POINTER [output_data_ptr(ebp)]
+    mov         edi, JSAMPARRAY [edi]                ; output_data
+    alignx      16, 7
+.rowloop:
+    push        eax                     ; colctr
+    push        edi
+    push        esi
+
+    mov         esi, JSAMPROW [esi]     ; inptr
+    mov         edi, JSAMPROW [edi]     ; outptr
+
+    test        eax, SIZEOF_MMWORD-1
+    jz          short .skip
+    mov         dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
+    mov         JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
+.skip:
+    pxor        mm0, mm0                ; mm0=(all 0's)
+    pcmpeqb     mm7, mm7
+    psrlq       mm7, (SIZEOF_MMWORD-1)*BYTE_BIT
+    pand        mm7,  MMWORD [esi+0*SIZEOF_MMWORD]
+
+    add         eax, byte SIZEOF_MMWORD-1
+    and         eax, byte -SIZEOF_MMWORD
+    cmp         eax, byte SIZEOF_MMWORD
+    ja          short .columnloop
+    alignx      16, 7
+
+.columnloop_last:
+    pcmpeqb     mm6, mm6
+    psllq       mm6, (SIZEOF_MMWORD-1)*BYTE_BIT
+    pand        mm6, MMWORD [esi+0*SIZEOF_MMWORD]
+    jmp         short .upsample
+    alignx      16, 7
+
+.columnloop:
+    movq        mm6, MMWORD [esi+1*SIZEOF_MMWORD]
+    psllq       mm6, (SIZEOF_MMWORD-1)*BYTE_BIT
+
+.upsample:
+    movq        mm1, MMWORD [esi+0*SIZEOF_MMWORD]
+    movq        mm2, mm1
+    movq        mm3, mm1                ; mm1=( 0 1 2 3 4 5 6 7)
+    psllq       mm2, BYTE_BIT           ; mm2=( - 0 1 2 3 4 5 6)
+    psrlq       mm3, BYTE_BIT           ; mm3=( 1 2 3 4 5 6 7 -)
+
+    por         mm2, mm7                ; mm2=(-1 0 1 2 3 4 5 6)
+    por         mm3, mm6                ; mm3=( 1 2 3 4 5 6 7 8)
+
+    movq        mm7, mm1
+    psrlq       mm7, (SIZEOF_MMWORD-1)*BYTE_BIT  ; mm7=( 7 - - - - - - -)
+
+    movq        mm4, mm1
+    punpcklbw   mm1, mm0                ; mm1=( 0 1 2 3)
+    punpckhbw   mm4, mm0                ; mm4=( 4 5 6 7)
+    movq        mm5, mm2
+    punpcklbw   mm2, mm0                ; mm2=(-1 0 1 2)
+    punpckhbw   mm5, mm0                ; mm5=( 3 4 5 6)
+    movq        mm6, mm3
+    punpcklbw   mm3, mm0                ; mm3=( 1 2 3 4)
+    punpckhbw   mm6, mm0                ; mm6=( 5 6 7 8)
+
+    pmullw      mm1, [GOTOFF(ebx,PW_THREE)]
+    pmullw      mm4, [GOTOFF(ebx,PW_THREE)]
+    paddw       mm2, [GOTOFF(ebx,PW_ONE)]
+    paddw       mm5, [GOTOFF(ebx,PW_ONE)]
+    paddw       mm3, [GOTOFF(ebx,PW_TWO)]
+    paddw       mm6, [GOTOFF(ebx,PW_TWO)]
+
+    paddw       mm2, mm1
+    paddw       mm5, mm4
+    psrlw       mm2, 2                  ; mm2=OutLE=( 0  2  4  6)
+    psrlw       mm5, 2                  ; mm5=OutHE=( 8 10 12 14)
+    paddw       mm3, mm1
+    paddw       mm6, mm4
+    psrlw       mm3, 2                  ; mm3=OutLO=( 1  3  5  7)
+    psrlw       mm6, 2                  ; mm6=OutHO=( 9 11 13 15)
+
+    psllw       mm3, BYTE_BIT
+    psllw       mm6, BYTE_BIT
+    por         mm2, mm3                ; mm2=OutL=( 0  1  2  3  4  5  6  7)
+    por         mm5, mm6                ; mm5=OutH=( 8  9 10 11 12 13 14 15)
+
+    movq        MMWORD [edi+0*SIZEOF_MMWORD], mm2
+    movq        MMWORD [edi+1*SIZEOF_MMWORD], mm5
+
+    sub         eax, byte SIZEOF_MMWORD
+    add         esi, byte 1*SIZEOF_MMWORD  ; inptr
+    add         edi, byte 2*SIZEOF_MMWORD  ; outptr
+    cmp         eax, byte SIZEOF_MMWORD
+    ja          near .columnloop
+    test        eax, eax
+    jnz         near .columnloop_last
+
+    pop         esi
+    pop         edi
+    pop         eax
+
+    add         esi, byte SIZEOF_JSAMPROW  ; input_data
+    add         edi, byte SIZEOF_JSAMPROW  ; output_data
+    dec         ecx                        ; rowctr
+    jg          near .rowloop
+
+    emms                                ; empty MMX state
+
+.return:
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    poppic      ebx
+    pop         ebp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
+; Again a triangle filter; see comments for h2v1 case, above.
+;
+; GLOBAL(void)
+; jsimd_h2v2_fancy_upsample_mmx(int max_v_samp_factor,
+;                               JDIMENSION downsampled_width,
+;                               JSAMPARRAY input_data,
+;                               JSAMPARRAY *output_data_ptr);
+;
+
+%define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
+%define downsamp_width(b)   (b) + 12    ; JDIMENSION downsampled_width
+%define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
+%define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr
+
+%define original_ebp  ebp + 0
+%define wk(i)         ebp - (WK_NUM - (i)) * SIZEOF_MMWORD  ; mmword wk[WK_NUM]
+%define WK_NUM        4
+%define gotptr        wk(0) - SIZEOF_POINTER  ; void *gotptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v2_fancy_upsample_mmx)
+
+EXTN(jsimd_h2v2_fancy_upsample_mmx):
+    push        ebp
+    mov         eax, esp                    ; eax = original ebp
+    sub         esp, byte 4
+    and         esp, byte (-SIZEOF_MMWORD)  ; align to 64 bits
+    mov         [esp], eax
+    mov         ebp, esp                    ; ebp = aligned ebp
+    lea         esp, [wk(0)]
+    pushpic     eax                     ; make a room for GOT address
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+    movpic      POINTER [gotptr], ebx   ; save GOT address
+
+    mov         edx, eax                ; edx = original ebp
+    mov         eax, JDIMENSION [downsamp_width(edx)]  ; colctr
+    test        eax, eax
+    jz          near .return
+
+    mov         ecx, INT [max_v_samp(edx)]  ; rowctr
+    test        ecx, ecx
+    jz          near .return
+
+    mov         esi, JSAMPARRAY [input_data(edx)]    ; input_data
+    mov         edi, POINTER [output_data_ptr(edx)]
+    mov         edi, JSAMPARRAY [edi]                ; output_data
+    alignx      16, 7
+.rowloop:
+    push        eax                     ; colctr
+    push        ecx
+    push        edi
+    push        esi
+
+    mov         ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW]  ; inptr1(above)
+    mov         ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; inptr0
+    mov         esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; inptr1(below)
+    mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]  ; outptr0
+    mov         edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]  ; outptr1
+
+    test        eax, SIZEOF_MMWORD-1
+    jz          short .skip
+    push        edx
+    mov         dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE]
+    mov         JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl
+    mov         dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE]
+    mov         JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl
+    mov         dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
+    mov         JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
+    pop         edx
+.skip:
+    ; -- process the first column block
+
+    movq        mm0, MMWORD [ebx+0*SIZEOF_MMWORD]  ; mm0=row[ 0][0]
+    movq        mm1, MMWORD [ecx+0*SIZEOF_MMWORD]  ; mm1=row[-1][0]
+    movq        mm2, MMWORD [esi+0*SIZEOF_MMWORD]  ; mm2=row[+1][0]
+
+    pushpic     ebx
+    movpic      ebx, POINTER [gotptr]   ; load GOT address
+
+    pxor        mm3, mm3                ; mm3=(all 0's)
+    movq        mm4, mm0
+    punpcklbw   mm0, mm3                ; mm0=row[ 0][0]( 0 1 2 3)
+    punpckhbw   mm4, mm3                ; mm4=row[ 0][0]( 4 5 6 7)
+    movq        mm5, mm1
+    punpcklbw   mm1, mm3                ; mm1=row[-1][0]( 0 1 2 3)
+    punpckhbw   mm5, mm3                ; mm5=row[-1][0]( 4 5 6 7)
+    movq        mm6, mm2
+    punpcklbw   mm2, mm3                ; mm2=row[+1][0]( 0 1 2 3)
+    punpckhbw   mm6, mm3                ; mm6=row[+1][0]( 4 5 6 7)
+
+    pmullw      mm0, [GOTOFF(ebx,PW_THREE)]
+    pmullw      mm4, [GOTOFF(ebx,PW_THREE)]
+
+    pcmpeqb     mm7, mm7
+    psrlq       mm7, (SIZEOF_MMWORD-2)*BYTE_BIT
+
+    paddw       mm1, mm0                ; mm1=Int0L=( 0 1 2 3)
+    paddw       mm5, mm4                ; mm5=Int0H=( 4 5 6 7)
+    paddw       mm2, mm0                ; mm2=Int1L=( 0 1 2 3)
+    paddw       mm6, mm4                ; mm6=Int1H=( 4 5 6 7)
+
+    movq        MMWORD [edx+0*SIZEOF_MMWORD], mm1  ; temporarily save
+    movq        MMWORD [edx+1*SIZEOF_MMWORD], mm5  ; the intermediate data
+    movq        MMWORD [edi+0*SIZEOF_MMWORD], mm2
+    movq        MMWORD [edi+1*SIZEOF_MMWORD], mm6
+
+    pand        mm1, mm7                ; mm1=( 0 - - -)
+    pand        mm2, mm7                ; mm2=( 0 - - -)
+
+    movq        MMWORD [wk(0)], mm1
+    movq        MMWORD [wk(1)], mm2
+
+    poppic      ebx
+
+    add         eax, byte SIZEOF_MMWORD-1
+    and         eax, byte -SIZEOF_MMWORD
+    cmp         eax, byte SIZEOF_MMWORD
+    ja          short .columnloop
+    alignx      16, 7
+
+.columnloop_last:
+    ; -- process the last column block
+
+    pushpic     ebx
+    movpic      ebx, POINTER [gotptr]   ; load GOT address
+
+    pcmpeqb     mm1, mm1
+    psllq       mm1, (SIZEOF_MMWORD-2)*BYTE_BIT
+    movq        mm2, mm1
+
+    pand        mm1, MMWORD [edx+1*SIZEOF_MMWORD]  ; mm1=( - - - 7)
+    pand        mm2, MMWORD [edi+1*SIZEOF_MMWORD]  ; mm2=( - - - 7)
+
+    movq        MMWORD [wk(2)], mm1
+    movq        MMWORD [wk(3)], mm2
+
+    jmp         short .upsample
+    alignx      16, 7
+
+.columnloop:
+    ; -- process the next column block
+
+    movq        mm0, MMWORD [ebx+1*SIZEOF_MMWORD]  ; mm0=row[ 0][1]
+    movq        mm1, MMWORD [ecx+1*SIZEOF_MMWORD]  ; mm1=row[-1][1]
+    movq        mm2, MMWORD [esi+1*SIZEOF_MMWORD]  ; mm2=row[+1][1]
+
+    pushpic     ebx
+    movpic      ebx, POINTER [gotptr]   ; load GOT address
+
+    pxor        mm3, mm3                ; mm3=(all 0's)
+    movq        mm4, mm0
+    punpcklbw   mm0, mm3                ; mm0=row[ 0][1]( 0 1 2 3)
+    punpckhbw   mm4, mm3                ; mm4=row[ 0][1]( 4 5 6 7)
+    movq        mm5, mm1
+    punpcklbw   mm1, mm3                ; mm1=row[-1][1]( 0 1 2 3)
+    punpckhbw   mm5, mm3                ; mm5=row[-1][1]( 4 5 6 7)
+    movq        mm6, mm2
+    punpcklbw   mm2, mm3                ; mm2=row[+1][1]( 0 1 2 3)
+    punpckhbw   mm6, mm3                ; mm6=row[+1][1]( 4 5 6 7)
+
+    pmullw      mm0, [GOTOFF(ebx,PW_THREE)]
+    pmullw      mm4, [GOTOFF(ebx,PW_THREE)]
+
+    paddw       mm1, mm0                ; mm1=Int0L=( 0 1 2 3)
+    paddw       mm5, mm4                ; mm5=Int0H=( 4 5 6 7)
+    paddw       mm2, mm0                ; mm2=Int1L=( 0 1 2 3)
+    paddw       mm6, mm4                ; mm6=Int1H=( 4 5 6 7)
+
+    movq        MMWORD [edx+2*SIZEOF_MMWORD], mm1  ; temporarily save
+    movq        MMWORD [edx+3*SIZEOF_MMWORD], mm5  ; the intermediate data
+    movq        MMWORD [edi+2*SIZEOF_MMWORD], mm2
+    movq        MMWORD [edi+3*SIZEOF_MMWORD], mm6
+
+    psllq       mm1, (SIZEOF_MMWORD-2)*BYTE_BIT  ; mm1=( - - - 0)
+    psllq       mm2, (SIZEOF_MMWORD-2)*BYTE_BIT  ; mm2=( - - - 0)
+
+    movq        MMWORD [wk(2)], mm1
+    movq        MMWORD [wk(3)], mm2
+
+.upsample:
+    ; -- process the upper row
+
+    movq        mm7, MMWORD [edx+0*SIZEOF_MMWORD]  ; mm7=Int0L=( 0 1 2 3)
+    movq        mm3, MMWORD [edx+1*SIZEOF_MMWORD]  ; mm3=Int0H=( 4 5 6 7)
+
+    movq        mm0, mm7
+    movq        mm4, mm3
+    psrlq       mm0, 2*BYTE_BIT                  ; mm0=( 1 2 3 -)
+    psllq       mm4, (SIZEOF_MMWORD-2)*BYTE_BIT  ; mm4=( - - - 4)
+    movq        mm5, mm7
+    movq        mm6, mm3
+    psrlq       mm5, (SIZEOF_MMWORD-2)*BYTE_BIT  ; mm5=( 3 - - -)
+    psllq       mm6, 2*BYTE_BIT                  ; mm6=( - 4 5 6)
+
+    por         mm0, mm4                         ; mm0=( 1 2 3 4)
+    por         mm5, mm6                         ; mm5=( 3 4 5 6)
+
+    movq        mm1, mm7
+    movq        mm2, mm3
+    psllq       mm1, 2*BYTE_BIT                  ; mm1=( - 0 1 2)
+    psrlq       mm2, 2*BYTE_BIT                  ; mm2=( 5 6 7 -)
+    movq        mm4, mm3
+    psrlq       mm4, (SIZEOF_MMWORD-2)*BYTE_BIT  ; mm4=( 7 - - -)
+
+    por         mm1, MMWORD [wk(0)]              ; mm1=(-1 0 1 2)
+    por         mm2, MMWORD [wk(2)]              ; mm2=( 5 6 7 8)
+
+    movq        MMWORD [wk(0)], mm4
+
+    pmullw      mm7, [GOTOFF(ebx,PW_THREE)]
+    pmullw      mm3, [GOTOFF(ebx,PW_THREE)]
+    paddw       mm1, [GOTOFF(ebx,PW_EIGHT)]
+    paddw       mm5, [GOTOFF(ebx,PW_EIGHT)]
+    paddw       mm0, [GOTOFF(ebx,PW_SEVEN)]
+    paddw       mm2, [GOTOFF(ebx,PW_SEVEN)]
+
+    paddw       mm1, mm7
+    paddw       mm5, mm3
+    psrlw       mm1, 4                  ; mm1=Out0LE=( 0  2  4  6)
+    psrlw       mm5, 4                  ; mm5=Out0HE=( 8 10 12 14)
+    paddw       mm0, mm7
+    paddw       mm2, mm3
+    psrlw       mm0, 4                  ; mm0=Out0LO=( 1  3  5  7)
+    psrlw       mm2, 4                  ; mm2=Out0HO=( 9 11 13 15)
+
+    psllw       mm0, BYTE_BIT
+    psllw       mm2, BYTE_BIT
+    por         mm1, mm0                ; mm1=Out0L=( 0  1  2  3  4  5  6  7)
+    por         mm5, mm2                ; mm5=Out0H=( 8  9 10 11 12 13 14 15)
+
+    movq        MMWORD [edx+0*SIZEOF_MMWORD], mm1
+    movq        MMWORD [edx+1*SIZEOF_MMWORD], mm5
+
+    ; -- process the lower row
+
+    movq        mm6, MMWORD [edi+0*SIZEOF_MMWORD]  ; mm6=Int1L=( 0 1 2 3)
+    movq        mm4, MMWORD [edi+1*SIZEOF_MMWORD]  ; mm4=Int1H=( 4 5 6 7)
+
+    movq        mm7, mm6
+    movq        mm3, mm4
+    psrlq       mm7, 2*BYTE_BIT                  ; mm7=( 1 2 3 -)
+    psllq       mm3, (SIZEOF_MMWORD-2)*BYTE_BIT  ; mm3=( - - - 4)
+    movq        mm0, mm6
+    movq        mm2, mm4
+    psrlq       mm0, (SIZEOF_MMWORD-2)*BYTE_BIT  ; mm0=( 3 - - -)
+    psllq       mm2, 2*BYTE_BIT                  ; mm2=( - 4 5 6)
+
+    por         mm7, mm3                         ; mm7=( 1 2 3 4)
+    por         mm0, mm2                         ; mm0=( 3 4 5 6)
+
+    movq        mm1, mm6
+    movq        mm5, mm4
+    psllq       mm1, 2*BYTE_BIT                  ; mm1=( - 0 1 2)
+    psrlq       mm5, 2*BYTE_BIT                  ; mm5=( 5 6 7 -)
+    movq        mm3, mm4
+    psrlq       mm3, (SIZEOF_MMWORD-2)*BYTE_BIT  ; mm3=( 7 - - -)
+
+    por         mm1, MMWORD [wk(1)]              ; mm1=(-1 0 1 2)
+    por         mm5, MMWORD [wk(3)]              ; mm5=( 5 6 7 8)
+
+    movq        MMWORD [wk(1)], mm3
+
+    pmullw      mm6, [GOTOFF(ebx,PW_THREE)]
+    pmullw      mm4, [GOTOFF(ebx,PW_THREE)]
+    paddw       mm1, [GOTOFF(ebx,PW_EIGHT)]
+    paddw       mm0, [GOTOFF(ebx,PW_EIGHT)]
+    paddw       mm7, [GOTOFF(ebx,PW_SEVEN)]
+    paddw       mm5, [GOTOFF(ebx,PW_SEVEN)]
+
+    paddw       mm1, mm6
+    paddw       mm0, mm4
+    psrlw       mm1, 4                  ; mm1=Out1LE=( 0  2  4  6)
+    psrlw       mm0, 4                  ; mm0=Out1HE=( 8 10 12 14)
+    paddw       mm7, mm6
+    paddw       mm5, mm4
+    psrlw       mm7, 4                  ; mm7=Out1LO=( 1  3  5  7)
+    psrlw       mm5, 4                  ; mm5=Out1HO=( 9 11 13 15)
+
+    psllw       mm7, BYTE_BIT
+    psllw       mm5, BYTE_BIT
+    por         mm1, mm7                ; mm1=Out1L=( 0  1  2  3  4  5  6  7)
+    por         mm0, mm5                ; mm0=Out1H=( 8  9 10 11 12 13 14 15)
+
+    movq        MMWORD [edi+0*SIZEOF_MMWORD], mm1
+    movq        MMWORD [edi+1*SIZEOF_MMWORD], mm0
+
+    poppic      ebx
+
+    sub         eax, byte SIZEOF_MMWORD
+    add         ecx, byte 1*SIZEOF_MMWORD  ; inptr1(above)
+    add         ebx, byte 1*SIZEOF_MMWORD  ; inptr0
+    add         esi, byte 1*SIZEOF_MMWORD  ; inptr1(below)
+    add         edx, byte 2*SIZEOF_MMWORD  ; outptr0
+    add         edi, byte 2*SIZEOF_MMWORD  ; outptr1
+    cmp         eax, byte SIZEOF_MMWORD
+    ja          near .columnloop
+    test        eax, eax
+    jnz         near .columnloop_last
+
+    pop         esi
+    pop         edi
+    pop         ecx
+    pop         eax
+
+    add         esi, byte 1*SIZEOF_JSAMPROW  ; input_data
+    add         edi, byte 2*SIZEOF_JSAMPROW  ; output_data
+    sub         ecx, byte 2                  ; rowctr
+    jg          near .rowloop
+
+    emms                                ; empty MMX state
+
+.return:
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
+; It's still a box filter.
+;
+; GLOBAL(void)
+; jsimd_h2v1_upsample_mmx(int max_v_samp_factor, JDIMENSION output_width,
+;                         JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+;
+
+%define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
+%define output_width(b)     (b) + 12    ; JDIMENSION output_width
+%define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
+%define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v1_upsample_mmx)
+
+EXTN(jsimd_h2v1_upsample_mmx):
+    push        ebp
+    mov         ebp, esp
+;   push        ebx                     ; unused
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    mov         edx, JDIMENSION [output_width(ebp)]
+    add         edx, byte (2*SIZEOF_MMWORD)-1
+    and         edx, byte -(2*SIZEOF_MMWORD)
+    jz          short .return
+
+    mov         ecx, INT [max_v_samp(ebp)]  ; rowctr
+    test        ecx, ecx
+    jz          short .return
+
+    mov         esi, JSAMPARRAY [input_data(ebp)]    ; input_data
+    mov         edi, POINTER [output_data_ptr(ebp)]
+    mov         edi, JSAMPARRAY [edi]                ; output_data
+    alignx      16, 7
+.rowloop:
+    push        edi
+    push        esi
+
+    mov         esi, JSAMPROW [esi]     ; inptr
+    mov         edi, JSAMPROW [edi]     ; outptr
+    mov         eax, edx                ; colctr
+    alignx      16, 7
+.columnloop:
+
+    movq        mm0, MMWORD [esi+0*SIZEOF_MMWORD]
+
+    movq        mm1, mm0
+    punpcklbw   mm0, mm0
+    punpckhbw   mm1, mm1
+
+    movq        MMWORD [edi+0*SIZEOF_MMWORD], mm0
+    movq        MMWORD [edi+1*SIZEOF_MMWORD], mm1
+
+    sub         eax, byte 2*SIZEOF_MMWORD
+    jz          short .nextrow
+
+    movq        mm2, MMWORD [esi+1*SIZEOF_MMWORD]
+
+    movq        mm3, mm2
+    punpcklbw   mm2, mm2
+    punpckhbw   mm3, mm3
+
+    movq        MMWORD [edi+2*SIZEOF_MMWORD], mm2
+    movq        MMWORD [edi+3*SIZEOF_MMWORD], mm3
+
+    sub         eax, byte 2*SIZEOF_MMWORD
+    jz          short .nextrow
+
+    add         esi, byte 2*SIZEOF_MMWORD  ; inptr
+    add         edi, byte 4*SIZEOF_MMWORD  ; outptr
+    jmp         short .columnloop
+    alignx      16, 7
+
+.nextrow:
+    pop         esi
+    pop         edi
+
+    add         esi, byte SIZEOF_JSAMPROW  ; input_data
+    add         edi, byte SIZEOF_JSAMPROW  ; output_data
+    dec         ecx                        ; rowctr
+    jg          short .rowloop
+
+    emms                                ; empty MMX state
+
+.return:
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+;   pop         ebx                     ; unused
+    pop         ebp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
+; It's still a box filter.
+;
+; GLOBAL(void)
+; jsimd_h2v2_upsample_mmx(int max_v_samp_factor, JDIMENSION output_width,
+;                         JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+;
+
+%define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
+%define output_width(b)     (b) + 12    ; JDIMENSION output_width
+%define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
+%define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v2_upsample_mmx)
+
+EXTN(jsimd_h2v2_upsample_mmx):
+    push        ebp
+    mov         ebp, esp
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    mov         edx, JDIMENSION [output_width(ebp)]
+    add         edx, byte (2*SIZEOF_MMWORD)-1
+    and         edx, byte -(2*SIZEOF_MMWORD)
+    jz          near .return
+
+    mov         ecx, INT [max_v_samp(ebp)]  ; rowctr
+    test        ecx, ecx
+    jz          short .return
+
+    mov         esi, JSAMPARRAY [input_data(ebp)]    ; input_data
+    mov         edi, POINTER [output_data_ptr(ebp)]
+    mov         edi, JSAMPARRAY [edi]                ; output_data
+    alignx      16, 7
+.rowloop:
+    push        edi
+    push        esi
+
+    mov         esi, JSAMPROW [esi]                    ; inptr
+    mov         ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]  ; outptr0
+    mov         edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]  ; outptr1
+    mov         eax, edx                               ; colctr
+    alignx      16, 7
+.columnloop:
+
+    movq        mm0, MMWORD [esi+0*SIZEOF_MMWORD]
+
+    movq        mm1, mm0
+    punpcklbw   mm0, mm0
+    punpckhbw   mm1, mm1
+
+    movq        MMWORD [ebx+0*SIZEOF_MMWORD], mm0
+    movq        MMWORD [ebx+1*SIZEOF_MMWORD], mm1
+    movq        MMWORD [edi+0*SIZEOF_MMWORD], mm0
+    movq        MMWORD [edi+1*SIZEOF_MMWORD], mm1
+
+    sub         eax, byte 2*SIZEOF_MMWORD
+    jz          short .nextrow
+
+    movq        mm2, MMWORD [esi+1*SIZEOF_MMWORD]
+
+    movq        mm3, mm2
+    punpcklbw   mm2, mm2
+    punpckhbw   mm3, mm3
+
+    movq        MMWORD [ebx+2*SIZEOF_MMWORD], mm2
+    movq        MMWORD [ebx+3*SIZEOF_MMWORD], mm3
+    movq        MMWORD [edi+2*SIZEOF_MMWORD], mm2
+    movq        MMWORD [edi+3*SIZEOF_MMWORD], mm3
+
+    sub         eax, byte 2*SIZEOF_MMWORD
+    jz          short .nextrow
+
+    add         esi, byte 2*SIZEOF_MMWORD  ; inptr
+    add         ebx, byte 4*SIZEOF_MMWORD  ; outptr0
+    add         edi, byte 4*SIZEOF_MMWORD  ; outptr1
+    jmp         short .columnloop
+    alignx      16, 7
+
+.nextrow:
+    pop         esi
+    pop         edi
+
+    add         esi, byte 1*SIZEOF_JSAMPROW  ; input_data
+    add         edi, byte 2*SIZEOF_JSAMPROW  ; output_data
+    sub         ecx, byte 2                  ; rowctr
+    jg          short .rowloop
+
+    emms                                ; empty MMX state
+
+.return:
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/i386/jdsample-sse2.asm b/media/libjpeg/simd/i386/jdsample-sse2.asm
new file mode 100644
index 0000000000..4e28d2f4b8
--- /dev/null
+++ b/media/libjpeg/simd/i386/jdsample-sse2.asm
@@ -0,0 +1,724 @@
+;
+; jdsample.asm - upsampling (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_fancy_upsample_sse2)
+
+EXTN(jconst_fancy_upsample_sse2):
+
+PW_ONE   times 8 dw 1
+PW_TWO   times 8 dw 2
+PW_THREE times 8 dw 3
+PW_SEVEN times 8 dw 7
+PW_EIGHT times 8 dw 8
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
+;
+; The upsampling algorithm is linear interpolation between pixel centers,
+; also known as a "triangle filter".  This is a good compromise between
+; speed and visual quality.  The centers of the output pixels are 1/4 and 3/4
+; of the way between input pixel centers.
+;
+; GLOBAL(void)
+; jsimd_h2v1_fancy_upsample_sse2(int max_v_samp_factor,
+;                                JDIMENSION downsampled_width,
+;                                JSAMPARRAY input_data,
+;                                JSAMPARRAY *output_data_ptr);
+;
+
+%define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
+%define downsamp_width(b)   (b) + 12    ; JDIMENSION downsampled_width
+%define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
+%define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v1_fancy_upsample_sse2)
+
+EXTN(jsimd_h2v1_fancy_upsample_sse2):
+    push        ebp
+    mov         ebp, esp
+    pushpic     ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+
+    mov         eax, JDIMENSION [downsamp_width(ebp)]  ; colctr
+    test        eax, eax
+    jz          near .return
+
+    mov         ecx, INT [max_v_samp(ebp)]  ; rowctr
+    test        ecx, ecx
+    jz          near .return
+
+    mov         esi, JSAMPARRAY [input_data(ebp)]    ; input_data
+    mov         edi, POINTER [output_data_ptr(ebp)]
+    mov         edi, JSAMPARRAY [edi]                ; output_data
+    alignx      16, 7
+.rowloop:
+    push        eax                     ; colctr
+    push        edi
+    push        esi
+
+    mov         esi, JSAMPROW [esi]     ; inptr
+    mov         edi, JSAMPROW [edi]     ; outptr
+
+    test        eax, SIZEOF_XMMWORD-1
+    jz          short .skip
+    mov         dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
+    mov         JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
+.skip:
+    pxor        xmm0, xmm0              ; xmm0=(all 0's)
+    pcmpeqb     xmm7, xmm7
+    psrldq      xmm7, (SIZEOF_XMMWORD-1)
+    pand        xmm7, XMMWORD [esi+0*SIZEOF_XMMWORD]
+
+    add         eax, byte SIZEOF_XMMWORD-1
+    and         eax, byte -SIZEOF_XMMWORD
+    cmp         eax, byte SIZEOF_XMMWORD
+    ja          short .columnloop
+    alignx      16, 7
+
+.columnloop_last:
+    pcmpeqb     xmm6, xmm6
+    pslldq      xmm6, (SIZEOF_XMMWORD-1)
+    pand        xmm6, XMMWORD [esi+0*SIZEOF_XMMWORD]
+    jmp         short .upsample
+    alignx      16, 7
+
+.columnloop:
+    movdqa      xmm6, XMMWORD [esi+1*SIZEOF_XMMWORD]
+    pslldq      xmm6, (SIZEOF_XMMWORD-1)
+
+.upsample:
+    movdqa      xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
+    movdqa      xmm2, xmm1
+    movdqa      xmm3, xmm1                ; xmm1=( 0  1  2 ... 13 14 15)
+    pslldq      xmm2, 1                   ; xmm2=(--  0  1 ... 12 13 14)
+    psrldq      xmm3, 1                   ; xmm3=( 1  2  3 ... 14 15 --)
+
+    por         xmm2, xmm7                ; xmm2=(-1  0  1 ... 12 13 14)
+    por         xmm3, xmm6                ; xmm3=( 1  2  3 ... 14 15 16)
+
+    movdqa      xmm7, xmm1
+    psrldq      xmm7, (SIZEOF_XMMWORD-1)  ; xmm7=(15 -- -- ... -- -- --)
+
+    movdqa      xmm4, xmm1
+    punpcklbw   xmm1, xmm0                ; xmm1=( 0  1  2  3  4  5  6  7)
+    punpckhbw   xmm4, xmm0                ; xmm4=( 8  9 10 11 12 13 14 15)
+    movdqa      xmm5, xmm2
+    punpcklbw   xmm2, xmm0                ; xmm2=(-1  0  1  2  3  4  5  6)
+    punpckhbw   xmm5, xmm0                ; xmm5=( 7  8  9 10 11 12 13 14)
+    movdqa      xmm6, xmm3
+    punpcklbw   xmm3, xmm0                ; xmm3=( 1  2  3  4  5  6  7  8)
+    punpckhbw   xmm6, xmm0                ; xmm6=( 9 10 11 12 13 14 15 16)
+
+    pmullw      xmm1, [GOTOFF(ebx,PW_THREE)]
+    pmullw      xmm4, [GOTOFF(ebx,PW_THREE)]
+    paddw       xmm2, [GOTOFF(ebx,PW_ONE)]
+    paddw       xmm5, [GOTOFF(ebx,PW_ONE)]
+    paddw       xmm3, [GOTOFF(ebx,PW_TWO)]
+    paddw       xmm6, [GOTOFF(ebx,PW_TWO)]
+
+    paddw       xmm2, xmm1
+    paddw       xmm5, xmm4
+    psrlw       xmm2, 2                 ; xmm2=OutLE=( 0  2  4  6  8 10 12 14)
+    psrlw       xmm5, 2                 ; xmm5=OutHE=(16 18 20 22 24 26 28 30)
+    paddw       xmm3, xmm1
+    paddw       xmm6, xmm4
+    psrlw       xmm3, 2                 ; xmm3=OutLO=( 1  3  5  7  9 11 13 15)
+    psrlw       xmm6, 2                 ; xmm6=OutHO=(17 19 21 23 25 27 29 31)
+
+    psllw       xmm3, BYTE_BIT
+    psllw       xmm6, BYTE_BIT
+    por         xmm2, xmm3              ; xmm2=OutL=( 0  1  2 ... 13 14 15)
+    por         xmm5, xmm6              ; xmm5=OutH=(16 17 18 ... 29 30 31)
+
+    movdqa      XMMWORD [edi+0*SIZEOF_XMMWORD], xmm2
+    movdqa      XMMWORD [edi+1*SIZEOF_XMMWORD], xmm5
+
+    sub         eax, byte SIZEOF_XMMWORD
+    add         esi, byte 1*SIZEOF_XMMWORD  ; inptr
+    add         edi, byte 2*SIZEOF_XMMWORD  ; outptr
+    cmp         eax, byte SIZEOF_XMMWORD
+    ja          near .columnloop
+    test        eax, eax
+    jnz         near .columnloop_last
+
+    pop         esi
+    pop         edi
+    pop         eax
+
+    add         esi, byte SIZEOF_JSAMPROW  ; input_data
+    add         edi, byte SIZEOF_JSAMPROW  ; output_data
+    dec         ecx                        ; rowctr
+    jg          near .rowloop
+
+.return:
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    poppic      ebx
+    pop         ebp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
+; Again a triangle filter; see comments for h2v1 case, above.
+;
+; GLOBAL(void)
+; jsimd_h2v2_fancy_upsample_sse2(int max_v_samp_factor,
+;                                JDIMENSION downsampled_width,
+;                                JSAMPARRAY input_data,
+;                                JSAMPARRAY *output_data_ptr);
+;
+
+%define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
+%define downsamp_width(b)   (b) + 12    ; JDIMENSION downsampled_width
+%define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
+%define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr
+
+%define original_ebp  ebp + 0
+%define wk(i)         ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+                                        ; xmmword wk[WK_NUM]
+%define WK_NUM        4
+%define gotptr        wk(0) - SIZEOF_POINTER  ; void *gotptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v2_fancy_upsample_sse2)
+
+EXTN(jsimd_h2v2_fancy_upsample_sse2):
+    push        ebp
+    mov         eax, esp                     ; eax = original ebp
+    sub         esp, byte 4
+    and         esp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [esp], eax
+    mov         ebp, esp                     ; ebp = aligned ebp
+    lea         esp, [wk(0)]
+    pushpic     eax                     ; make a room for GOT address
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+    movpic      POINTER [gotptr], ebx   ; save GOT address
+
+    mov         edx, eax                ; edx = original ebp
+    mov         eax, JDIMENSION [downsamp_width(edx)]  ; colctr
+    test        eax, eax
+    jz          near .return
+
+    mov         ecx, INT [max_v_samp(edx)]  ; rowctr
+    test        ecx, ecx
+    jz          near .return
+
+    mov         esi, JSAMPARRAY [input_data(edx)]    ; input_data
+    mov         edi, POINTER [output_data_ptr(edx)]
+    mov         edi, JSAMPARRAY [edi]                ; output_data
+    alignx      16, 7
+.rowloop:
+    push        eax                     ; colctr
+    push        ecx
+    push        edi
+    push        esi
+
+    mov         ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW]  ; inptr1(above)
+    mov         ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; inptr0
+    mov         esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; inptr1(below)
+    mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]  ; outptr0
+    mov         edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]  ; outptr1
+
+    test        eax, SIZEOF_XMMWORD-1
+    jz          short .skip
+    push        edx
+    mov         dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE]
+    mov         JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl
+    mov         dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE]
+    mov         JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl
+    mov         dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
+    mov         JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
+    pop         edx
+.skip:
+    ; -- process the first column block
+
+    movdqa      xmm0, XMMWORD [ebx+0*SIZEOF_XMMWORD]  ; xmm0=row[ 0][0]
+    movdqa      xmm1, XMMWORD [ecx+0*SIZEOF_XMMWORD]  ; xmm1=row[-1][0]
+    movdqa      xmm2, XMMWORD [esi+0*SIZEOF_XMMWORD]  ; xmm2=row[+1][0]
+
+    pushpic     ebx
+    movpic      ebx, POINTER [gotptr]   ; load GOT address
+
+    pxor        xmm3, xmm3              ; xmm3=(all 0's)
+    movdqa      xmm4, xmm0
+    punpcklbw   xmm0, xmm3              ; xmm0=row[ 0]( 0  1  2  3  4  5  6  7)
+    punpckhbw   xmm4, xmm3              ; xmm4=row[ 0]( 8  9 10 11 12 13 14 15)
+    movdqa      xmm5, xmm1
+    punpcklbw   xmm1, xmm3              ; xmm1=row[-1]( 0  1  2  3  4  5  6  7)
+    punpckhbw   xmm5, xmm3              ; xmm5=row[-1]( 8  9 10 11 12 13 14 15)
+    movdqa      xmm6, xmm2
+    punpcklbw   xmm2, xmm3              ; xmm2=row[+1]( 0  1  2  3  4  5  6  7)
+    punpckhbw   xmm6, xmm3              ; xmm6=row[+1]( 8  9 10 11 12 13 14 15)
+
+    pmullw      xmm0, [GOTOFF(ebx,PW_THREE)]
+    pmullw      xmm4, [GOTOFF(ebx,PW_THREE)]
+
+    pcmpeqb     xmm7, xmm7
+    psrldq      xmm7, (SIZEOF_XMMWORD-2)
+
+    paddw       xmm1, xmm0              ; xmm1=Int0L=( 0  1  2  3  4  5  6  7)
+    paddw       xmm5, xmm4              ; xmm5=Int0H=( 8  9 10 11 12 13 14 15)
+    paddw       xmm2, xmm0              ; xmm2=Int1L=( 0  1  2  3  4  5  6  7)
+    paddw       xmm6, xmm4              ; xmm6=Int1H=( 8  9 10 11 12 13 14 15)
+
+    movdqa      XMMWORD [edx+0*SIZEOF_XMMWORD], xmm1  ; temporarily save
+    movdqa      XMMWORD [edx+1*SIZEOF_XMMWORD], xmm5  ; the intermediate data
+    movdqa      XMMWORD [edi+0*SIZEOF_XMMWORD], xmm2
+    movdqa      XMMWORD [edi+1*SIZEOF_XMMWORD], xmm6
+
+    pand        xmm1, xmm7              ; xmm1=( 0 -- -- -- -- -- -- --)
+    pand        xmm2, xmm7              ; xmm2=( 0 -- -- -- -- -- -- --)
+
+    movdqa      XMMWORD [wk(0)], xmm1
+    movdqa      XMMWORD [wk(1)], xmm2
+
+    poppic      ebx
+
+    add         eax, byte SIZEOF_XMMWORD-1
+    and         eax, byte -SIZEOF_XMMWORD
+    cmp         eax, byte SIZEOF_XMMWORD
+    ja          short .columnloop
+    alignx      16, 7
+
+.columnloop_last:
+    ; -- process the last column block
+
+    pushpic     ebx
+    movpic      ebx, POINTER [gotptr]   ; load GOT address
+
+    pcmpeqb     xmm1, xmm1
+    pslldq      xmm1, (SIZEOF_XMMWORD-2)
+    movdqa      xmm2, xmm1
+
+    pand        xmm1, XMMWORD [edx+1*SIZEOF_XMMWORD]
+    pand        xmm2, XMMWORD [edi+1*SIZEOF_XMMWORD]
+
+    movdqa      XMMWORD [wk(2)], xmm1   ; xmm1=(-- -- -- -- -- -- -- 15)
+    movdqa      XMMWORD [wk(3)], xmm2   ; xmm2=(-- -- -- -- -- -- -- 15)
+
+    jmp         near .upsample
+    alignx      16, 7
+
+.columnloop:
+    ; -- process the next column block
+
+    movdqa      xmm0, XMMWORD [ebx+1*SIZEOF_XMMWORD]  ; xmm0=row[ 0][1]
+    movdqa      xmm1, XMMWORD [ecx+1*SIZEOF_XMMWORD]  ; xmm1=row[-1][1]
+    movdqa      xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD]  ; xmm2=row[+1][1]
+
+    pushpic     ebx
+    movpic      ebx, POINTER [gotptr]   ; load GOT address
+
+    pxor        xmm3, xmm3              ; xmm3=(all 0's)
+    movdqa      xmm4, xmm0
+    punpcklbw   xmm0, xmm3              ; xmm0=row[ 0]( 0  1  2  3  4  5  6  7)
+    punpckhbw   xmm4, xmm3              ; xmm4=row[ 0]( 8  9 10 11 12 13 14 15)
+    movdqa      xmm5, xmm1
+    punpcklbw   xmm1, xmm3              ; xmm1=row[-1]( 0  1  2  3  4  5  6  7)
+    punpckhbw   xmm5, xmm3              ; xmm5=row[-1]( 8  9 10 11 12 13 14 15)
+    movdqa      xmm6, xmm2
+    punpcklbw   xmm2, xmm3              ; xmm2=row[+1]( 0  1  2  3  4  5  6  7)
+    punpckhbw   xmm6, xmm3              ; xmm6=row[+1]( 8  9 10 11 12 13 14 15)
+
+    pmullw      xmm0, [GOTOFF(ebx,PW_THREE)]
+    pmullw      xmm4, [GOTOFF(ebx,PW_THREE)]
+
+    paddw       xmm1, xmm0              ; xmm1=Int0L=( 0  1  2  3  4  5  6  7)
+    paddw       xmm5, xmm4              ; xmm5=Int0H=( 8  9 10 11 12 13 14 15)
+    paddw       xmm2, xmm0              ; xmm2=Int1L=( 0  1  2  3  4  5  6  7)
+    paddw       xmm6, xmm4              ; xmm6=Int1H=( 8  9 10 11 12 13 14 15)
+
+    movdqa      XMMWORD [edx+2*SIZEOF_XMMWORD], xmm1  ; temporarily save
+    movdqa      XMMWORD [edx+3*SIZEOF_XMMWORD], xmm5  ; the intermediate data
+    movdqa      XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2
+    movdqa      XMMWORD [edi+3*SIZEOF_XMMWORD], xmm6
+
+    pslldq      xmm1, (SIZEOF_XMMWORD-2)  ; xmm1=(-- -- -- -- -- -- --  0)
+    pslldq      xmm2, (SIZEOF_XMMWORD-2)  ; xmm2=(-- -- -- -- -- -- --  0)
+
+    movdqa      XMMWORD [wk(2)], xmm1
+    movdqa      XMMWORD [wk(3)], xmm2
+
+.upsample:
+    ; -- process the upper row
+
+    movdqa      xmm7, XMMWORD [edx+0*SIZEOF_XMMWORD]
+    movdqa      xmm3, XMMWORD [edx+1*SIZEOF_XMMWORD]
+
+    movdqa      xmm0, xmm7                ; xmm7=Int0L=( 0  1  2  3  4  5  6  7)
+    movdqa      xmm4, xmm3                ; xmm3=Int0H=( 8  9 10 11 12 13 14 15)
+    psrldq      xmm0, 2                   ; xmm0=( 1  2  3  4  5  6  7 --)
+    pslldq      xmm4, (SIZEOF_XMMWORD-2)  ; xmm4=(-- -- -- -- -- -- --  8)
+    movdqa      xmm5, xmm7
+    movdqa      xmm6, xmm3
+    psrldq      xmm5, (SIZEOF_XMMWORD-2)  ; xmm5=( 7 -- -- -- -- -- -- --)
+    pslldq      xmm6, 2                   ; xmm6=(--  8  9 10 11 12 13 14)
+
+    por         xmm0, xmm4                ; xmm0=( 1  2  3  4  5  6  7  8)
+    por         xmm5, xmm6                ; xmm5=( 7  8  9 10 11 12 13 14)
+
+    movdqa      xmm1, xmm7
+    movdqa      xmm2, xmm3
+    pslldq      xmm1, 2                   ; xmm1=(--  0  1  2  3  4  5  6)
+    psrldq      xmm2, 2                   ; xmm2=( 9 10 11 12 13 14 15 --)
+    movdqa      xmm4, xmm3
+    psrldq      xmm4, (SIZEOF_XMMWORD-2)  ; xmm4=(15 -- -- -- -- -- -- --)
+
+    por         xmm1, XMMWORD [wk(0)]     ; xmm1=(-1  0  1  2  3  4  5  6)
+    por         xmm2, XMMWORD [wk(2)]     ; xmm2=( 9 10 11 12 13 14 15 16)
+
+    movdqa      XMMWORD [wk(0)], xmm4
+
+    pmullw      xmm7, [GOTOFF(ebx,PW_THREE)]
+    pmullw      xmm3, [GOTOFF(ebx,PW_THREE)]
+    paddw       xmm1, [GOTOFF(ebx,PW_EIGHT)]
+    paddw       xmm5, [GOTOFF(ebx,PW_EIGHT)]
+    paddw       xmm0, [GOTOFF(ebx,PW_SEVEN)]
+    paddw       xmm2, [GOTOFF(ebx,PW_SEVEN)]
+
+    paddw       xmm1, xmm7
+    paddw       xmm5, xmm3
+    psrlw       xmm1, 4                 ; xmm1=Out0LE=( 0  2  4  6  8 10 12 14)
+    psrlw       xmm5, 4                 ; xmm5=Out0HE=(16 18 20 22 24 26 28 30)
+    paddw       xmm0, xmm7
+    paddw       xmm2, xmm3
+    psrlw       xmm0, 4                 ; xmm0=Out0LO=( 1  3  5  7  9 11 13 15)
+    psrlw       xmm2, 4                 ; xmm2=Out0HO=(17 19 21 23 25 27 29 31)
+
+    psllw       xmm0, BYTE_BIT
+    psllw       xmm2, BYTE_BIT
+    por         xmm1, xmm0              ; xmm1=Out0L=( 0  1  2 ... 13 14 15)
+    por         xmm5, xmm2              ; xmm5=Out0H=(16 17 18 ... 29 30 31)
+
+    movdqa      XMMWORD [edx+0*SIZEOF_XMMWORD], xmm1
+    movdqa      XMMWORD [edx+1*SIZEOF_XMMWORD], xmm5
+
+    ; -- process the lower row
+
+    movdqa      xmm6, XMMWORD [edi+0*SIZEOF_XMMWORD]
+    movdqa      xmm4, XMMWORD [edi+1*SIZEOF_XMMWORD]
+
+    movdqa      xmm7, xmm6                ; xmm6=Int1L=( 0  1  2  3  4  5  6  7)
+    movdqa      xmm3, xmm4                ; xmm4=Int1H=( 8  9 10 11 12 13 14 15)
+    psrldq      xmm7, 2                   ; xmm7=( 1  2  3  4  5  6  7 --)
+    pslldq      xmm3, (SIZEOF_XMMWORD-2)  ; xmm3=(-- -- -- -- -- -- --  8)
+    movdqa      xmm0, xmm6
+    movdqa      xmm2, xmm4
+    psrldq      xmm0, (SIZEOF_XMMWORD-2)  ; xmm0=( 7 -- -- -- -- -- -- --)
+    pslldq      xmm2, 2                   ; xmm2=(--  8  9 10 11 12 13 14)
+
+    por         xmm7, xmm3                ; xmm7=( 1  2  3  4  5  6  7  8)
+    por         xmm0, xmm2                ; xmm0=( 7  8  9 10 11 12 13 14)
+
+    movdqa      xmm1, xmm6
+    movdqa      xmm5, xmm4
+    pslldq      xmm1, 2                   ; xmm1=(--  0  1  2  3  4  5  6)
+    psrldq      xmm5, 2                   ; xmm5=( 9 10 11 12 13 14 15 --)
+    movdqa      xmm3, xmm4
+    psrldq      xmm3, (SIZEOF_XMMWORD-2)  ; xmm3=(15 -- -- -- -- -- -- --)
+
+    por         xmm1, XMMWORD [wk(1)]     ; xmm1=(-1  0  1  2  3  4  5  6)
+    por         xmm5, XMMWORD [wk(3)]     ; xmm5=( 9 10 11 12 13 14 15 16)
+
+    movdqa      XMMWORD [wk(1)], xmm3
+
+    pmullw      xmm6, [GOTOFF(ebx,PW_THREE)]
+    pmullw      xmm4, [GOTOFF(ebx,PW_THREE)]
+    paddw       xmm1, [GOTOFF(ebx,PW_EIGHT)]
+    paddw       xmm0, [GOTOFF(ebx,PW_EIGHT)]
+    paddw       xmm7, [GOTOFF(ebx,PW_SEVEN)]
+    paddw       xmm5, [GOTOFF(ebx,PW_SEVEN)]
+
+    paddw       xmm1, xmm6
+    paddw       xmm0, xmm4
+    psrlw       xmm1, 4                 ; xmm1=Out1LE=( 0  2  4  6  8 10 12 14)
+    psrlw       xmm0, 4                 ; xmm0=Out1HE=(16 18 20 22 24 26 28 30)
+    paddw       xmm7, xmm6
+    paddw       xmm5, xmm4
+    psrlw       xmm7, 4                 ; xmm7=Out1LO=( 1  3  5  7  9 11 13 15)
+    psrlw       xmm5, 4                 ; xmm5=Out1HO=(17 19 21 23 25 27 29 31)
+
+    psllw       xmm7, BYTE_BIT
+    psllw       xmm5, BYTE_BIT
+    por         xmm1, xmm7              ; xmm1=Out1L=( 0  1  2 ... 13 14 15)
+    por         xmm0, xmm5              ; xmm0=Out1H=(16 17 18 ... 29 30 31)
+
+    movdqa      XMMWORD [edi+0*SIZEOF_XMMWORD], xmm1
+    movdqa      XMMWORD [edi+1*SIZEOF_XMMWORD], xmm0
+
+    poppic      ebx
+
+    sub         eax, byte SIZEOF_XMMWORD
+    add         ecx, byte 1*SIZEOF_XMMWORD  ; inptr1(above)
+    add         ebx, byte 1*SIZEOF_XMMWORD  ; inptr0
+    add         esi, byte 1*SIZEOF_XMMWORD  ; inptr1(below)
+    add         edx, byte 2*SIZEOF_XMMWORD  ; outptr0
+    add         edi, byte 2*SIZEOF_XMMWORD  ; outptr1
+    cmp         eax, byte SIZEOF_XMMWORD
+    ja          near .columnloop
+    test        eax, eax
+    jnz         near .columnloop_last
+
+    pop         esi
+    pop         edi
+    pop         ecx
+    pop         eax
+
+    add         esi, byte 1*SIZEOF_JSAMPROW  ; input_data
+    add         edi, byte 2*SIZEOF_JSAMPROW  ; output_data
+    sub         ecx, byte 2                  ; rowctr
+    jg          near .rowloop
+
+.return:
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
+; It's still a box filter.
+;
+; GLOBAL(void)
+; jsimd_h2v1_upsample_sse2(int max_v_samp_factor, JDIMENSION output_width,
+;                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+;
+
+%define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
+%define output_width(b)     (b) + 12    ; JDIMENSION output_width
+%define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
+%define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v1_upsample_sse2)
+
+EXTN(jsimd_h2v1_upsample_sse2):
+    push        ebp
+    mov         ebp, esp
+;   push        ebx                     ; unused
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    mov         edx, JDIMENSION [output_width(ebp)]
+    add         edx, byte (2*SIZEOF_XMMWORD)-1
+    and         edx, byte -(2*SIZEOF_XMMWORD)
+    jz          short .return
+
+    mov         ecx, INT [max_v_samp(ebp)]  ; rowctr
+    test        ecx, ecx
+    jz          short .return
+
+    mov         esi, JSAMPARRAY [input_data(ebp)]    ; input_data
+    mov         edi, POINTER [output_data_ptr(ebp)]
+    mov         edi, JSAMPARRAY [edi]                ; output_data
+    alignx      16, 7
+.rowloop:
+    push        edi
+    push        esi
+
+    mov         esi, JSAMPROW [esi]     ; inptr
+    mov         edi, JSAMPROW [edi]     ; outptr
+    mov         eax, edx                ; colctr
+    alignx      16, 7
+.columnloop:
+
+    movdqa      xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
+
+    movdqa      xmm1, xmm0
+    punpcklbw   xmm0, xmm0
+    punpckhbw   xmm1, xmm1
+
+    movdqa      XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
+    movdqa      XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1
+
+    sub         eax, byte 2*SIZEOF_XMMWORD
+    jz          short .nextrow
+
+    movdqa      xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD]
+
+    movdqa      xmm3, xmm2
+    punpcklbw   xmm2, xmm2
+    punpckhbw   xmm3, xmm3
+
+    movdqa      XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2
+    movdqa      XMMWORD [edi+3*SIZEOF_XMMWORD], xmm3
+
+    sub         eax, byte 2*SIZEOF_XMMWORD
+    jz          short .nextrow
+
+    add         esi, byte 2*SIZEOF_XMMWORD  ; inptr
+    add         edi, byte 4*SIZEOF_XMMWORD  ; outptr
+    jmp         short .columnloop
+    alignx      16, 7
+
+.nextrow:
+    pop         esi
+    pop         edi
+
+    add         esi, byte SIZEOF_JSAMPROW  ; input_data
+    add         edi, byte SIZEOF_JSAMPROW  ; output_data
+    dec         ecx                        ; rowctr
+    jg          short .rowloop
+
+.return:
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+;   pop         ebx                     ; unused
+    pop         ebp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
+; It's still a box filter.
+;
+; GLOBAL(void)
+; jsimd_h2v2_upsample_sse2(int max_v_samp_factor, JDIMENSION output_width,
+;                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+;
+
+%define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
+%define output_width(b)     (b) + 12    ; JDIMENSION output_width
+%define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
+%define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v2_upsample_sse2)
+
+EXTN(jsimd_h2v2_upsample_sse2):
+    push        ebp
+    mov         ebp, esp
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    mov         edx, JDIMENSION [output_width(ebp)]
+    add         edx, byte (2*SIZEOF_XMMWORD)-1
+    and         edx, byte -(2*SIZEOF_XMMWORD)
+    jz          near .return
+
+    mov         ecx, INT [max_v_samp(ebp)]  ; rowctr
+    test        ecx, ecx
+    jz          near .return
+
+    mov         esi, JSAMPARRAY [input_data(ebp)]    ; input_data
+    mov         edi, POINTER [output_data_ptr(ebp)]
+    mov         edi, JSAMPARRAY [edi]                ; output_data
+    alignx      16, 7
+.rowloop:
+    push        edi
+    push        esi
+
+    mov         esi, JSAMPROW [esi]                    ; inptr
+    mov         ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]  ; outptr0
+    mov         edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]  ; outptr1
+    mov         eax, edx                               ; colctr
+    alignx      16, 7
+.columnloop:
+
+    movdqa      xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
+
+    movdqa      xmm1, xmm0
+    punpcklbw   xmm0, xmm0
+    punpckhbw   xmm1, xmm1
+
+    movdqa      XMMWORD [ebx+0*SIZEOF_XMMWORD], xmm0
+    movdqa      XMMWORD [ebx+1*SIZEOF_XMMWORD], xmm1
+    movdqa      XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
+    movdqa      XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1
+
+    sub         eax, byte 2*SIZEOF_XMMWORD
+    jz          short .nextrow
+
+    movdqa      xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD]
+
+    movdqa      xmm3, xmm2
+    punpcklbw   xmm2, xmm2
+    punpckhbw   xmm3, xmm3
+
+    movdqa      XMMWORD [ebx+2*SIZEOF_XMMWORD], xmm2
+    movdqa      XMMWORD [ebx+3*SIZEOF_XMMWORD], xmm3
+    movdqa      XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2
+    movdqa      XMMWORD [edi+3*SIZEOF_XMMWORD], xmm3
+
+    sub         eax, byte 2*SIZEOF_XMMWORD
+    jz          short .nextrow
+
+    add         esi, byte 2*SIZEOF_XMMWORD  ; inptr
+    add         ebx, byte 4*SIZEOF_XMMWORD  ; outptr0
+    add         edi, byte 4*SIZEOF_XMMWORD  ; outptr1
+    jmp         short .columnloop
+    alignx      16, 7
+
+.nextrow:
+    pop         esi
+    pop         edi
+
+    add         esi, byte 1*SIZEOF_JSAMPROW  ; input_data
+    add         edi, byte 2*SIZEOF_JSAMPROW  ; output_data
+    sub         ecx, byte 2                  ; rowctr
+    jg          short .rowloop
+
+.return:
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/i386/jfdctflt-3dn.asm b/media/libjpeg/simd/i386/jfdctflt-3dn.asm
new file mode 100644
index 0000000000..322ab16325
--- /dev/null
+++ b/media/libjpeg/simd/i386/jfdctflt-3dn.asm
@@ -0,0 +1,318 @@
+;
+; jfdctflt.asm - floating-point FDCT (3DNow!)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a floating-point implementation of the forward DCT
+; (Discrete Cosine Transform). The following code is based directly on
+; the IJG's original jfdctflt.c; see the jfdctflt.c for more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_fdct_float_3dnow)
+
+EXTN(jconst_fdct_float_3dnow):
+
+PD_0_382 times 2 dd 0.382683432365089771728460
+PD_0_707 times 2 dd 0.707106781186547524400844
+PD_0_541 times 2 dd 0.541196100146196984399723
+PD_1_306 times 2 dd 1.306562964876376527856643
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Perform the forward DCT on one block of samples.
+;
+; GLOBAL(void)
+; jsimd_fdct_float_3dnow(FAST_FLOAT *data)
+;
+
+%define data(b)       (b) + 8           ; FAST_FLOAT *data
+
+%define original_ebp  ebp + 0
+%define wk(i)         ebp - (WK_NUM - (i)) * SIZEOF_MMWORD  ; mmword wk[WK_NUM]
+%define WK_NUM        2
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_fdct_float_3dnow)
+
+EXTN(jsimd_fdct_float_3dnow):
+    push        ebp
+    mov         eax, esp                    ; eax = original ebp
+    sub         esp, byte 4
+    and         esp, byte (-SIZEOF_MMWORD)  ; align to 64 bits
+    mov         [esp], eax
+    mov         ebp, esp                    ; ebp = aligned ebp
+    lea         esp, [wk(0)]
+    pushpic     ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+;   push        esi                     ; unused
+;   push        edi                     ; unused
+
+    get_GOT     ebx                     ; get GOT address
+
+    ; ---- Pass 1: process rows.
+
+    mov         edx, POINTER [data(eax)]  ; (FAST_FLOAT *)
+    mov         ecx, DCTSIZE/2
+    alignx      16, 7
+.rowloop:
+
+    movq        mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
+    movq        mm1, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
+    movq        mm2, MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)]
+    movq        mm3, MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)]
+
+    ; mm0=(00 01), mm1=(10 11), mm2=(06 07), mm3=(16 17)
+
+    movq        mm4, mm0                ; transpose coefficients
+    punpckldq   mm0, mm1                ; mm0=(00 10)=data0
+    punpckhdq   mm4, mm1                ; mm4=(01 11)=data1
+    movq        mm5, mm2                ; transpose coefficients
+    punpckldq   mm2, mm3                ; mm2=(06 16)=data6
+    punpckhdq   mm5, mm3                ; mm5=(07 17)=data7
+
+    movq        mm6, mm4
+    movq        mm7, mm0
+    pfsub       mm4, mm2                ; mm4=data1-data6=tmp6
+    pfsub       mm0, mm5                ; mm0=data0-data7=tmp7
+    pfadd       mm6, mm2                ; mm6=data1+data6=tmp1
+    pfadd       mm7, mm5                ; mm7=data0+data7=tmp0
+
+    movq        mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
+    movq        mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
+    movq        mm2, MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)]
+    movq        mm5, MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)]
+
+    ; mm1=(02 03), mm3=(12 13), mm2=(04 05), mm5=(14 15)
+
+    movq        MMWORD [wk(0)], mm4     ; wk(0)=tmp6
+    movq        MMWORD [wk(1)], mm0     ; wk(1)=tmp7
+
+    movq        mm4, mm1                ; transpose coefficients
+    punpckldq   mm1, mm3                ; mm1=(02 12)=data2
+    punpckhdq   mm4, mm3                ; mm4=(03 13)=data3
+    movq        mm0, mm2                ; transpose coefficients
+    punpckldq   mm2, mm5                ; mm2=(04 14)=data4
+    punpckhdq   mm0, mm5                ; mm0=(05 15)=data5
+
+    movq        mm3, mm4
+    movq        mm5, mm1
+    pfadd       mm4, mm2                ; mm4=data3+data4=tmp3
+    pfadd       mm1, mm0                ; mm1=data2+data5=tmp2
+    pfsub       mm3, mm2                ; mm3=data3-data4=tmp4
+    pfsub       mm5, mm0                ; mm5=data2-data5=tmp5
+
+    ; -- Even part
+
+    movq        mm2, mm7
+    movq        mm0, mm6
+    pfsub       mm7, mm4                ; mm7=tmp13
+    pfsub       mm6, mm1                ; mm6=tmp12
+    pfadd       mm2, mm4                ; mm2=tmp10
+    pfadd       mm0, mm1                ; mm0=tmp11
+
+    pfadd       mm6, mm7
+    pfmul       mm6, [GOTOFF(ebx,PD_0_707)]  ; mm6=z1
+
+    movq        mm4, mm2
+    movq        mm1, mm7
+    pfsub       mm2, mm0                ; mm2=data4
+    pfsub       mm7, mm6                ; mm7=data6
+    pfadd       mm4, mm0                ; mm4=data0
+    pfadd       mm1, mm6                ; mm1=data2
+
+    movq        MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)], mm2
+    movq        MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)], mm7
+    movq        MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], mm4
+    movq        MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)], mm1
+
+    ; -- Odd part
+
+    movq        mm0, MMWORD [wk(0)]     ; mm0=tmp6
+    movq        mm6, MMWORD [wk(1)]     ; mm6=tmp7
+
+    pfadd       mm3, mm5                ; mm3=tmp10
+    pfadd       mm5, mm0                ; mm5=tmp11
+    pfadd       mm0, mm6                ; mm0=tmp12, mm6=tmp7
+
+    pfmul       mm5, [GOTOFF(ebx,PD_0_707)]  ; mm5=z3
+
+    movq        mm2, mm3                     ; mm2=tmp10
+    pfsub       mm3, mm0
+    pfmul       mm3, [GOTOFF(ebx,PD_0_382)]  ; mm3=z5
+    pfmul       mm2, [GOTOFF(ebx,PD_0_541)]  ; mm2=MULTIPLY(tmp10,FIX_0_54119610)
+    pfmul       mm0, [GOTOFF(ebx,PD_1_306)]  ; mm0=MULTIPLY(tmp12,FIX_1_30656296)
+    pfadd       mm2, mm3                     ; mm2=z2
+    pfadd       mm0, mm3                     ; mm0=z4
+
+    movq        mm7, mm6
+    pfsub       mm6, mm5                ; mm6=z13
+    pfadd       mm7, mm5                ; mm7=z11
+
+    movq        mm4, mm6
+    movq        mm1, mm7
+    pfsub       mm6, mm2                ; mm6=data3
+    pfsub       mm7, mm0                ; mm7=data7
+    pfadd       mm4, mm2                ; mm4=data5
+    pfadd       mm1, mm0                ; mm1=data1
+
+    movq        MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)], mm6
+    movq        MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)], mm7
+    movq        MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)], mm4
+    movq        MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], mm1
+
+    add         edx, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
+    dec         ecx
+    jnz         near .rowloop
+
+    ; ---- Pass 2: process columns.
+
+    mov         edx, POINTER [data(eax)]  ; (FAST_FLOAT *)
+    mov         ecx, DCTSIZE/2
+    alignx      16, 7
+.columnloop:
+
+    movq        mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
+    movq        mm1, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
+    movq        mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)]
+    movq        mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)]
+
+    ; mm0=(00 10), mm1=(01 11), mm2=(60 70), mm3=(61 71)
+
+    movq        mm4, mm0                ; transpose coefficients
+    punpckldq   mm0, mm1                ; mm0=(00 01)=data0
+    punpckhdq   mm4, mm1                ; mm4=(10 11)=data1
+    movq        mm5, mm2                ; transpose coefficients
+    punpckldq   mm2, mm3                ; mm2=(60 61)=data6
+    punpckhdq   mm5, mm3                ; mm5=(70 71)=data7
+
+    movq        mm6, mm4
+    movq        mm7, mm0
+    pfsub       mm4, mm2                ; mm4=data1-data6=tmp6
+    pfsub       mm0, mm5                ; mm0=data0-data7=tmp7
+    pfadd       mm6, mm2                ; mm6=data1+data6=tmp1
+    pfadd       mm7, mm5                ; mm7=data0+data7=tmp0
+
+    movq        mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)]
+    movq        mm3, MMWORD [MMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)]
+    movq        mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)]
+    movq        mm5, MMWORD [MMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)]
+
+    ; mm1=(20 30), mm3=(21 31), mm2=(40 50), mm5=(41 51)
+
+    movq        MMWORD [wk(0)], mm4     ; wk(0)=tmp6
+    movq        MMWORD [wk(1)], mm0     ; wk(1)=tmp7
+
+    movq        mm4, mm1                ; transpose coefficients
+    punpckldq   mm1, mm3                ; mm1=(20 21)=data2
+    punpckhdq   mm4, mm3                ; mm4=(30 31)=data3
+    movq        mm0, mm2                ; transpose coefficients
+    punpckldq   mm2, mm5                ; mm2=(40 41)=data4
+    punpckhdq   mm0, mm5                ; mm0=(50 51)=data5
+
+    movq        mm3, mm4
+    movq        mm5, mm1
+    pfadd       mm4, mm2                ; mm4=data3+data4=tmp3
+    pfadd       mm1, mm0                ; mm1=data2+data5=tmp2
+    pfsub       mm3, mm2                ; mm3=data3-data4=tmp4
+    pfsub       mm5, mm0                ; mm5=data2-data5=tmp5
+
+    ; -- Even part
+
+    movq        mm2, mm7
+    movq        mm0, mm6
+    pfsub       mm7, mm4                ; mm7=tmp13
+    pfsub       mm6, mm1                ; mm6=tmp12
+    pfadd       mm2, mm4                ; mm2=tmp10
+    pfadd       mm0, mm1                ; mm0=tmp11
+
+    pfadd       mm6, mm7
+    pfmul       mm6, [GOTOFF(ebx,PD_0_707)]  ; mm6=z1
+
+    movq        mm4, mm2
+    movq        mm1, mm7
+    pfsub       mm2, mm0                ; mm2=data4
+    pfsub       mm7, mm6                ; mm7=data6
+    pfadd       mm4, mm0                ; mm4=data0
+    pfadd       mm1, mm6                ; mm1=data2
+
+    movq        MMWORD [MMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)], mm2
+    movq        MMWORD [MMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)], mm7
+    movq        MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], mm4
+    movq        MMWORD [MMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], mm1
+
+    ; -- Odd part
+
+    movq        mm0, MMWORD [wk(0)]     ; mm0=tmp6
+    movq        mm6, MMWORD [wk(1)]     ; mm6=tmp7
+
+    pfadd       mm3, mm5                ; mm3=tmp10
+    pfadd       mm5, mm0                ; mm5=tmp11
+    pfadd       mm0, mm6                ; mm0=tmp12, mm6=tmp7
+
+    pfmul       mm5, [GOTOFF(ebx,PD_0_707)]  ; mm5=z3
+
+    movq        mm2, mm3                     ; mm2=tmp10
+    pfsub       mm3, mm0
+    pfmul       mm3, [GOTOFF(ebx,PD_0_382)]  ; mm3=z5
+    pfmul       mm2, [GOTOFF(ebx,PD_0_541)]  ; mm2=MULTIPLY(tmp10,FIX_0_54119610)
+    pfmul       mm0, [GOTOFF(ebx,PD_1_306)]  ; mm0=MULTIPLY(tmp12,FIX_1_30656296)
+    pfadd       mm2, mm3                     ; mm2=z2
+    pfadd       mm0, mm3                     ; mm0=z4
+
+    movq        mm7, mm6
+    pfsub       mm6, mm5                ; mm6=z13
+    pfadd       mm7, mm5                ; mm7=z11
+
+    movq        mm4, mm6
+    movq        mm1, mm7
+    pfsub       mm6, mm2                ; mm6=data3
+    pfsub       mm7, mm0                ; mm7=data7
+    pfadd       mm4, mm2                ; mm4=data5
+    pfadd       mm1, mm0                ; mm1=data1
+
+    movq        MMWORD [MMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], mm6
+    movq        MMWORD [MMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)], mm7
+    movq        MMWORD [MMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)], mm4
+    movq        MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], mm1
+
+    add         edx, byte 2*SIZEOF_FAST_FLOAT
+    dec         ecx
+    jnz         near .columnloop
+
+    femms                               ; empty MMX/3DNow! state
+
+;   pop         edi                     ; unused
+;   pop         esi                     ; unused
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    poppic      ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/i386/jfdctflt-sse.asm b/media/libjpeg/simd/i386/jfdctflt-sse.asm
new file mode 100644
index 0000000000..86952c6499
--- /dev/null
+++ b/media/libjpeg/simd/i386/jfdctflt-sse.asm
@@ -0,0 +1,369 @@
+;
+; jfdctflt.asm - floating-point FDCT (SSE)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a floating-point implementation of the forward DCT
+; (Discrete Cosine Transform). The following code is based directly on
+; the IJG's original jfdctflt.c; see the jfdctflt.c for more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%macro  unpcklps2 2  ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
+    shufps      %1, %2, 0x44
+%endmacro
+
+%macro  unpckhps2 2  ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
+    shufps      %1, %2, 0xEE
+%endmacro
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_fdct_float_sse)
+
+EXTN(jconst_fdct_float_sse):
+
+PD_0_382 times 4 dd 0.382683432365089771728460
+PD_0_707 times 4 dd 0.707106781186547524400844
+PD_0_541 times 4 dd 0.541196100146196984399723
+PD_1_306 times 4 dd 1.306562964876376527856643
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Perform the forward DCT on one block of samples.
+;
+; GLOBAL(void)
+; jsimd_fdct_float_sse(FAST_FLOAT *data)
+;
+
+%define data(b)       (b) + 8           ; FAST_FLOAT *data
+
+%define original_ebp  ebp + 0
+%define wk(i)         ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+                                        ; xmmword wk[WK_NUM]
+%define WK_NUM        2
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_fdct_float_sse)
+
+EXTN(jsimd_fdct_float_sse):
+    push        ebp
+    mov         eax, esp                     ; eax = original ebp
+    sub         esp, byte 4
+    and         esp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [esp], eax
+    mov         ebp, esp                     ; ebp = aligned ebp
+    lea         esp, [wk(0)]
+    pushpic     ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+;   push        esi                     ; unused
+;   push        edi                     ; unused
+
+    get_GOT     ebx                     ; get GOT address
+
+    ; ---- Pass 1: process rows.
+
+    mov         edx, POINTER [data(eax)]  ; (FAST_FLOAT *)
+    mov         ecx, DCTSIZE/4
+    alignx      16, 7
+.rowloop:
+
+    movaps      xmm0, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)]
+    movaps      xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)]
+    movaps      xmm2, XMMWORD [XMMBLOCK(2,1,edx,SIZEOF_FAST_FLOAT)]
+    movaps      xmm3, XMMWORD [XMMBLOCK(3,1,edx,SIZEOF_FAST_FLOAT)]
+
+    ; xmm0=(20 21 22 23), xmm2=(24 25 26 27)
+    ; xmm1=(30 31 32 33), xmm3=(34 35 36 37)
+
+    movaps      xmm4, xmm0              ; transpose coefficients(phase 1)
+    unpcklps    xmm0, xmm1              ; xmm0=(20 30 21 31)
+    unpckhps    xmm4, xmm1              ; xmm4=(22 32 23 33)
+    movaps      xmm5, xmm2              ; transpose coefficients(phase 1)
+    unpcklps    xmm2, xmm3              ; xmm2=(24 34 25 35)
+    unpckhps    xmm5, xmm3              ; xmm5=(26 36 27 37)
+
+    movaps      xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
+    movaps      xmm7, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
+    movaps      xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
+    movaps      xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
+
+    ; xmm6=(00 01 02 03), xmm1=(04 05 06 07)
+    ; xmm7=(10 11 12 13), xmm3=(14 15 16 17)
+
+    movaps      XMMWORD [wk(0)], xmm4   ; wk(0)=(22 32 23 33)
+    movaps      XMMWORD [wk(1)], xmm2   ; wk(1)=(24 34 25 35)
+
+    movaps      xmm4, xmm6              ; transpose coefficients(phase 1)
+    unpcklps    xmm6, xmm7              ; xmm6=(00 10 01 11)
+    unpckhps    xmm4, xmm7              ; xmm4=(02 12 03 13)
+    movaps      xmm2, xmm1              ; transpose coefficients(phase 1)
+    unpcklps    xmm1, xmm3              ; xmm1=(04 14 05 15)
+    unpckhps    xmm2, xmm3              ; xmm2=(06 16 07 17)
+
+    movaps      xmm7, xmm6              ; transpose coefficients(phase 2)
+    unpcklps2   xmm6, xmm0              ; xmm6=(00 10 20 30)=data0
+    unpckhps2   xmm7, xmm0              ; xmm7=(01 11 21 31)=data1
+    movaps      xmm3, xmm2              ; transpose coefficients(phase 2)
+    unpcklps2   xmm2, xmm5              ; xmm2=(06 16 26 36)=data6
+    unpckhps2   xmm3, xmm5              ; xmm3=(07 17 27 37)=data7
+
+    movaps      xmm0, xmm7
+    movaps      xmm5, xmm6
+    subps       xmm7, xmm2              ; xmm7=data1-data6=tmp6
+    subps       xmm6, xmm3              ; xmm6=data0-data7=tmp7
+    addps       xmm0, xmm2              ; xmm0=data1+data6=tmp1
+    addps       xmm5, xmm3              ; xmm5=data0+data7=tmp0
+
+    movaps      xmm2, XMMWORD [wk(0)]   ; xmm2=(22 32 23 33)
+    movaps      xmm3, XMMWORD [wk(1)]   ; xmm3=(24 34 25 35)
+    movaps      XMMWORD [wk(0)], xmm7   ; wk(0)=tmp6
+    movaps      XMMWORD [wk(1)], xmm6   ; wk(1)=tmp7
+
+    movaps      xmm7, xmm4              ; transpose coefficients(phase 2)
+    unpcklps2   xmm4, xmm2              ; xmm4=(02 12 22 32)=data2
+    unpckhps2   xmm7, xmm2              ; xmm7=(03 13 23 33)=data3
+    movaps      xmm6, xmm1              ; transpose coefficients(phase 2)
+    unpcklps2   xmm1, xmm3              ; xmm1=(04 14 24 34)=data4
+    unpckhps2   xmm6, xmm3              ; xmm6=(05 15 25 35)=data5
+
+    movaps      xmm2, xmm7
+    movaps      xmm3, xmm4
+    addps       xmm7, xmm1              ; xmm7=data3+data4=tmp3
+    addps       xmm4, xmm6              ; xmm4=data2+data5=tmp2
+    subps       xmm2, xmm1              ; xmm2=data3-data4=tmp4
+    subps       xmm3, xmm6              ; xmm3=data2-data5=tmp5
+
+    ; -- Even part
+
+    movaps      xmm1, xmm5
+    movaps      xmm6, xmm0
+    subps       xmm5, xmm7              ; xmm5=tmp13
+    subps       xmm0, xmm4              ; xmm0=tmp12
+    addps       xmm1, xmm7              ; xmm1=tmp10
+    addps       xmm6, xmm4              ; xmm6=tmp11
+
+    addps       xmm0, xmm5
+    mulps       xmm0, [GOTOFF(ebx,PD_0_707)]  ; xmm0=z1
+
+    movaps      xmm7, xmm1
+    movaps      xmm4, xmm5
+    subps       xmm1, xmm6              ; xmm1=data4
+    subps       xmm5, xmm0              ; xmm5=data6
+    addps       xmm7, xmm6              ; xmm7=data0
+    addps       xmm4, xmm0              ; xmm4=data2
+
+    movaps      XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)], xmm1
+    movaps      XMMWORD [XMMBLOCK(2,1,edx,SIZEOF_FAST_FLOAT)], xmm5
+    movaps      XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], xmm7
+    movaps      XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], xmm4
+
+    ; -- Odd part
+
+    movaps      xmm6, XMMWORD [wk(0)]   ; xmm6=tmp6
+    movaps      xmm0, XMMWORD [wk(1)]   ; xmm0=tmp7
+
+    addps       xmm2, xmm3              ; xmm2=tmp10
+    addps       xmm3, xmm6              ; xmm3=tmp11
+    addps       xmm6, xmm0              ; xmm6=tmp12, xmm0=tmp7
+
+    mulps       xmm3, [GOTOFF(ebx,PD_0_707)]  ; xmm3=z3
+
+    movaps      xmm1, xmm2                    ; xmm1=tmp10
+    subps       xmm2, xmm6
+    mulps       xmm2, [GOTOFF(ebx,PD_0_382)]  ; xmm2=z5
+    mulps       xmm1, [GOTOFF(ebx,PD_0_541)]  ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
+    mulps       xmm6, [GOTOFF(ebx,PD_1_306)]  ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
+    addps       xmm1, xmm2                    ; xmm1=z2
+    addps       xmm6, xmm2                    ; xmm6=z4
+
+    movaps      xmm5, xmm0
+    subps       xmm0, xmm3              ; xmm0=z13
+    addps       xmm5, xmm3              ; xmm5=z11
+
+    movaps      xmm7, xmm0
+    movaps      xmm4, xmm5
+    subps       xmm0, xmm1              ; xmm0=data3
+    subps       xmm5, xmm6              ; xmm5=data7
+    addps       xmm7, xmm1              ; xmm7=data5
+    addps       xmm4, xmm6              ; xmm4=data1
+
+    movaps      XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], xmm0
+    movaps      XMMWORD [XMMBLOCK(3,1,edx,SIZEOF_FAST_FLOAT)], xmm5
+    movaps      XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)], xmm7
+    movaps      XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], xmm4
+
+    add         edx, 4*DCTSIZE*SIZEOF_FAST_FLOAT
+    dec         ecx
+    jnz         near .rowloop
+
+    ; ---- Pass 2: process columns.
+
+    mov         edx, POINTER [data(eax)]  ; (FAST_FLOAT *)
+    mov         ecx, DCTSIZE/4
+    alignx      16, 7
+.columnloop:
+
+    movaps      xmm0, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)]
+    movaps      xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)]
+    movaps      xmm2, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)]
+    movaps      xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)]
+
+    ; xmm0=(02 12 22 32), xmm2=(42 52 62 72)
+    ; xmm1=(03 13 23 33), xmm3=(43 53 63 73)
+
+    movaps      xmm4, xmm0              ; transpose coefficients(phase 1)
+    unpcklps    xmm0, xmm1              ; xmm0=(02 03 12 13)
+    unpckhps    xmm4, xmm1              ; xmm4=(22 23 32 33)
+    movaps      xmm5, xmm2              ; transpose coefficients(phase 1)
+    unpcklps    xmm2, xmm3              ; xmm2=(42 43 52 53)
+    unpckhps    xmm5, xmm3              ; xmm5=(62 63 72 73)
+
+    movaps      xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
+    movaps      xmm7, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
+    movaps      xmm1, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)]
+    movaps      xmm3, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)]
+
+    ; xmm6=(00 10 20 30), xmm1=(40 50 60 70)
+    ; xmm7=(01 11 21 31), xmm3=(41 51 61 71)
+
+    movaps      XMMWORD [wk(0)], xmm4   ; wk(0)=(22 23 32 33)
+    movaps      XMMWORD [wk(1)], xmm2   ; wk(1)=(42 43 52 53)
+
+    movaps      xmm4, xmm6              ; transpose coefficients(phase 1)
+    unpcklps    xmm6, xmm7              ; xmm6=(00 01 10 11)
+    unpckhps    xmm4, xmm7              ; xmm4=(20 21 30 31)
+    movaps      xmm2, xmm1              ; transpose coefficients(phase 1)
+    unpcklps    xmm1, xmm3              ; xmm1=(40 41 50 51)
+    unpckhps    xmm2, xmm3              ; xmm2=(60 61 70 71)
+
+    movaps      xmm7, xmm6              ; transpose coefficients(phase 2)
+    unpcklps2   xmm6, xmm0              ; xmm6=(00 01 02 03)=data0
+    unpckhps2   xmm7, xmm0              ; xmm7=(10 11 12 13)=data1
+    movaps      xmm3, xmm2              ; transpose coefficients(phase 2)
+    unpcklps2   xmm2, xmm5              ; xmm2=(60 61 62 63)=data6
+    unpckhps2   xmm3, xmm5              ; xmm3=(70 71 72 73)=data7
+
+    movaps      xmm0, xmm7
+    movaps      xmm5, xmm6
+    subps       xmm7, xmm2              ; xmm7=data1-data6=tmp6
+    subps       xmm6, xmm3              ; xmm6=data0-data7=tmp7
+    addps       xmm0, xmm2              ; xmm0=data1+data6=tmp1
+    addps       xmm5, xmm3              ; xmm5=data0+data7=tmp0
+
+    movaps      xmm2, XMMWORD [wk(0)]   ; xmm2=(22 23 32 33)
+    movaps      xmm3, XMMWORD [wk(1)]   ; xmm3=(42 43 52 53)
+    movaps      XMMWORD [wk(0)], xmm7   ; wk(0)=tmp6
+    movaps      XMMWORD [wk(1)], xmm6   ; wk(1)=tmp7
+
+    movaps      xmm7, xmm4              ; transpose coefficients(phase 2)
+    unpcklps2   xmm4, xmm2              ; xmm4=(20 21 22 23)=data2
+    unpckhps2   xmm7, xmm2              ; xmm7=(30 31 32 33)=data3
+    movaps      xmm6, xmm1              ; transpose coefficients(phase 2)
+    unpcklps2   xmm1, xmm3              ; xmm1=(40 41 42 43)=data4
+    unpckhps2   xmm6, xmm3              ; xmm6=(50 51 52 53)=data5
+
+    movaps      xmm2, xmm7
+    movaps      xmm3, xmm4
+    addps       xmm7, xmm1              ; xmm7=data3+data4=tmp3
+    addps       xmm4, xmm6              ; xmm4=data2+data5=tmp2
+    subps       xmm2, xmm1              ; xmm2=data3-data4=tmp4
+    subps       xmm3, xmm6              ; xmm3=data2-data5=tmp5
+
+    ; -- Even part
+
+    movaps      xmm1, xmm5
+    movaps      xmm6, xmm0
+    subps       xmm5, xmm7              ; xmm5=tmp13
+    subps       xmm0, xmm4              ; xmm0=tmp12
+    addps       xmm1, xmm7              ; xmm1=tmp10
+    addps       xmm6, xmm4              ; xmm6=tmp11
+
+    addps       xmm0, xmm5
+    mulps       xmm0, [GOTOFF(ebx,PD_0_707)]  ; xmm0=z1
+
+    movaps      xmm7, xmm1
+    movaps      xmm4, xmm5
+    subps       xmm1, xmm6              ; xmm1=data4
+    subps       xmm5, xmm0              ; xmm5=data6
+    addps       xmm7, xmm6              ; xmm7=data0
+    addps       xmm4, xmm0              ; xmm4=data2
+
+    movaps      XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)], xmm1
+    movaps      XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)], xmm5
+    movaps      XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], xmm7
+    movaps      XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], xmm4
+
+    ; -- Odd part
+
+    movaps      xmm6, XMMWORD [wk(0)]   ; xmm6=tmp6
+    movaps      xmm0, XMMWORD [wk(1)]   ; xmm0=tmp7
+
+    addps       xmm2, xmm3              ; xmm2=tmp10
+    addps       xmm3, xmm6              ; xmm3=tmp11
+    addps       xmm6, xmm0              ; xmm6=tmp12, xmm0=tmp7
+
+    mulps       xmm3, [GOTOFF(ebx,PD_0_707)]  ; xmm3=z3
+
+    movaps      xmm1, xmm2                    ; xmm1=tmp10
+    subps       xmm2, xmm6
+    mulps       xmm2, [GOTOFF(ebx,PD_0_382)]  ; xmm2=z5
+    mulps       xmm1, [GOTOFF(ebx,PD_0_541)]  ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
+    mulps       xmm6, [GOTOFF(ebx,PD_1_306)]  ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
+    addps       xmm1, xmm2                    ; xmm1=z2
+    addps       xmm6, xmm2                    ; xmm6=z4
+
+    movaps      xmm5, xmm0
+    subps       xmm0, xmm3              ; xmm0=z13
+    addps       xmm5, xmm3              ; xmm5=z11
+
+    movaps      xmm7, xmm0
+    movaps      xmm4, xmm5
+    subps       xmm0, xmm1              ; xmm0=data3
+    subps       xmm5, xmm6              ; xmm5=data7
+    addps       xmm7, xmm1              ; xmm7=data5
+    addps       xmm4, xmm6              ; xmm4=data1
+
+    movaps      XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], xmm0
+    movaps      XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)], xmm5
+    movaps      XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)], xmm7
+    movaps      XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], xmm4
+
+    add         edx, byte 4*SIZEOF_FAST_FLOAT
+    dec         ecx
+    jnz         near .columnloop
+
+;   pop         edi                     ; unused
+;   pop         esi                     ; unused
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    poppic      ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/i386/jfdctfst-mmx.asm b/media/libjpeg/simd/i386/jfdctfst-mmx.asm
new file mode 100644
index 0000000000..80645a50d7
--- /dev/null
+++ b/media/libjpeg/simd/i386/jfdctfst-mmx.asm
@@ -0,0 +1,395 @@
+;
+; jfdctfst.asm - fast integer FDCT (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a fast, not so accurate integer implementation of
+; the forward DCT (Discrete Cosine Transform). The following code is
+; based directly on the IJG's original jfdctfst.c; see the jfdctfst.c
+; for more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS  8  ; 14 is also OK.
+
+%if CONST_BITS == 8
+F_0_382 equ  98  ; FIX(0.382683433)
+F_0_541 equ 139  ; FIX(0.541196100)
+F_0_707 equ 181  ; FIX(0.707106781)
+F_1_306 equ 334  ; FIX(1.306562965)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x, n)  (((x) + (1 << ((n) - 1))) >> (n))
+F_0_382 equ DESCALE( 410903207, 30 - CONST_BITS)  ; FIX(0.382683433)
+F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS)  ; FIX(0.541196100)
+F_0_707 equ DESCALE( 759250124, 30 - CONST_BITS)  ; FIX(0.707106781)
+F_1_306 equ DESCALE(1402911301, 30 - CONST_BITS)  ; FIX(1.306562965)
+%endif
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
+; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
+
+%define PRE_MULTIPLY_SCALE_BITS  2
+%define CONST_SHIFT              (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
+
+    alignz      32
+    GLOBAL_DATA(jconst_fdct_ifast_mmx)
+
+EXTN(jconst_fdct_ifast_mmx):
+
+PW_F0707 times 4 dw F_0_707 << CONST_SHIFT
+PW_F0382 times 4 dw F_0_382 << CONST_SHIFT
+PW_F0541 times 4 dw F_0_541 << CONST_SHIFT
+PW_F1306 times 4 dw F_1_306 << CONST_SHIFT
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Perform the forward DCT on one block of samples.
+;
+; GLOBAL(void)
+; jsimd_fdct_ifast_mmx(DCTELEM *data)
+;
+
+%define data(b)       (b) + 8           ; DCTELEM *data
+
+%define original_ebp  ebp + 0
+%define wk(i)         ebp - (WK_NUM - (i)) * SIZEOF_MMWORD  ; mmword wk[WK_NUM]
+%define WK_NUM        2
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_fdct_ifast_mmx)
+
+EXTN(jsimd_fdct_ifast_mmx):
+    push        ebp
+    mov         eax, esp                    ; eax = original ebp
+    sub         esp, byte 4
+    and         esp, byte (-SIZEOF_MMWORD)  ; align to 64 bits
+    mov         [esp], eax
+    mov         ebp, esp                    ; ebp = aligned ebp
+    lea         esp, [wk(0)]
+    pushpic     ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+;   push        esi                     ; unused
+;   push        edi                     ; unused
+
+    get_GOT     ebx                     ; get GOT address
+
+    ; ---- Pass 1: process rows.
+
+    mov         edx, POINTER [data(eax)]  ; (DCTELEM *)
+    mov         ecx, DCTSIZE/4
+    alignx      16, 7
+.rowloop:
+
+    movq        mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
+    movq        mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
+    movq        mm2, MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)]
+    movq        mm3, MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)]
+
+    ; mm0=(20 21 22 23), mm2=(24 25 26 27)
+    ; mm1=(30 31 32 33), mm3=(34 35 36 37)
+
+    movq        mm4, mm0                ; transpose coefficients(phase 1)
+    punpcklwd   mm0, mm1                ; mm0=(20 30 21 31)
+    punpckhwd   mm4, mm1                ; mm4=(22 32 23 33)
+    movq        mm5, mm2                ; transpose coefficients(phase 1)
+    punpcklwd   mm2, mm3                ; mm2=(24 34 25 35)
+    punpckhwd   mm5, mm3                ; mm5=(26 36 27 37)
+
+    movq        mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
+    movq        mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
+    movq        mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)]
+    movq        mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)]
+
+    ; mm6=(00 01 02 03), mm1=(04 05 06 07)
+    ; mm7=(10 11 12 13), mm3=(14 15 16 17)
+
+    movq        MMWORD [wk(0)], mm4     ; wk(0)=(22 32 23 33)
+    movq        MMWORD [wk(1)], mm2     ; wk(1)=(24 34 25 35)
+
+    movq        mm4, mm6                ; transpose coefficients(phase 1)
+    punpcklwd   mm6, mm7                ; mm6=(00 10 01 11)
+    punpckhwd   mm4, mm7                ; mm4=(02 12 03 13)
+    movq        mm2, mm1                ; transpose coefficients(phase 1)
+    punpcklwd   mm1, mm3                ; mm1=(04 14 05 15)
+    punpckhwd   mm2, mm3                ; mm2=(06 16 07 17)
+
+    movq        mm7, mm6                ; transpose coefficients(phase 2)
+    punpckldq   mm6, mm0                ; mm6=(00 10 20 30)=data0
+    punpckhdq   mm7, mm0                ; mm7=(01 11 21 31)=data1
+    movq        mm3, mm2                ; transpose coefficients(phase 2)
+    punpckldq   mm2, mm5                ; mm2=(06 16 26 36)=data6
+    punpckhdq   mm3, mm5                ; mm3=(07 17 27 37)=data7
+
+    movq        mm0, mm7
+    movq        mm5, mm6
+    psubw       mm7, mm2                ; mm7=data1-data6=tmp6
+    psubw       mm6, mm3                ; mm6=data0-data7=tmp7
+    paddw       mm0, mm2                ; mm0=data1+data6=tmp1
+    paddw       mm5, mm3                ; mm5=data0+data7=tmp0
+
+    movq        mm2, MMWORD [wk(0)]     ; mm2=(22 32 23 33)
+    movq        mm3, MMWORD [wk(1)]     ; mm3=(24 34 25 35)
+    movq        MMWORD [wk(0)], mm7     ; wk(0)=tmp6
+    movq        MMWORD [wk(1)], mm6     ; wk(1)=tmp7
+
+    movq        mm7, mm4                ; transpose coefficients(phase 2)
+    punpckldq   mm4, mm2                ; mm4=(02 12 22 32)=data2
+    punpckhdq   mm7, mm2                ; mm7=(03 13 23 33)=data3
+    movq        mm6, mm1                ; transpose coefficients(phase 2)
+    punpckldq   mm1, mm3                ; mm1=(04 14 24 34)=data4
+    punpckhdq   mm6, mm3                ; mm6=(05 15 25 35)=data5
+
+    movq        mm2, mm7
+    movq        mm3, mm4
+    paddw       mm7, mm1                ; mm7=data3+data4=tmp3
+    paddw       mm4, mm6                ; mm4=data2+data5=tmp2
+    psubw       mm2, mm1                ; mm2=data3-data4=tmp4
+    psubw       mm3, mm6                ; mm3=data2-data5=tmp5
+
+    ; -- Even part
+
+    movq        mm1, mm5
+    movq        mm6, mm0
+    psubw       mm5, mm7                ; mm5=tmp13
+    psubw       mm0, mm4                ; mm0=tmp12
+    paddw       mm1, mm7                ; mm1=tmp10
+    paddw       mm6, mm4                ; mm6=tmp11
+
+    paddw       mm0, mm5
+    psllw       mm0, PRE_MULTIPLY_SCALE_BITS
+    pmulhw      mm0, [GOTOFF(ebx,PW_F0707)]  ; mm0=z1
+
+    movq        mm7, mm1
+    movq        mm4, mm5
+    psubw       mm1, mm6                ; mm1=data4
+    psubw       mm5, mm0                ; mm5=data6
+    paddw       mm7, mm6                ; mm7=data0
+    paddw       mm4, mm0                ; mm4=data2
+
+    movq        MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)], mm1
+    movq        MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)], mm5
+    movq        MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm7
+    movq        MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4
+
+    ; -- Odd part
+
+    movq        mm6, MMWORD [wk(0)]     ; mm6=tmp6
+    movq        mm0, MMWORD [wk(1)]     ; mm0=tmp7
+
+    paddw       mm2, mm3                ; mm2=tmp10
+    paddw       mm3, mm6                ; mm3=tmp11
+    paddw       mm6, mm0                ; mm6=tmp12, mm0=tmp7
+
+    psllw       mm2, PRE_MULTIPLY_SCALE_BITS
+    psllw       mm6, PRE_MULTIPLY_SCALE_BITS
+
+    psllw       mm3, PRE_MULTIPLY_SCALE_BITS
+    pmulhw      mm3, [GOTOFF(ebx,PW_F0707)]  ; mm3=z3
+
+    movq        mm1, mm2                     ; mm1=tmp10
+    psubw       mm2, mm6
+    pmulhw      mm2, [GOTOFF(ebx,PW_F0382)]  ; mm2=z5
+    pmulhw      mm1, [GOTOFF(ebx,PW_F0541)]  ; mm1=MULTIPLY(tmp10,FIX_0_54119610)
+    pmulhw      mm6, [GOTOFF(ebx,PW_F1306)]  ; mm6=MULTIPLY(tmp12,FIX_1_30656296)
+    paddw       mm1, mm2                     ; mm1=z2
+    paddw       mm6, mm2                     ; mm6=z4
+
+    movq        mm5, mm0
+    psubw       mm0, mm3                ; mm0=z13
+    paddw       mm5, mm3                ; mm5=z11
+
+    movq        mm7, mm0
+    movq        mm4, mm5
+    psubw       mm0, mm1                ; mm0=data3
+    psubw       mm5, mm6                ; mm5=data7
+    paddw       mm7, mm1                ; mm7=data5
+    paddw       mm4, mm6                ; mm4=data1
+
+    movq        MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm0
+    movq        MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)], mm5
+    movq        MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)], mm7
+    movq        MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm4
+
+    add         edx, byte 4*DCTSIZE*SIZEOF_DCTELEM
+    dec         ecx
+    jnz         near .rowloop
+
+    ; ---- Pass 2: process columns.
+
+    mov         edx, POINTER [data(eax)]  ; (DCTELEM *)
+    mov         ecx, DCTSIZE/4
+    alignx      16, 7
+.columnloop:
+
+    movq        mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
+    movq        mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
+    movq        mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
+    movq        mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)]
+
+    ; mm0=(02 12 22 32), mm2=(42 52 62 72)
+    ; mm1=(03 13 23 33), mm3=(43 53 63 73)
+
+    movq        mm4, mm0                ; transpose coefficients(phase 1)
+    punpcklwd   mm0, mm1                ; mm0=(02 03 12 13)
+    punpckhwd   mm4, mm1                ; mm4=(22 23 32 33)
+    movq        mm5, mm2                ; transpose coefficients(phase 1)
+    punpcklwd   mm2, mm3                ; mm2=(42 43 52 53)
+    punpckhwd   mm5, mm3                ; mm5=(62 63 72 73)
+
+    movq        mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
+    movq        mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
+    movq        mm1, MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
+    movq        mm3, MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)]
+
+    ; mm6=(00 10 20 30), mm1=(40 50 60 70)
+    ; mm7=(01 11 21 31), mm3=(41 51 61 71)
+
+    movq        MMWORD [wk(0)], mm4     ; wk(0)=(22 23 32 33)
+    movq        MMWORD [wk(1)], mm2     ; wk(1)=(42 43 52 53)
+
+    movq        mm4, mm6                ; transpose coefficients(phase 1)
+    punpcklwd   mm6, mm7                ; mm6=(00 01 10 11)
+    punpckhwd   mm4, mm7                ; mm4=(20 21 30 31)
+    movq        mm2, mm1                ; transpose coefficients(phase 1)
+    punpcklwd   mm1, mm3                ; mm1=(40 41 50 51)
+    punpckhwd   mm2, mm3                ; mm2=(60 61 70 71)
+
+    movq        mm7, mm6                ; transpose coefficients(phase 2)
+    punpckldq   mm6, mm0                ; mm6=(00 01 02 03)=data0
+    punpckhdq   mm7, mm0                ; mm7=(10 11 12 13)=data1
+    movq        mm3, mm2                ; transpose coefficients(phase 2)
+    punpckldq   mm2, mm5                ; mm2=(60 61 62 63)=data6
+    punpckhdq   mm3, mm5                ; mm3=(70 71 72 73)=data7
+
+    movq        mm0, mm7
+    movq        mm5, mm6
+    psubw       mm7, mm2                ; mm7=data1-data6=tmp6
+    psubw       mm6, mm3                ; mm6=data0-data7=tmp7
+    paddw       mm0, mm2                ; mm0=data1+data6=tmp1
+    paddw       mm5, mm3                ; mm5=data0+data7=tmp0
+
+    movq        mm2, MMWORD [wk(0)]     ; mm2=(22 23 32 33)
+    movq        mm3, MMWORD [wk(1)]     ; mm3=(42 43 52 53)
+    movq        MMWORD [wk(0)], mm7     ; wk(0)=tmp6
+    movq        MMWORD [wk(1)], mm6     ; wk(1)=tmp7
+
+    movq        mm7, mm4                ; transpose coefficients(phase 2)
+    punpckldq   mm4, mm2                ; mm4=(20 21 22 23)=data2
+    punpckhdq   mm7, mm2                ; mm7=(30 31 32 33)=data3
+    movq        mm6, mm1                ; transpose coefficients(phase 2)
+    punpckldq   mm1, mm3                ; mm1=(40 41 42 43)=data4
+    punpckhdq   mm6, mm3                ; mm6=(50 51 52 53)=data5
+
+    movq        mm2, mm7
+    movq        mm3, mm4
+    paddw       mm7, mm1                ; mm7=data3+data4=tmp3
+    paddw       mm4, mm6                ; mm4=data2+data5=tmp2
+    psubw       mm2, mm1                ; mm2=data3-data4=tmp4
+    psubw       mm3, mm6                ; mm3=data2-data5=tmp5
+
+    ; -- Even part
+
+    movq        mm1, mm5
+    movq        mm6, mm0
+    psubw       mm5, mm7                ; mm5=tmp13
+    psubw       mm0, mm4                ; mm0=tmp12
+    paddw       mm1, mm7                ; mm1=tmp10
+    paddw       mm6, mm4                ; mm6=tmp11
+
+    paddw       mm0, mm5
+    psllw       mm0, PRE_MULTIPLY_SCALE_BITS
+    pmulhw      mm0, [GOTOFF(ebx,PW_F0707)]  ; mm0=z1
+
+    movq        mm7, mm1
+    movq        mm4, mm5
+    psubw       mm1, mm6                ; mm1=data4
+    psubw       mm5, mm0                ; mm5=data6
+    paddw       mm7, mm6                ; mm7=data0
+    paddw       mm4, mm0                ; mm4=data2
+
+    movq        MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)], mm1
+    movq        MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)], mm5
+    movq        MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm7
+    movq        MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4
+
+    ; -- Odd part
+
+    movq        mm6, MMWORD [wk(0)]     ; mm6=tmp6
+    movq        mm0, MMWORD [wk(1)]     ; mm0=tmp7
+
+    paddw       mm2, mm3                ; mm2=tmp10
+    paddw       mm3, mm6                ; mm3=tmp11
+    paddw       mm6, mm0                ; mm6=tmp12, mm0=tmp7
+
+    psllw       mm2, PRE_MULTIPLY_SCALE_BITS
+    psllw       mm6, PRE_MULTIPLY_SCALE_BITS
+
+    psllw       mm3, PRE_MULTIPLY_SCALE_BITS
+    pmulhw      mm3, [GOTOFF(ebx,PW_F0707)]  ; mm3=z3
+
+    movq        mm1, mm2                     ; mm1=tmp10
+    psubw       mm2, mm6
+    pmulhw      mm2, [GOTOFF(ebx,PW_F0382)]  ; mm2=z5
+    pmulhw      mm1, [GOTOFF(ebx,PW_F0541)]  ; mm1=MULTIPLY(tmp10,FIX_0_54119610)
+    pmulhw      mm6, [GOTOFF(ebx,PW_F1306)]  ; mm6=MULTIPLY(tmp12,FIX_1_30656296)
+    paddw       mm1, mm2                     ; mm1=z2
+    paddw       mm6, mm2                     ; mm6=z4
+
+    movq        mm5, mm0
+    psubw       mm0, mm3                ; mm0=z13
+    paddw       mm5, mm3                ; mm5=z11
+
+    movq        mm7, mm0
+    movq        mm4, mm5
+    psubw       mm0, mm1                ; mm0=data3
+    psubw       mm5, mm6                ; mm5=data7
+    paddw       mm7, mm1                ; mm7=data5
+    paddw       mm4, mm6                ; mm4=data1
+
+    movq        MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm0
+    movq        MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)], mm5
+    movq        MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)], mm7
+    movq        MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm4
+
+    add         edx, byte 4*SIZEOF_DCTELEM
+    dec         ecx
+    jnz         near .columnloop
+
+    emms                                ; empty MMX state
+
+;   pop         edi                     ; unused
+;   pop         esi                     ; unused
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    poppic      ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/i386/jfdctfst-sse2.asm b/media/libjpeg/simd/i386/jfdctfst-sse2.asm
new file mode 100644
index 0000000000..446fa7a68f
--- /dev/null
+++ b/media/libjpeg/simd/i386/jfdctfst-sse2.asm
@@ -0,0 +1,403 @@
+;
+; jfdctfst.asm - fast integer FDCT (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a fast, not so accurate integer implementation of
+; the forward DCT (Discrete Cosine Transform). The following code is
+; based directly on the IJG's original jfdctfst.c; see the jfdctfst.c
+; for more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS  8  ; 14 is also OK.
+
+%if CONST_BITS == 8
+F_0_382 equ  98  ; FIX(0.382683433)
+F_0_541 equ 139  ; FIX(0.541196100)
+F_0_707 equ 181  ; FIX(0.707106781)
+F_1_306 equ 334  ; FIX(1.306562965)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x, n)  (((x) + (1 << ((n) - 1))) >> (n))
+F_0_382 equ DESCALE( 410903207, 30 - CONST_BITS)  ; FIX(0.382683433)
+F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS)  ; FIX(0.541196100)
+F_0_707 equ DESCALE( 759250124, 30 - CONST_BITS)  ; FIX(0.707106781)
+F_1_306 equ DESCALE(1402911301, 30 - CONST_BITS)  ; FIX(1.306562965)
+%endif
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
+; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
+
+%define PRE_MULTIPLY_SCALE_BITS  2
+%define CONST_SHIFT              (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
+
+    alignz      32
+    GLOBAL_DATA(jconst_fdct_ifast_sse2)
+
+EXTN(jconst_fdct_ifast_sse2):
+
+PW_F0707 times 8 dw F_0_707 << CONST_SHIFT
+PW_F0382 times 8 dw F_0_382 << CONST_SHIFT
+PW_F0541 times 8 dw F_0_541 << CONST_SHIFT
+PW_F1306 times 8 dw F_1_306 << CONST_SHIFT
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Perform the forward DCT on one block of samples.
+;
+; GLOBAL(void)
+; jsimd_fdct_ifast_sse2(DCTELEM *data)
+;
+
+%define data(b)       (b) + 8           ; DCTELEM *data
+
+%define original_ebp  ebp + 0
+%define wk(i)         ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+                                        ; xmmword wk[WK_NUM]
+%define WK_NUM        2
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_fdct_ifast_sse2)
+
+EXTN(jsimd_fdct_ifast_sse2):
+    push        ebp
+    mov         eax, esp                     ; eax = original ebp
+    sub         esp, byte 4
+    and         esp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [esp], eax
+    mov         ebp, esp                     ; ebp = aligned ebp
+    lea         esp, [wk(0)]
+    pushpic     ebx
+;   push        ecx                     ; unused
+;   push        edx                     ; need not be preserved
+;   push        esi                     ; unused
+;   push        edi                     ; unused
+
+    get_GOT     ebx                     ; get GOT address
+
+    ; ---- Pass 1: process rows.
+
+    mov         edx, POINTER [data(eax)]  ; (DCTELEM *)
+
+    movdqa      xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
+    movdqa      xmm2, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
+    movdqa      xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
+
+    ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
+    ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)
+
+    movdqa      xmm4, xmm0              ; transpose coefficients(phase 1)
+    punpcklwd   xmm0, xmm1              ; xmm0=(00 10 01 11 02 12 03 13)
+    punpckhwd   xmm4, xmm1              ; xmm4=(04 14 05 15 06 16 07 17)
+    movdqa      xmm5, xmm2              ; transpose coefficients(phase 1)
+    punpcklwd   xmm2, xmm3              ; xmm2=(20 30 21 31 22 32 23 33)
+    punpckhwd   xmm5, xmm3              ; xmm5=(24 34 25 35 26 36 27 37)
+
+    movdqa      xmm6, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
+    movdqa      xmm7, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
+    movdqa      xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)]
+
+    ; xmm6=( 4 12 20 28 36 44 52 60), xmm1=( 6 14 22 30 38 46 54 62)
+    ; xmm7=( 5 13 21 29 37 45 53 61), xmm3=( 7 15 23 31 39 47 55 63)
+
+    movdqa      XMMWORD [wk(0)], xmm2   ; wk(0)=(20 30 21 31 22 32 23 33)
+    movdqa      XMMWORD [wk(1)], xmm5   ; wk(1)=(24 34 25 35 26 36 27 37)
+
+    movdqa      xmm2, xmm6              ; transpose coefficients(phase 1)
+    punpcklwd   xmm6, xmm7              ; xmm6=(40 50 41 51 42 52 43 53)
+    punpckhwd   xmm2, xmm7              ; xmm2=(44 54 45 55 46 56 47 57)
+    movdqa      xmm5, xmm1              ; transpose coefficients(phase 1)
+    punpcklwd   xmm1, xmm3              ; xmm1=(60 70 61 71 62 72 63 73)
+    punpckhwd   xmm5, xmm3              ; xmm5=(64 74 65 75 66 76 67 77)
+
+    movdqa      xmm7, xmm6              ; transpose coefficients(phase 2)
+    punpckldq   xmm6, xmm1              ; xmm6=(40 50 60 70 41 51 61 71)
+    punpckhdq   xmm7, xmm1              ; xmm7=(42 52 62 72 43 53 63 73)
+    movdqa      xmm3, xmm2              ; transpose coefficients(phase 2)
+    punpckldq   xmm2, xmm5              ; xmm2=(44 54 64 74 45 55 65 75)
+    punpckhdq   xmm3, xmm5              ; xmm3=(46 56 66 76 47 57 67 77)
+
+    movdqa      xmm1, XMMWORD [wk(0)]   ; xmm1=(20 30 21 31 22 32 23 33)
+    movdqa      xmm5, XMMWORD [wk(1)]   ; xmm5=(24 34 25 35 26 36 27 37)
+    movdqa      XMMWORD [wk(0)], xmm7   ; wk(0)=(42 52 62 72 43 53 63 73)
+    movdqa      XMMWORD [wk(1)], xmm2   ; wk(1)=(44 54 64 74 45 55 65 75)
+
+    movdqa      xmm7, xmm0              ; transpose coefficients(phase 2)
+    punpckldq   xmm0, xmm1              ; xmm0=(00 10 20 30 01 11 21 31)
+    punpckhdq   xmm7, xmm1              ; xmm7=(02 12 22 32 03 13 23 33)
+    movdqa      xmm2, xmm4              ; transpose coefficients(phase 2)
+    punpckldq   xmm4, xmm5              ; xmm4=(04 14 24 34 05 15 25 35)
+    punpckhdq   xmm2, xmm5              ; xmm2=(06 16 26 36 07 17 27 37)
+
+    movdqa      xmm1, xmm0              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm0, xmm6              ; xmm0=(00 10 20 30 40 50 60 70)=data0
+    punpckhqdq  xmm1, xmm6              ; xmm1=(01 11 21 31 41 51 61 71)=data1
+    movdqa      xmm5, xmm2              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm2, xmm3              ; xmm2=(06 16 26 36 46 56 66 76)=data6
+    punpckhqdq  xmm5, xmm3              ; xmm5=(07 17 27 37 47 57 67 77)=data7
+
+    movdqa      xmm6, xmm1
+    movdqa      xmm3, xmm0
+    psubw       xmm1, xmm2              ; xmm1=data1-data6=tmp6
+    psubw       xmm0, xmm5              ; xmm0=data0-data7=tmp7
+    paddw       xmm6, xmm2              ; xmm6=data1+data6=tmp1
+    paddw       xmm3, xmm5              ; xmm3=data0+data7=tmp0
+
+    movdqa      xmm2, XMMWORD [wk(0)]   ; xmm2=(42 52 62 72 43 53 63 73)
+    movdqa      xmm5, XMMWORD [wk(1)]   ; xmm5=(44 54 64 74 45 55 65 75)
+    movdqa      XMMWORD [wk(0)], xmm1   ; wk(0)=tmp6
+    movdqa      XMMWORD [wk(1)], xmm0   ; wk(1)=tmp7
+
+    movdqa      xmm1, xmm7              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm7, xmm2              ; xmm7=(02 12 22 32 42 52 62 72)=data2
+    punpckhqdq  xmm1, xmm2              ; xmm1=(03 13 23 33 43 53 63 73)=data3
+    movdqa      xmm0, xmm4              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm4, xmm5              ; xmm4=(04 14 24 34 44 54 64 74)=data4
+    punpckhqdq  xmm0, xmm5              ; xmm0=(05 15 25 35 45 55 65 75)=data5
+
+    movdqa      xmm2, xmm1
+    movdqa      xmm5, xmm7
+    paddw       xmm1, xmm4              ; xmm1=data3+data4=tmp3
+    paddw       xmm7, xmm0              ; xmm7=data2+data5=tmp2
+    psubw       xmm2, xmm4              ; xmm2=data3-data4=tmp4
+    psubw       xmm5, xmm0              ; xmm5=data2-data5=tmp5
+
+    ; -- Even part
+
+    movdqa      xmm4, xmm3
+    movdqa      xmm0, xmm6
+    psubw       xmm3, xmm1              ; xmm3=tmp13
+    psubw       xmm6, xmm7              ; xmm6=tmp12
+    paddw       xmm4, xmm1              ; xmm4=tmp10
+    paddw       xmm0, xmm7              ; xmm0=tmp11
+
+    paddw       xmm6, xmm3
+    psllw       xmm6, PRE_MULTIPLY_SCALE_BITS
+    pmulhw      xmm6, [GOTOFF(ebx,PW_F0707)]  ; xmm6=z1
+
+    movdqa      xmm1, xmm4
+    movdqa      xmm7, xmm3
+    psubw       xmm4, xmm0              ; xmm4=data4
+    psubw       xmm3, xmm6              ; xmm3=data6
+    paddw       xmm1, xmm0              ; xmm1=data0
+    paddw       xmm7, xmm6              ; xmm7=data2
+
+    movdqa      xmm0, XMMWORD [wk(0)]   ; xmm0=tmp6
+    movdqa      xmm6, XMMWORD [wk(1)]   ; xmm6=tmp7
+    movdqa      XMMWORD [wk(0)], xmm4   ; wk(0)=data4
+    movdqa      XMMWORD [wk(1)], xmm3   ; wk(1)=data6
+
+    ; -- Odd part
+
+    paddw       xmm2, xmm5              ; xmm2=tmp10
+    paddw       xmm5, xmm0              ; xmm5=tmp11
+    paddw       xmm0, xmm6              ; xmm0=tmp12, xmm6=tmp7
+
+    psllw       xmm2, PRE_MULTIPLY_SCALE_BITS
+    psllw       xmm0, PRE_MULTIPLY_SCALE_BITS
+
+    psllw       xmm5, PRE_MULTIPLY_SCALE_BITS
+    pmulhw      xmm5, [GOTOFF(ebx,PW_F0707)]  ; xmm5=z3
+
+    movdqa      xmm4, xmm2                    ; xmm4=tmp10
+    psubw       xmm2, xmm0
+    pmulhw      xmm2, [GOTOFF(ebx,PW_F0382)]  ; xmm2=z5
+    pmulhw      xmm4, [GOTOFF(ebx,PW_F0541)]  ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
+    pmulhw      xmm0, [GOTOFF(ebx,PW_F1306)]  ; xmm0=MULTIPLY(tmp12,FIX_1_306562)
+    paddw       xmm4, xmm2                    ; xmm4=z2
+    paddw       xmm0, xmm2                    ; xmm0=z4
+
+    movdqa      xmm3, xmm6
+    psubw       xmm6, xmm5              ; xmm6=z13
+    paddw       xmm3, xmm5              ; xmm3=z11
+
+    movdqa      xmm2, xmm6
+    movdqa      xmm5, xmm3
+    psubw       xmm6, xmm4              ; xmm6=data3
+    psubw       xmm3, xmm0              ; xmm3=data7
+    paddw       xmm2, xmm4              ; xmm2=data5
+    paddw       xmm5, xmm0              ; xmm5=data1
+
+    ; ---- Pass 2: process columns.
+
+;   mov         edx, POINTER [data(eax)]  ; (DCTELEM *)
+
+    ; xmm1=(00 10 20 30 40 50 60 70), xmm7=(02 12 22 32 42 52 62 72)
+    ; xmm5=(01 11 21 31 41 51 61 71), xmm6=(03 13 23 33 43 53 63 73)
+
+    movdqa      xmm4, xmm1              ; transpose coefficients(phase 1)
+    punpcklwd   xmm1, xmm5              ; xmm1=(00 01 10 11 20 21 30 31)
+    punpckhwd   xmm4, xmm5              ; xmm4=(40 41 50 51 60 61 70 71)
+    movdqa      xmm0, xmm7              ; transpose coefficients(phase 1)
+    punpcklwd   xmm7, xmm6              ; xmm7=(02 03 12 13 22 23 32 33)
+    punpckhwd   xmm0, xmm6              ; xmm0=(42 43 52 53 62 63 72 73)
+
+    movdqa      xmm5, XMMWORD [wk(0)]   ; xmm5=col4
+    movdqa      xmm6, XMMWORD [wk(1)]   ; xmm6=col6
+
+    ; xmm5=(04 14 24 34 44 54 64 74), xmm6=(06 16 26 36 46 56 66 76)
+    ; xmm2=(05 15 25 35 45 55 65 75), xmm3=(07 17 27 37 47 57 67 77)
+
+    movdqa      XMMWORD [wk(0)], xmm7   ; wk(0)=(02 03 12 13 22 23 32 33)
+    movdqa      XMMWORD [wk(1)], xmm0   ; wk(1)=(42 43 52 53 62 63 72 73)
+
+    movdqa      xmm7, xmm5              ; transpose coefficients(phase 1)
+    punpcklwd   xmm5, xmm2              ; xmm5=(04 05 14 15 24 25 34 35)
+    punpckhwd   xmm7, xmm2              ; xmm7=(44 45 54 55 64 65 74 75)
+    movdqa      xmm0, xmm6              ; transpose coefficients(phase 1)
+    punpcklwd   xmm6, xmm3              ; xmm6=(06 07 16 17 26 27 36 37)
+    punpckhwd   xmm0, xmm3              ; xmm0=(46 47 56 57 66 67 76 77)
+
+    movdqa      xmm2, xmm5              ; transpose coefficients(phase 2)
+    punpckldq   xmm5, xmm6              ; xmm5=(04 05 06 07 14 15 16 17)
+    punpckhdq   xmm2, xmm6              ; xmm2=(24 25 26 27 34 35 36 37)
+    movdqa      xmm3, xmm7              ; transpose coefficients(phase 2)
+    punpckldq   xmm7, xmm0              ; xmm7=(44 45 46 47 54 55 56 57)
+    punpckhdq   xmm3, xmm0              ; xmm3=(64 65 66 67 74 75 76 77)
+
+    movdqa      xmm6, XMMWORD [wk(0)]   ; xmm6=(02 03 12 13 22 23 32 33)
+    movdqa      xmm0, XMMWORD [wk(1)]   ; xmm0=(42 43 52 53 62 63 72 73)
+    movdqa      XMMWORD [wk(0)], xmm2   ; wk(0)=(24 25 26 27 34 35 36 37)
+    movdqa      XMMWORD [wk(1)], xmm7   ; wk(1)=(44 45 46 47 54 55 56 57)
+
+    movdqa      xmm2, xmm1              ; transpose coefficients(phase 2)
+    punpckldq   xmm1, xmm6              ; xmm1=(00 01 02 03 10 11 12 13)
+    punpckhdq   xmm2, xmm6              ; xmm2=(20 21 22 23 30 31 32 33)
+    movdqa      xmm7, xmm4              ; transpose coefficients(phase 2)
+    punpckldq   xmm4, xmm0              ; xmm4=(40 41 42 43 50 51 52 53)
+    punpckhdq   xmm7, xmm0              ; xmm7=(60 61 62 63 70 71 72 73)
+
+    movdqa      xmm6, xmm1              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm1, xmm5              ; xmm1=(00 01 02 03 04 05 06 07)=data0
+    punpckhqdq  xmm6, xmm5              ; xmm6=(10 11 12 13 14 15 16 17)=data1
+    movdqa      xmm0, xmm7              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm7, xmm3              ; xmm7=(60 61 62 63 64 65 66 67)=data6
+    punpckhqdq  xmm0, xmm3              ; xmm0=(70 71 72 73 74 75 76 77)=data7
+
+    movdqa      xmm5, xmm6
+    movdqa      xmm3, xmm1
+    psubw       xmm6, xmm7              ; xmm6=data1-data6=tmp6
+    psubw       xmm1, xmm0              ; xmm1=data0-data7=tmp7
+    paddw       xmm5, xmm7              ; xmm5=data1+data6=tmp1
+    paddw       xmm3, xmm0              ; xmm3=data0+data7=tmp0
+
+    movdqa      xmm7, XMMWORD [wk(0)]   ; xmm7=(24 25 26 27 34 35 36 37)
+    movdqa      xmm0, XMMWORD [wk(1)]   ; xmm0=(44 45 46 47 54 55 56 57)
+    movdqa      XMMWORD [wk(0)], xmm6   ; wk(0)=tmp6
+    movdqa      XMMWORD [wk(1)], xmm1   ; wk(1)=tmp7
+
+    movdqa      xmm6, xmm2              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm2, xmm7              ; xmm2=(20 21 22 23 24 25 26 27)=data2
+    punpckhqdq  xmm6, xmm7              ; xmm6=(30 31 32 33 34 35 36 37)=data3
+    movdqa      xmm1, xmm4              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm4, xmm0              ; xmm4=(40 41 42 43 44 45 46 47)=data4
+    punpckhqdq  xmm1, xmm0              ; xmm1=(50 51 52 53 54 55 56 57)=data5
+
+    movdqa      xmm7, xmm6
+    movdqa      xmm0, xmm2
+    paddw       xmm6, xmm4              ; xmm6=data3+data4=tmp3
+    paddw       xmm2, xmm1              ; xmm2=data2+data5=tmp2
+    psubw       xmm7, xmm4              ; xmm7=data3-data4=tmp4
+    psubw       xmm0, xmm1              ; xmm0=data2-data5=tmp5
+
+    ; -- Even part
+
+    movdqa      xmm4, xmm3
+    movdqa      xmm1, xmm5
+    psubw       xmm3, xmm6              ; xmm3=tmp13
+    psubw       xmm5, xmm2              ; xmm5=tmp12
+    paddw       xmm4, xmm6              ; xmm4=tmp10
+    paddw       xmm1, xmm2              ; xmm1=tmp11
+
+    paddw       xmm5, xmm3
+    psllw       xmm5, PRE_MULTIPLY_SCALE_BITS
+    pmulhw      xmm5, [GOTOFF(ebx,PW_F0707)]  ; xmm5=z1
+
+    movdqa      xmm6, xmm4
+    movdqa      xmm2, xmm3
+    psubw       xmm4, xmm1              ; xmm4=data4
+    psubw       xmm3, xmm5              ; xmm3=data6
+    paddw       xmm6, xmm1              ; xmm6=data0
+    paddw       xmm2, xmm5              ; xmm2=data2
+
+    movdqa      XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)], xmm4
+    movdqa      XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)], xmm3
+    movdqa      XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)], xmm6
+    movdqa      XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)], xmm2
+
+    ; -- Odd part
+
+    movdqa      xmm1, XMMWORD [wk(0)]   ; xmm1=tmp6
+    movdqa      xmm5, XMMWORD [wk(1)]   ; xmm5=tmp7
+
+    paddw       xmm7, xmm0              ; xmm7=tmp10
+    paddw       xmm0, xmm1              ; xmm0=tmp11
+    paddw       xmm1, xmm5              ; xmm1=tmp12, xmm5=tmp7
+
+    psllw       xmm7, PRE_MULTIPLY_SCALE_BITS
+    psllw       xmm1, PRE_MULTIPLY_SCALE_BITS
+
+    psllw       xmm0, PRE_MULTIPLY_SCALE_BITS
+    pmulhw      xmm0, [GOTOFF(ebx,PW_F0707)]  ; xmm0=z3
+
+    movdqa      xmm4, xmm7                    ; xmm4=tmp10
+    psubw       xmm7, xmm1
+    pmulhw      xmm7, [GOTOFF(ebx,PW_F0382)]  ; xmm7=z5
+    pmulhw      xmm4, [GOTOFF(ebx,PW_F0541)]  ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
+    pmulhw      xmm1, [GOTOFF(ebx,PW_F1306)]  ; xmm1=MULTIPLY(tmp12,FIX_1_306562)
+    paddw       xmm4, xmm7                    ; xmm4=z2
+    paddw       xmm1, xmm7                    ; xmm1=z4
+
+    movdqa      xmm3, xmm5
+    psubw       xmm5, xmm0              ; xmm5=z13
+    paddw       xmm3, xmm0              ; xmm3=z11
+
+    movdqa      xmm6, xmm5
+    movdqa      xmm2, xmm3
+    psubw       xmm5, xmm4              ; xmm5=data3
+    psubw       xmm3, xmm1              ; xmm3=data7
+    paddw       xmm6, xmm4              ; xmm6=data5
+    paddw       xmm2, xmm1              ; xmm2=data1
+
+    movdqa      XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)], xmm5
+    movdqa      XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)], xmm3
+    movdqa      XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)], xmm6
+    movdqa      XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)], xmm2
+
+;   pop         edi                     ; unused
+;   pop         esi                     ; unused
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; unused
+    poppic      ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/i386/jfdctint-avx2.asm b/media/libjpeg/simd/i386/jfdctint-avx2.asm
new file mode 100644
index 0000000000..23cf733135
--- /dev/null
+++ b/media/libjpeg/simd/i386/jfdctint-avx2.asm
@@ -0,0 +1,331 @@
+;
+; jfdctint.asm - accurate integer FDCT (AVX2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, 2018, 2020, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a slower but more accurate integer implementation of the
+; forward DCT (Discrete Cosine Transform). The following code is based
+; directly on the IJG's original jfdctint.c; see the jfdctint.c for
+; more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS  13
+%define PASS1_BITS  2
+
+%define DESCALE_P1  (CONST_BITS - PASS1_BITS)
+%define DESCALE_P2  (CONST_BITS + PASS1_BITS)
+
+%if CONST_BITS == 13
+F_0_298 equ  2446  ; FIX(0.298631336)
+F_0_390 equ  3196  ; FIX(0.390180644)
+F_0_541 equ  4433  ; FIX(0.541196100)
+F_0_765 equ  6270  ; FIX(0.765366865)
+F_0_899 equ  7373  ; FIX(0.899976223)
+F_1_175 equ  9633  ; FIX(1.175875602)
+F_1_501 equ 12299  ; FIX(1.501321110)
+F_1_847 equ 15137  ; FIX(1.847759065)
+F_1_961 equ 16069  ; FIX(1.961570560)
+F_2_053 equ 16819  ; FIX(2.053119869)
+F_2_562 equ 20995  ; FIX(2.562915447)
+F_3_072 equ 25172  ; FIX(3.072711026)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x, n)  (((x) + (1 << ((n) - 1))) >> (n))
+F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS)  ; FIX(0.298631336)
+F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS)  ; FIX(0.390180644)
+F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS)  ; FIX(0.541196100)
+F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS)  ; FIX(0.765366865)
+F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS)  ; FIX(0.899976223)
+F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS)  ; FIX(1.175875602)
+F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS)  ; FIX(1.501321110)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS)  ; FIX(1.847759065)
+F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS)  ; FIX(1.961570560)
+F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS)  ; FIX(2.053119869)
+F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS)  ; FIX(2.562915447)
+F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS)  ; FIX(3.072711026)
+%endif
+
+; --------------------------------------------------------------------------
+; In-place 8x8x16-bit matrix transpose using AVX2 instructions
+; %1-%4: Input/output registers
+; %5-%8: Temp registers
+
+%macro dotranspose 8
+    ; %1=(00 01 02 03 04 05 06 07  40 41 42 43 44 45 46 47)
+    ; %2=(10 11 12 13 14 15 16 17  50 51 52 53 54 55 56 57)
+    ; %3=(20 21 22 23 24 25 26 27  60 61 62 63 64 65 66 67)
+    ; %4=(30 31 32 33 34 35 36 37  70 71 72 73 74 75 76 77)
+
+    vpunpcklwd  %5, %1, %2
+    vpunpckhwd  %6, %1, %2
+    vpunpcklwd  %7, %3, %4
+    vpunpckhwd  %8, %3, %4
+    ; transpose coefficients(phase 1)
+    ; %5=(00 10 01 11 02 12 03 13  40 50 41 51 42 52 43 53)
+    ; %6=(04 14 05 15 06 16 07 17  44 54 45 55 46 56 47 57)
+    ; %7=(20 30 21 31 22 32 23 33  60 70 61 71 62 72 63 73)
+    ; %8=(24 34 25 35 26 36 27 37  64 74 65 75 66 76 67 77)
+
+    vpunpckldq  %1, %5, %7
+    vpunpckhdq  %2, %5, %7
+    vpunpckldq  %3, %6, %8
+    vpunpckhdq  %4, %6, %8
+    ; transpose coefficients(phase 2)
+    ; %1=(00 10 20 30 01 11 21 31  40 50 60 70 41 51 61 71)
+    ; %2=(02 12 22 32 03 13 23 33  42 52 62 72 43 53 63 73)
+    ; %3=(04 14 24 34 05 15 25 35  44 54 64 74 45 55 65 75)
+    ; %4=(06 16 26 36 07 17 27 37  46 56 66 76 47 57 67 77)
+
+    vpermq      %1, %1, 0x8D
+    vpermq      %2, %2, 0x8D
+    vpermq      %3, %3, 0xD8
+    vpermq      %4, %4, 0xD8
+    ; transpose coefficients(phase 3)
+    ; %1=(01 11 21 31 41 51 61 71  00 10 20 30 40 50 60 70)
+    ; %2=(03 13 23 33 43 53 63 73  02 12 22 32 42 52 62 72)
+    ; %3=(04 14 24 34 44 54 64 74  05 15 25 35 45 55 65 75)
+    ; %4=(06 16 26 36 46 56 66 76  07 17 27 37 47 57 67 77)
+%endmacro
+
+; --------------------------------------------------------------------------
+; In-place 8x8x16-bit accurate integer forward DCT using AVX2 instructions
+; %1-%4: Input/output registers
+; %5-%8: Temp registers
+; %9:    Pass (1 or 2)
+
+%macro dodct 9
+    vpsubw      %5, %1, %4              ; %5=data1_0-data6_7=tmp6_7
+    vpaddw      %6, %1, %4              ; %6=data1_0+data6_7=tmp1_0
+    vpaddw      %7, %2, %3              ; %7=data3_2+data4_5=tmp3_2
+    vpsubw      %8, %2, %3              ; %8=data3_2-data4_5=tmp4_5
+
+    ; -- Even part
+
+    vperm2i128  %6, %6, %6, 0x01        ; %6=tmp0_1
+    vpaddw      %1, %6, %7              ; %1=tmp0_1+tmp3_2=tmp10_11
+    vpsubw      %6, %6, %7              ; %6=tmp0_1-tmp3_2=tmp13_12
+
+    vperm2i128  %7, %1, %1, 0x01        ; %7=tmp11_10
+    vpsignw     %1, %1, [GOTOFF(ebx, PW_1_NEG1)]  ; %1=tmp10_neg11
+    vpaddw      %7, %7, %1              ; %7=(tmp10+tmp11)_(tmp10-tmp11)
+%if %9 == 1
+    vpsllw      %1, %7, PASS1_BITS      ; %1=data0_4
+%else
+    vpaddw      %7, %7, [GOTOFF(ebx, PW_DESCALE_P2X)]
+    vpsraw      %1, %7, PASS1_BITS      ; %1=data0_4
+%endif
+
+    ; (Original)
+    ; z1 = (tmp12 + tmp13) * 0.541196100;
+    ; data2 = z1 + tmp13 * 0.765366865;
+    ; data6 = z1 + tmp12 * -1.847759065;
+    ;
+    ; (This implementation)
+    ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
+    ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
+
+    vperm2i128  %7, %6, %6, 0x01        ; %7=tmp12_13
+    vpunpcklwd  %2, %6, %7
+    vpunpckhwd  %6, %6, %7
+    vpmaddwd    %2, %2, [GOTOFF(ebx, PW_F130_F054_MF130_F054)]  ; %2=data2_6L
+    vpmaddwd    %6, %6, [GOTOFF(ebx, PW_F130_F054_MF130_F054)]  ; %6=data2_6H
+
+    vpaddd      %2, %2, [GOTOFF(ebx, PD_DESCALE_P %+ %9)]
+    vpaddd      %6, %6, [GOTOFF(ebx, PD_DESCALE_P %+ %9)]
+    vpsrad      %2, %2, DESCALE_P %+ %9
+    vpsrad      %6, %6, DESCALE_P %+ %9
+
+    vpackssdw   %3, %2, %6              ; %6=data2_6
+
+    ; -- Odd part
+
+    vpaddw      %7, %8, %5              ; %7=tmp4_5+tmp6_7=z3_4
+
+    ; (Original)
+    ; z5 = (z3 + z4) * 1.175875602;
+    ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+    ; z3 += z5;  z4 += z5;
+    ;
+    ; (This implementation)
+    ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+    ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+    vperm2i128  %2, %7, %7, 0x01        ; %2=z4_3
+    vpunpcklwd  %6, %7, %2
+    vpunpckhwd  %7, %7, %2
+    vpmaddwd    %6, %6, [GOTOFF(ebx, PW_MF078_F117_F078_F117)]  ; %6=z3_4L
+    vpmaddwd    %7, %7, [GOTOFF(ebx, PW_MF078_F117_F078_F117)]  ; %7=z3_4H
+
+    ; (Original)
+    ; z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6;
+    ; tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869;
+    ; tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110;
+    ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+    ; data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4;
+    ; data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4;
+    ;
+    ; (This implementation)
+    ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
+    ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
+    ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
+    ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
+    ; data7 = tmp4 + z3;  data5 = tmp5 + z4;
+    ; data3 = tmp6 + z3;  data1 = tmp7 + z4;
+
+    vperm2i128  %4, %5, %5, 0x01        ; %4=tmp7_6
+    vpunpcklwd  %2, %8, %4
+    vpunpckhwd  %4, %8, %4
+    vpmaddwd    %2, %2, [GOTOFF(ebx, PW_MF060_MF089_MF050_MF256)]  ; %2=tmp4_5L
+    vpmaddwd    %4, %4, [GOTOFF(ebx, PW_MF060_MF089_MF050_MF256)]  ; %4=tmp4_5H
+
+    vpaddd      %2, %2, %6              ; %2=data7_5L
+    vpaddd      %4, %4, %7              ; %4=data7_5H
+
+    vpaddd      %2, %2, [GOTOFF(ebx, PD_DESCALE_P %+ %9)]
+    vpaddd      %4, %4, [GOTOFF(ebx, PD_DESCALE_P %+ %9)]
+    vpsrad      %2, %2, DESCALE_P %+ %9
+    vpsrad      %4, %4, DESCALE_P %+ %9
+
+    vpackssdw   %4, %2, %4              ; %4=data7_5
+
+    vperm2i128  %2, %8, %8, 0x01        ; %2=tmp5_4
+    vpunpcklwd  %8, %5, %2
+    vpunpckhwd  %5, %5, %2
+    vpmaddwd    %8, %8, [GOTOFF(ebx, PW_F050_MF256_F060_MF089)]  ; %8=tmp6_7L
+    vpmaddwd    %5, %5, [GOTOFF(ebx, PW_F050_MF256_F060_MF089)]  ; %5=tmp6_7H
+
+    vpaddd      %8, %8, %6              ; %8=data3_1L
+    vpaddd      %5, %5, %7              ; %5=data3_1H
+
+    vpaddd      %8, %8, [GOTOFF(ebx, PD_DESCALE_P %+ %9)]
+    vpaddd      %5, %5, [GOTOFF(ebx, PD_DESCALE_P %+ %9)]
+    vpsrad      %8, %8, DESCALE_P %+ %9
+    vpsrad      %5, %5, DESCALE_P %+ %9
+
+    vpackssdw   %2, %8, %5              ; %2=data3_1
+%endmacro
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_fdct_islow_avx2)
+
+EXTN(jconst_fdct_islow_avx2):
+
+PW_F130_F054_MF130_F054    times 4  dw  (F_0_541 + F_0_765),  F_0_541
+                           times 4  dw  (F_0_541 - F_1_847),  F_0_541
+PW_MF078_F117_F078_F117    times 4  dw  (F_1_175 - F_1_961),  F_1_175
+                           times 4  dw  (F_1_175 - F_0_390),  F_1_175
+PW_MF060_MF089_MF050_MF256 times 4  dw  (F_0_298 - F_0_899), -F_0_899
+                           times 4  dw  (F_2_053 - F_2_562), -F_2_562
+PW_F050_MF256_F060_MF089   times 4  dw  (F_3_072 - F_2_562), -F_2_562
+                           times 4  dw  (F_1_501 - F_0_899), -F_0_899
+PD_DESCALE_P1              times 8  dd  1 << (DESCALE_P1 - 1)
+PD_DESCALE_P2              times 8  dd  1 << (DESCALE_P2 - 1)
+PW_DESCALE_P2X             times 16 dw  1 << (PASS1_BITS - 1)
+PW_1_NEG1                  times 8  dw  1
+                           times 8  dw -1
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Perform the forward DCT on one block of samples.
+;
+; GLOBAL(void)
+; jsimd_fdct_islow_avx2(DCTELEM *data)
+;
+
+%define data(b)       (b) + 8           ; DCTELEM *data
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_fdct_islow_avx2)
+
+EXTN(jsimd_fdct_islow_avx2):
+    push        ebp
+    mov         ebp, esp
+    pushpic     ebx
+;   push        ecx                     ; unused
+;   push        edx                     ; need not be preserved
+;   push        esi                     ; unused
+;   push        edi                     ; unused
+
+    get_GOT     ebx                     ; get GOT address
+
+    ; ---- Pass 1: process rows.
+
+    mov         edx, POINTER [data(ebp)]  ; (DCTELEM *)
+
+    vmovdqu     ymm4, YMMWORD [YMMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
+    vmovdqu     ymm5, YMMWORD [YMMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
+    vmovdqu     ymm6, YMMWORD [YMMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
+    vmovdqu     ymm7, YMMWORD [YMMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
+    ; ymm4=(00 01 02 03 04 05 06 07  10 11 12 13 14 15 16 17)
+    ; ymm5=(20 21 22 23 24 25 26 27  30 31 32 33 34 35 36 37)
+    ; ymm6=(40 41 42 43 44 45 46 47  50 51 52 53 54 55 56 57)
+    ; ymm7=(60 61 62 63 64 65 66 67  70 71 72 73 74 75 76 77)
+
+    vperm2i128  ymm0, ymm4, ymm6, 0x20
+    vperm2i128  ymm1, ymm4, ymm6, 0x31
+    vperm2i128  ymm2, ymm5, ymm7, 0x20
+    vperm2i128  ymm3, ymm5, ymm7, 0x31
+    ; ymm0=(00 01 02 03 04 05 06 07  40 41 42 43 44 45 46 47)
+    ; ymm1=(10 11 12 13 14 15 16 17  50 51 52 53 54 55 56 57)
+    ; ymm2=(20 21 22 23 24 25 26 27  60 61 62 63 64 65 66 67)
+    ; ymm3=(30 31 32 33 34 35 36 37  70 71 72 73 74 75 76 77)
+
+    dotranspose ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7
+
+    dodct       ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, 1
+    ; ymm0=data0_4, ymm1=data3_1, ymm2=data2_6, ymm3=data7_5
+
+    ; ---- Pass 2: process columns.
+
+    vperm2i128  ymm4, ymm1, ymm3, 0x20  ; ymm4=data3_7
+    vperm2i128  ymm1, ymm1, ymm3, 0x31  ; ymm1=data1_5
+
+    dotranspose ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7
+
+    dodct       ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7, 2
+    ; ymm0=data0_4, ymm1=data3_1, ymm2=data2_6, ymm4=data7_5
+
+    vperm2i128 ymm3, ymm0, ymm1, 0x30   ; ymm3=data0_1
+    vperm2i128 ymm5, ymm2, ymm1, 0x20   ; ymm5=data2_3
+    vperm2i128 ymm6, ymm0, ymm4, 0x31   ; ymm6=data4_5
+    vperm2i128 ymm7, ymm2, ymm4, 0x21   ; ymm7=data6_7
+
+    vmovdqu     YMMWORD [YMMBLOCK(0,0,edx,SIZEOF_DCTELEM)], ymm3
+    vmovdqu     YMMWORD [YMMBLOCK(2,0,edx,SIZEOF_DCTELEM)], ymm5
+    vmovdqu     YMMWORD [YMMBLOCK(4,0,edx,SIZEOF_DCTELEM)], ymm6
+    vmovdqu     YMMWORD [YMMBLOCK(6,0,edx,SIZEOF_DCTELEM)], ymm7
+
+    vzeroupper
+;   pop         edi                     ; unused
+;   pop         esi                     ; unused
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; unused
+    poppic      ebx
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/i386/jfdctint-mmx.asm b/media/libjpeg/simd/i386/jfdctint-mmx.asm
new file mode 100644
index 0000000000..34a43b9e5e
--- /dev/null
+++ b/media/libjpeg/simd/i386/jfdctint-mmx.asm
@@ -0,0 +1,620 @@
+;
+; jfdctint.asm - accurate integer FDCT (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, 2020, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a slower but more accurate integer implementation of the
+; forward DCT (Discrete Cosine Transform). The following code is based
+; directly on the IJG's original jfdctint.c; see the jfdctint.c for
+; more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS  13
+%define PASS1_BITS  2
+
+%define DESCALE_P1  (CONST_BITS - PASS1_BITS)
+%define DESCALE_P2  (CONST_BITS + PASS1_BITS)
+
+%if CONST_BITS == 13
+F_0_298 equ  2446  ; FIX(0.298631336)
+F_0_390 equ  3196  ; FIX(0.390180644)
+F_0_541 equ  4433  ; FIX(0.541196100)
+F_0_765 equ  6270  ; FIX(0.765366865)
+F_0_899 equ  7373  ; FIX(0.899976223)
+F_1_175 equ  9633  ; FIX(1.175875602)
+F_1_501 equ 12299  ; FIX(1.501321110)
+F_1_847 equ 15137  ; FIX(1.847759065)
+F_1_961 equ 16069  ; FIX(1.961570560)
+F_2_053 equ 16819  ; FIX(2.053119869)
+F_2_562 equ 20995  ; FIX(2.562915447)
+F_3_072 equ 25172  ; FIX(3.072711026)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x, n)  (((x) + (1 << ((n) - 1))) >> (n))
+F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS)  ; FIX(0.298631336)
+F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS)  ; FIX(0.390180644)
+F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS)  ; FIX(0.541196100)
+F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS)  ; FIX(0.765366865)
+F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS)  ; FIX(0.899976223)
+F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS)  ; FIX(1.175875602)
+F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS)  ; FIX(1.501321110)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS)  ; FIX(1.847759065)
+F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS)  ; FIX(1.961570560)
+F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS)  ; FIX(2.053119869)
+F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS)  ; FIX(2.562915447)
+F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS)  ; FIX(3.072711026)
+%endif
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_fdct_islow_mmx)
+
+EXTN(jconst_fdct_islow_mmx):
+
+PW_F130_F054   times 2 dw  (F_0_541 + F_0_765),  F_0_541
+PW_F054_MF130  times 2 dw  F_0_541, (F_0_541 - F_1_847)
+PW_MF078_F117  times 2 dw  (F_1_175 - F_1_961),  F_1_175
+PW_F117_F078   times 2 dw  F_1_175, (F_1_175 - F_0_390)
+PW_MF060_MF089 times 2 dw  (F_0_298 - F_0_899), -F_0_899
+PW_MF089_F060  times 2 dw -F_0_899, (F_1_501 - F_0_899)
+PW_MF050_MF256 times 2 dw  (F_2_053 - F_2_562), -F_2_562
+PW_MF256_F050  times 2 dw -F_2_562, (F_3_072 - F_2_562)
+PD_DESCALE_P1  times 2 dd  1 << (DESCALE_P1 - 1)
+PD_DESCALE_P2  times 2 dd  1 << (DESCALE_P2 - 1)
+PW_DESCALE_P2X times 4 dw  1 << (PASS1_BITS - 1)
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Perform the forward DCT on one block of samples.
+;
+; GLOBAL(void)
+; jsimd_fdct_islow_mmx(DCTELEM *data)
+;
+
+%define data(b)       (b) + 8           ; DCTELEM *data
+
+%define original_ebp  ebp + 0
+%define wk(i)         ebp - (WK_NUM - (i)) * SIZEOF_MMWORD  ; mmword wk[WK_NUM]
+%define WK_NUM        2
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_fdct_islow_mmx)
+
+EXTN(jsimd_fdct_islow_mmx):
+    push        ebp
+    mov         eax, esp                    ; eax = original ebp
+    sub         esp, byte 4
+    and         esp, byte (-SIZEOF_MMWORD)  ; align to 64 bits
+    mov         [esp], eax
+    mov         ebp, esp                    ; ebp = aligned ebp
+    lea         esp, [wk(0)]
+    pushpic     ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+;   push        esi                     ; unused
+;   push        edi                     ; unused
+
+    get_GOT     ebx                     ; get GOT address
+
+    ; ---- Pass 1: process rows.
+
+    mov         edx, POINTER [data(eax)]  ; (DCTELEM *)
+    mov         ecx, DCTSIZE/4
+    alignx      16, 7
+.rowloop:
+
+    movq        mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
+    movq        mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
+    movq        mm2, MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)]
+    movq        mm3, MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)]
+
+    ; mm0=(20 21 22 23), mm2=(24 25 26 27)
+    ; mm1=(30 31 32 33), mm3=(34 35 36 37)
+
+    movq        mm4, mm0                ; transpose coefficients(phase 1)
+    punpcklwd   mm0, mm1                ; mm0=(20 30 21 31)
+    punpckhwd   mm4, mm1                ; mm4=(22 32 23 33)
+    movq        mm5, mm2                ; transpose coefficients(phase 1)
+    punpcklwd   mm2, mm3                ; mm2=(24 34 25 35)
+    punpckhwd   mm5, mm3                ; mm5=(26 36 27 37)
+
+    movq        mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
+    movq        mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
+    movq        mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)]
+    movq        mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)]
+
+    ; mm6=(00 01 02 03), mm1=(04 05 06 07)
+    ; mm7=(10 11 12 13), mm3=(14 15 16 17)
+
+    movq        MMWORD [wk(0)], mm4     ; wk(0)=(22 32 23 33)
+    movq        MMWORD [wk(1)], mm2     ; wk(1)=(24 34 25 35)
+
+    movq        mm4, mm6                ; transpose coefficients(phase 1)
+    punpcklwd   mm6, mm7                ; mm6=(00 10 01 11)
+    punpckhwd   mm4, mm7                ; mm4=(02 12 03 13)
+    movq        mm2, mm1                ; transpose coefficients(phase 1)
+    punpcklwd   mm1, mm3                ; mm1=(04 14 05 15)
+    punpckhwd   mm2, mm3                ; mm2=(06 16 07 17)
+
+    movq        mm7, mm6                ; transpose coefficients(phase 2)
+    punpckldq   mm6, mm0                ; mm6=(00 10 20 30)=data0
+    punpckhdq   mm7, mm0                ; mm7=(01 11 21 31)=data1
+    movq        mm3, mm2                ; transpose coefficients(phase 2)
+    punpckldq   mm2, mm5                ; mm2=(06 16 26 36)=data6
+    punpckhdq   mm3, mm5                ; mm3=(07 17 27 37)=data7
+
+    movq        mm0, mm7
+    movq        mm5, mm6
+    psubw       mm7, mm2                ; mm7=data1-data6=tmp6
+    psubw       mm6, mm3                ; mm6=data0-data7=tmp7
+    paddw       mm0, mm2                ; mm0=data1+data6=tmp1
+    paddw       mm5, mm3                ; mm5=data0+data7=tmp0
+
+    movq        mm2, MMWORD [wk(0)]     ; mm2=(22 32 23 33)
+    movq        mm3, MMWORD [wk(1)]     ; mm3=(24 34 25 35)
+    movq        MMWORD [wk(0)], mm7     ; wk(0)=tmp6
+    movq        MMWORD [wk(1)], mm6     ; wk(1)=tmp7
+
+    movq        mm7, mm4                ; transpose coefficients(phase 2)
+    punpckldq   mm4, mm2                ; mm4=(02 12 22 32)=data2
+    punpckhdq   mm7, mm2                ; mm7=(03 13 23 33)=data3
+    movq        mm6, mm1                ; transpose coefficients(phase 2)
+    punpckldq   mm1, mm3                ; mm1=(04 14 24 34)=data4
+    punpckhdq   mm6, mm3                ; mm6=(05 15 25 35)=data5
+
+    movq        mm2, mm7
+    movq        mm3, mm4
+    paddw       mm7, mm1                ; mm7=data3+data4=tmp3
+    paddw       mm4, mm6                ; mm4=data2+data5=tmp2
+    psubw       mm2, mm1                ; mm2=data3-data4=tmp4
+    psubw       mm3, mm6                ; mm3=data2-data5=tmp5
+
+    ; -- Even part
+
+    movq        mm1, mm5
+    movq        mm6, mm0
+    paddw       mm5, mm7                ; mm5=tmp10
+    paddw       mm0, mm4                ; mm0=tmp11
+    psubw       mm1, mm7                ; mm1=tmp13
+    psubw       mm6, mm4                ; mm6=tmp12
+
+    movq        mm7, mm5
+    paddw       mm5, mm0                ; mm5=tmp10+tmp11
+    psubw       mm7, mm0                ; mm7=tmp10-tmp11
+
+    psllw       mm5, PASS1_BITS         ; mm5=data0
+    psllw       mm7, PASS1_BITS         ; mm7=data4
+
+    movq        MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm5
+    movq        MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)], mm7
+
+    ; (Original)
+    ; z1 = (tmp12 + tmp13) * 0.541196100;
+    ; data2 = z1 + tmp13 * 0.765366865;
+    ; data6 = z1 + tmp12 * -1.847759065;
+    ;
+    ; (This implementation)
+    ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
+    ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
+
+    movq        mm4, mm1                ; mm1=tmp13
+    movq        mm0, mm1
+    punpcklwd   mm4, mm6                ; mm6=tmp12
+    punpckhwd   mm0, mm6
+    movq        mm1, mm4
+    movq        mm6, mm0
+    pmaddwd     mm4, [GOTOFF(ebx,PW_F130_F054)]   ; mm4=data2L
+    pmaddwd     mm0, [GOTOFF(ebx,PW_F130_F054)]   ; mm0=data2H
+    pmaddwd     mm1, [GOTOFF(ebx,PW_F054_MF130)]  ; mm1=data6L
+    pmaddwd     mm6, [GOTOFF(ebx,PW_F054_MF130)]  ; mm6=data6H
+
+    paddd       mm4, [GOTOFF(ebx,PD_DESCALE_P1)]
+    paddd       mm0, [GOTOFF(ebx,PD_DESCALE_P1)]
+    psrad       mm4, DESCALE_P1
+    psrad       mm0, DESCALE_P1
+    paddd       mm1, [GOTOFF(ebx,PD_DESCALE_P1)]
+    paddd       mm6, [GOTOFF(ebx,PD_DESCALE_P1)]
+    psrad       mm1, DESCALE_P1
+    psrad       mm6, DESCALE_P1
+
+    packssdw    mm4, mm0                ; mm4=data2
+    packssdw    mm1, mm6                ; mm1=data6
+
+    movq        MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4
+    movq        MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)], mm1
+
+    ; -- Odd part
+
+    movq        mm5, MMWORD [wk(0)]     ; mm5=tmp6
+    movq        mm7, MMWORD [wk(1)]     ; mm7=tmp7
+
+    movq        mm0, mm2                ; mm2=tmp4
+    movq        mm6, mm3                ; mm3=tmp5
+    paddw       mm0, mm5                ; mm0=z3
+    paddw       mm6, mm7                ; mm6=z4
+
+    ; (Original)
+    ; z5 = (z3 + z4) * 1.175875602;
+    ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+    ; z3 += z5;  z4 += z5;
+    ;
+    ; (This implementation)
+    ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+    ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+    movq        mm4, mm0
+    movq        mm1, mm0
+    punpcklwd   mm4, mm6
+    punpckhwd   mm1, mm6
+    movq        mm0, mm4
+    movq        mm6, mm1
+    pmaddwd     mm4, [GOTOFF(ebx,PW_MF078_F117)]  ; mm4=z3L
+    pmaddwd     mm1, [GOTOFF(ebx,PW_MF078_F117)]  ; mm1=z3H
+    pmaddwd     mm0, [GOTOFF(ebx,PW_F117_F078)]   ; mm0=z4L
+    pmaddwd     mm6, [GOTOFF(ebx,PW_F117_F078)]   ; mm6=z4H
+
+    movq        MMWORD [wk(0)], mm4     ; wk(0)=z3L
+    movq        MMWORD [wk(1)], mm1     ; wk(1)=z3H
+
+    ; (Original)
+    ; z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6;
+    ; tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869;
+    ; tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110;
+    ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+    ; data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4;
+    ; data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4;
+    ;
+    ; (This implementation)
+    ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
+    ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
+    ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
+    ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
+    ; data7 = tmp4 + z3;  data5 = tmp5 + z4;
+    ; data3 = tmp6 + z3;  data1 = tmp7 + z4;
+
+    movq        mm4, mm2
+    movq        mm1, mm2
+    punpcklwd   mm4, mm7
+    punpckhwd   mm1, mm7
+    movq        mm2, mm4
+    movq        mm7, mm1
+    pmaddwd     mm4, [GOTOFF(ebx,PW_MF060_MF089)]  ; mm4=tmp4L
+    pmaddwd     mm1, [GOTOFF(ebx,PW_MF060_MF089)]  ; mm1=tmp4H
+    pmaddwd     mm2, [GOTOFF(ebx,PW_MF089_F060)]   ; mm2=tmp7L
+    pmaddwd     mm7, [GOTOFF(ebx,PW_MF089_F060)]   ; mm7=tmp7H
+
+    paddd       mm4, MMWORD [wk(0)]     ; mm4=data7L
+    paddd       mm1, MMWORD [wk(1)]     ; mm1=data7H
+    paddd       mm2, mm0                ; mm2=data1L
+    paddd       mm7, mm6                ; mm7=data1H
+
+    paddd       mm4, [GOTOFF(ebx,PD_DESCALE_P1)]
+    paddd       mm1, [GOTOFF(ebx,PD_DESCALE_P1)]
+    psrad       mm4, DESCALE_P1
+    psrad       mm1, DESCALE_P1
+    paddd       mm2, [GOTOFF(ebx,PD_DESCALE_P1)]
+    paddd       mm7, [GOTOFF(ebx,PD_DESCALE_P1)]
+    psrad       mm2, DESCALE_P1
+    psrad       mm7, DESCALE_P1
+
+    packssdw    mm4, mm1                ; mm4=data7
+    packssdw    mm2, mm7                ; mm2=data1
+
+    movq        MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)], mm4
+    movq        MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm2
+
+    movq        mm1, mm3
+    movq        mm7, mm3
+    punpcklwd   mm1, mm5
+    punpckhwd   mm7, mm5
+    movq        mm3, mm1
+    movq        mm5, mm7
+    pmaddwd     mm1, [GOTOFF(ebx,PW_MF050_MF256)]  ; mm1=tmp5L
+    pmaddwd     mm7, [GOTOFF(ebx,PW_MF050_MF256)]  ; mm7=tmp5H
+    pmaddwd     mm3, [GOTOFF(ebx,PW_MF256_F050)]   ; mm3=tmp6L
+    pmaddwd     mm5, [GOTOFF(ebx,PW_MF256_F050)]   ; mm5=tmp6H
+
+    paddd       mm1, mm0                ; mm1=data5L
+    paddd       mm7, mm6                ; mm7=data5H
+    paddd       mm3, MMWORD [wk(0)]     ; mm3=data3L
+    paddd       mm5, MMWORD [wk(1)]     ; mm5=data3H
+
+    paddd       mm1, [GOTOFF(ebx,PD_DESCALE_P1)]
+    paddd       mm7, [GOTOFF(ebx,PD_DESCALE_P1)]
+    psrad       mm1, DESCALE_P1
+    psrad       mm7, DESCALE_P1
+    paddd       mm3, [GOTOFF(ebx,PD_DESCALE_P1)]
+    paddd       mm5, [GOTOFF(ebx,PD_DESCALE_P1)]
+    psrad       mm3, DESCALE_P1
+    psrad       mm5, DESCALE_P1
+
+    packssdw    mm1, mm7                ; mm1=data5
+    packssdw    mm3, mm5                ; mm3=data3
+
+    movq        MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)], mm1
+    movq        MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm3
+
+    add         edx, byte 4*DCTSIZE*SIZEOF_DCTELEM
+    dec         ecx
+    jnz         near .rowloop
+
+    ; ---- Pass 2: process columns.
+
+    mov         edx, POINTER [data(eax)]  ; (DCTELEM *)
+    mov         ecx, DCTSIZE/4
+    alignx      16, 7
+.columnloop:
+
+    movq        mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
+    movq        mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
+    movq        mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
+    movq        mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)]
+
+    ; mm0=(02 12 22 32), mm2=(42 52 62 72)
+    ; mm1=(03 13 23 33), mm3=(43 53 63 73)
+
+    movq        mm4, mm0                ; transpose coefficients(phase 1)
+    punpcklwd   mm0, mm1                ; mm0=(02 03 12 13)
+    punpckhwd   mm4, mm1                ; mm4=(22 23 32 33)
+    movq        mm5, mm2                ; transpose coefficients(phase 1)
+    punpcklwd   mm2, mm3                ; mm2=(42 43 52 53)
+    punpckhwd   mm5, mm3                ; mm5=(62 63 72 73)
+
+    movq        mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
+    movq        mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
+    movq        mm1, MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
+    movq        mm3, MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)]
+
+    ; mm6=(00 10 20 30), mm1=(40 50 60 70)
+    ; mm7=(01 11 21 31), mm3=(41 51 61 71)
+
+    movq        MMWORD [wk(0)], mm4     ; wk(0)=(22 23 32 33)
+    movq        MMWORD [wk(1)], mm2     ; wk(1)=(42 43 52 53)
+
+    movq        mm4, mm6                ; transpose coefficients(phase 1)
+    punpcklwd   mm6, mm7                ; mm6=(00 01 10 11)
+    punpckhwd   mm4, mm7                ; mm4=(20 21 30 31)
+    movq        mm2, mm1                ; transpose coefficients(phase 1)
+    punpcklwd   mm1, mm3                ; mm1=(40 41 50 51)
+    punpckhwd   mm2, mm3                ; mm2=(60 61 70 71)
+
+    movq        mm7, mm6                ; transpose coefficients(phase 2)
+    punpckldq   mm6, mm0                ; mm6=(00 01 02 03)=data0
+    punpckhdq   mm7, mm0                ; mm7=(10 11 12 13)=data1
+    movq        mm3, mm2                ; transpose coefficients(phase 2)
+    punpckldq   mm2, mm5                ; mm2=(60 61 62 63)=data6
+    punpckhdq   mm3, mm5                ; mm3=(70 71 72 73)=data7
+
+    movq        mm0, mm7
+    movq        mm5, mm6
+    psubw       mm7, mm2                ; mm7=data1-data6=tmp6
+    psubw       mm6, mm3                ; mm6=data0-data7=tmp7
+    paddw       mm0, mm2                ; mm0=data1+data6=tmp1
+    paddw       mm5, mm3                ; mm5=data0+data7=tmp0
+
+    movq        mm2, MMWORD [wk(0)]     ; mm2=(22 23 32 33)
+    movq        mm3, MMWORD [wk(1)]     ; mm3=(42 43 52 53)
+    movq        MMWORD [wk(0)], mm7     ; wk(0)=tmp6
+    movq        MMWORD [wk(1)], mm6     ; wk(1)=tmp7
+
+    movq        mm7, mm4                ; transpose coefficients(phase 2)
+    punpckldq   mm4, mm2                ; mm4=(20 21 22 23)=data2
+    punpckhdq   mm7, mm2                ; mm7=(30 31 32 33)=data3
+    movq        mm6, mm1                ; transpose coefficients(phase 2)
+    punpckldq   mm1, mm3                ; mm1=(40 41 42 43)=data4
+    punpckhdq   mm6, mm3                ; mm6=(50 51 52 53)=data5
+
+    movq        mm2, mm7
+    movq        mm3, mm4
+    paddw       mm7, mm1                ; mm7=data3+data4=tmp3
+    paddw       mm4, mm6                ; mm4=data2+data5=tmp2
+    psubw       mm2, mm1                ; mm2=data3-data4=tmp4
+    psubw       mm3, mm6                ; mm3=data2-data5=tmp5
+
+    ; -- Even part
+
+    movq        mm1, mm5
+    movq        mm6, mm0
+    paddw       mm5, mm7                ; mm5=tmp10
+    paddw       mm0, mm4                ; mm0=tmp11
+    psubw       mm1, mm7                ; mm1=tmp13
+    psubw       mm6, mm4                ; mm6=tmp12
+
+    movq        mm7, mm5
+    paddw       mm5, mm0                ; mm5=tmp10+tmp11
+    psubw       mm7, mm0                ; mm7=tmp10-tmp11
+
+    paddw       mm5, [GOTOFF(ebx,PW_DESCALE_P2X)]
+    paddw       mm7, [GOTOFF(ebx,PW_DESCALE_P2X)]
+    psraw       mm5, PASS1_BITS         ; mm5=data0
+    psraw       mm7, PASS1_BITS         ; mm7=data4
+
+    movq        MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm5
+    movq        MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)], mm7
+
+    ; (Original)
+    ; z1 = (tmp12 + tmp13) * 0.541196100;
+    ; data2 = z1 + tmp13 * 0.765366865;
+    ; data6 = z1 + tmp12 * -1.847759065;
+    ;
+    ; (This implementation)
+    ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
+    ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
+
+    movq        mm4, mm1                ; mm1=tmp13
+    movq        mm0, mm1
+    punpcklwd   mm4, mm6                ; mm6=tmp12
+    punpckhwd   mm0, mm6
+    movq        mm1, mm4
+    movq        mm6, mm0
+    pmaddwd     mm4, [GOTOFF(ebx,PW_F130_F054)]   ; mm4=data2L
+    pmaddwd     mm0, [GOTOFF(ebx,PW_F130_F054)]   ; mm0=data2H
+    pmaddwd     mm1, [GOTOFF(ebx,PW_F054_MF130)]  ; mm1=data6L
+    pmaddwd     mm6, [GOTOFF(ebx,PW_F054_MF130)]  ; mm6=data6H
+
+    paddd       mm4, [GOTOFF(ebx,PD_DESCALE_P2)]
+    paddd       mm0, [GOTOFF(ebx,PD_DESCALE_P2)]
+    psrad       mm4, DESCALE_P2
+    psrad       mm0, DESCALE_P2
+    paddd       mm1, [GOTOFF(ebx,PD_DESCALE_P2)]
+    paddd       mm6, [GOTOFF(ebx,PD_DESCALE_P2)]
+    psrad       mm1, DESCALE_P2
+    psrad       mm6, DESCALE_P2
+
+    packssdw    mm4, mm0                ; mm4=data2
+    packssdw    mm1, mm6                ; mm1=data6
+
+    movq        MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4
+    movq        MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)], mm1
+
+    ; -- Odd part
+
+    movq        mm5, MMWORD [wk(0)]     ; mm5=tmp6
+    movq        mm7, MMWORD [wk(1)]     ; mm7=tmp7
+
+    movq        mm0, mm2                ; mm2=tmp4
+    movq        mm6, mm3                ; mm3=tmp5
+    paddw       mm0, mm5                ; mm0=z3
+    paddw       mm6, mm7                ; mm6=z4
+
+    ; (Original)
+    ; z5 = (z3 + z4) * 1.175875602;
+    ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+    ; z3 += z5;  z4 += z5;
+    ;
+    ; (This implementation)
+    ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+    ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+    movq        mm4, mm0
+    movq        mm1, mm0
+    punpcklwd   mm4, mm6
+    punpckhwd   mm1, mm6
+    movq        mm0, mm4
+    movq        mm6, mm1
+    pmaddwd     mm4, [GOTOFF(ebx,PW_MF078_F117)]  ; mm4=z3L
+    pmaddwd     mm1, [GOTOFF(ebx,PW_MF078_F117)]  ; mm1=z3H
+    pmaddwd     mm0, [GOTOFF(ebx,PW_F117_F078)]   ; mm0=z4L
+    pmaddwd     mm6, [GOTOFF(ebx,PW_F117_F078)]   ; mm6=z4H
+
+    movq        MMWORD [wk(0)], mm4     ; wk(0)=z3L
+    movq        MMWORD [wk(1)], mm1     ; wk(1)=z3H
+
+    ; (Original)
+    ; z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6;
+    ; tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869;
+    ; tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110;
+    ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+    ; data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4;
+    ; data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4;
+    ;
+    ; (This implementation)
+    ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
+    ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
+    ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
+    ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
+    ; data7 = tmp4 + z3;  data5 = tmp5 + z4;
+    ; data3 = tmp6 + z3;  data1 = tmp7 + z4;
+
+    movq        mm4, mm2
+    movq        mm1, mm2
+    punpcklwd   mm4, mm7
+    punpckhwd   mm1, mm7
+    movq        mm2, mm4
+    movq        mm7, mm1
+    pmaddwd     mm4, [GOTOFF(ebx,PW_MF060_MF089)]  ; mm4=tmp4L
+    pmaddwd     mm1, [GOTOFF(ebx,PW_MF060_MF089)]  ; mm1=tmp4H
+    pmaddwd     mm2, [GOTOFF(ebx,PW_MF089_F060)]   ; mm2=tmp7L
+    pmaddwd     mm7, [GOTOFF(ebx,PW_MF089_F060)]   ; mm7=tmp7H
+
+    paddd       mm4, MMWORD [wk(0)]     ; mm4=data7L
+    paddd       mm1, MMWORD [wk(1)]     ; mm1=data7H
+    paddd       mm2, mm0                ; mm2=data1L
+    paddd       mm7, mm6                ; mm7=data1H
+
+    paddd       mm4, [GOTOFF(ebx,PD_DESCALE_P2)]
+    paddd       mm1, [GOTOFF(ebx,PD_DESCALE_P2)]
+    psrad       mm4, DESCALE_P2
+    psrad       mm1, DESCALE_P2
+    paddd       mm2, [GOTOFF(ebx,PD_DESCALE_P2)]
+    paddd       mm7, [GOTOFF(ebx,PD_DESCALE_P2)]
+    psrad       mm2, DESCALE_P2
+    psrad       mm7, DESCALE_P2
+
+    packssdw    mm4, mm1                ; mm4=data7
+    packssdw    mm2, mm7                ; mm2=data1
+
+    movq        MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)], mm4
+    movq        MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm2
+
+    movq        mm1, mm3
+    movq        mm7, mm3
+    punpcklwd   mm1, mm5
+    punpckhwd   mm7, mm5
+    movq        mm3, mm1
+    movq        mm5, mm7
+    pmaddwd     mm1, [GOTOFF(ebx,PW_MF050_MF256)]  ; mm1=tmp5L
+    pmaddwd     mm7, [GOTOFF(ebx,PW_MF050_MF256)]  ; mm7=tmp5H
+    pmaddwd     mm3, [GOTOFF(ebx,PW_MF256_F050)]   ; mm3=tmp6L
+    pmaddwd     mm5, [GOTOFF(ebx,PW_MF256_F050)]   ; mm5=tmp6H
+
+    paddd       mm1, mm0                ; mm1=data5L
+    paddd       mm7, mm6                ; mm7=data5H
+    paddd       mm3, MMWORD [wk(0)]     ; mm3=data3L
+    paddd       mm5, MMWORD [wk(1)]     ; mm5=data3H
+
+    paddd       mm1, [GOTOFF(ebx,PD_DESCALE_P2)]
+    paddd       mm7, [GOTOFF(ebx,PD_DESCALE_P2)]
+    psrad       mm1, DESCALE_P2
+    psrad       mm7, DESCALE_P2
+    paddd       mm3, [GOTOFF(ebx,PD_DESCALE_P2)]
+    paddd       mm5, [GOTOFF(ebx,PD_DESCALE_P2)]
+    psrad       mm3, DESCALE_P2
+    psrad       mm5, DESCALE_P2
+
+    packssdw    mm1, mm7                ; mm1=data5
+    packssdw    mm3, mm5                ; mm3=data3
+
+    movq        MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)], mm1
+    movq        MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm3
+
+    add         edx, byte 4*SIZEOF_DCTELEM
+    dec         ecx
+    jnz         near .columnloop
+
+    emms                                ; empty MMX state
+
+;   pop         edi                     ; unused
+;   pop         esi                     ; unused
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    poppic      ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/i386/jfdctint-sse2.asm b/media/libjpeg/simd/i386/jfdctint-sse2.asm
new file mode 100644
index 0000000000..6f8e18cb9d
--- /dev/null
+++ b/media/libjpeg/simd/i386/jfdctint-sse2.asm
@@ -0,0 +1,633 @@
+;
+; jfdctint.asm - accurate integer FDCT (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, 2020, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a slower but more accurate integer implementation of the
+; forward DCT (Discrete Cosine Transform). The following code is based
+; directly on the IJG's original jfdctint.c; see the jfdctint.c for
+; more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS  13
+%define PASS1_BITS  2
+
+%define DESCALE_P1  (CONST_BITS - PASS1_BITS)
+%define DESCALE_P2  (CONST_BITS + PASS1_BITS)
+
+%if CONST_BITS == 13
+F_0_298 equ  2446  ; FIX(0.298631336)
+F_0_390 equ  3196  ; FIX(0.390180644)
+F_0_541 equ  4433  ; FIX(0.541196100)
+F_0_765 equ  6270  ; FIX(0.765366865)
+F_0_899 equ  7373  ; FIX(0.899976223)
+F_1_175 equ  9633  ; FIX(1.175875602)
+F_1_501 equ 12299  ; FIX(1.501321110)
+F_1_847 equ 15137  ; FIX(1.847759065)
+F_1_961 equ 16069  ; FIX(1.961570560)
+F_2_053 equ 16819  ; FIX(2.053119869)
+F_2_562 equ 20995  ; FIX(2.562915447)
+F_3_072 equ 25172  ; FIX(3.072711026)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x, n)  (((x) + (1 << ((n) - 1))) >> (n))
+F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS)  ; FIX(0.298631336)
+F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS)  ; FIX(0.390180644)
+F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS)  ; FIX(0.541196100)
+F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS)  ; FIX(0.765366865)
+F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS)  ; FIX(0.899976223)
+F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS)  ; FIX(1.175875602)
+F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS)  ; FIX(1.501321110)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS)  ; FIX(1.847759065)
+F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS)  ; FIX(1.961570560)
+F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS)  ; FIX(2.053119869)
+F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS)  ; FIX(2.562915447)
+F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS)  ; FIX(3.072711026)
+%endif
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_fdct_islow_sse2)
+
+EXTN(jconst_fdct_islow_sse2):
+
+PW_F130_F054   times 4 dw  (F_0_541 + F_0_765),  F_0_541
+PW_F054_MF130  times 4 dw  F_0_541, (F_0_541 - F_1_847)
+PW_MF078_F117  times 4 dw  (F_1_175 - F_1_961),  F_1_175
+PW_F117_F078   times 4 dw  F_1_175, (F_1_175 - F_0_390)
+PW_MF060_MF089 times 4 dw  (F_0_298 - F_0_899), -F_0_899
+PW_MF089_F060  times 4 dw -F_0_899, (F_1_501 - F_0_899)
+PW_MF050_MF256 times 4 dw  (F_2_053 - F_2_562), -F_2_562
+PW_MF256_F050  times 4 dw -F_2_562, (F_3_072 - F_2_562)
+PD_DESCALE_P1  times 4 dd  1 << (DESCALE_P1 - 1)
+PD_DESCALE_P2  times 4 dd  1 << (DESCALE_P2 - 1)
+PW_DESCALE_P2X times 8 dw  1 << (PASS1_BITS - 1)
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Perform the forward DCT on one block of samples.
+;
+; GLOBAL(void)
+; jsimd_fdct_islow_sse2(DCTELEM *data)
+;
+
+%define data(b)       (b) + 8           ; DCTELEM *data
+
+%define original_ebp  ebp + 0
+%define wk(i)         ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+                                        ; xmmword wk[WK_NUM]
+%define WK_NUM        6
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_fdct_islow_sse2)
+
+EXTN(jsimd_fdct_islow_sse2):
+    push        ebp
+    mov         eax, esp                     ; eax = original ebp
+    sub         esp, byte 4
+    and         esp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [esp], eax
+    mov         ebp, esp                     ; ebp = aligned ebp
+    lea         esp, [wk(0)]
+    pushpic     ebx
+;   push        ecx                     ; unused
+;   push        edx                     ; need not be preserved
+;   push        esi                     ; unused
+;   push        edi                     ; unused
+
+    get_GOT     ebx                     ; get GOT address
+
+    ; ---- Pass 1: process rows.
+
+    mov         edx, POINTER [data(eax)]  ; (DCTELEM *)
+
+    movdqa      xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
+    movdqa      xmm2, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
+    movdqa      xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
+
+    ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
+    ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)
+
+    movdqa      xmm4, xmm0              ; transpose coefficients(phase 1)
+    punpcklwd   xmm0, xmm1              ; xmm0=(00 10 01 11 02 12 03 13)
+    punpckhwd   xmm4, xmm1              ; xmm4=(04 14 05 15 06 16 07 17)
+    movdqa      xmm5, xmm2              ; transpose coefficients(phase 1)
+    punpcklwd   xmm2, xmm3              ; xmm2=(20 30 21 31 22 32 23 33)
+    punpckhwd   xmm5, xmm3              ; xmm5=(24 34 25 35 26 36 27 37)
+
+    movdqa      xmm6, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
+    movdqa      xmm7, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
+    movdqa      xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)]
+
+    ; xmm6=( 4 12 20 28 36 44 52 60), xmm1=( 6 14 22 30 38 46 54 62)
+    ; xmm7=( 5 13 21 29 37 45 53 61), xmm3=( 7 15 23 31 39 47 55 63)
+
+    movdqa      XMMWORD [wk(0)], xmm2   ; wk(0)=(20 30 21 31 22 32 23 33)
+    movdqa      XMMWORD [wk(1)], xmm5   ; wk(1)=(24 34 25 35 26 36 27 37)
+
+    movdqa      xmm2, xmm6              ; transpose coefficients(phase 1)
+    punpcklwd   xmm6, xmm7              ; xmm6=(40 50 41 51 42 52 43 53)
+    punpckhwd   xmm2, xmm7              ; xmm2=(44 54 45 55 46 56 47 57)
+    movdqa      xmm5, xmm1              ; transpose coefficients(phase 1)
+    punpcklwd   xmm1, xmm3              ; xmm1=(60 70 61 71 62 72 63 73)
+    punpckhwd   xmm5, xmm3              ; xmm5=(64 74 65 75 66 76 67 77)
+
+    movdqa      xmm7, xmm6              ; transpose coefficients(phase 2)
+    punpckldq   xmm6, xmm1              ; xmm6=(40 50 60 70 41 51 61 71)
+    punpckhdq   xmm7, xmm1              ; xmm7=(42 52 62 72 43 53 63 73)
+    movdqa      xmm3, xmm2              ; transpose coefficients(phase 2)
+    punpckldq   xmm2, xmm5              ; xmm2=(44 54 64 74 45 55 65 75)
+    punpckhdq   xmm3, xmm5              ; xmm3=(46 56 66 76 47 57 67 77)
+
+    movdqa      xmm1, XMMWORD [wk(0)]   ; xmm1=(20 30 21 31 22 32 23 33)
+    movdqa      xmm5, XMMWORD [wk(1)]   ; xmm5=(24 34 25 35 26 36 27 37)
+    movdqa      XMMWORD [wk(2)], xmm7   ; wk(2)=(42 52 62 72 43 53 63 73)
+    movdqa      XMMWORD [wk(3)], xmm2   ; wk(3)=(44 54 64 74 45 55 65 75)
+
+    movdqa      xmm7, xmm0              ; transpose coefficients(phase 2)
+    punpckldq   xmm0, xmm1              ; xmm0=(00 10 20 30 01 11 21 31)
+    punpckhdq   xmm7, xmm1              ; xmm7=(02 12 22 32 03 13 23 33)
+    movdqa      xmm2, xmm4              ; transpose coefficients(phase 2)
+    punpckldq   xmm4, xmm5              ; xmm4=(04 14 24 34 05 15 25 35)
+    punpckhdq   xmm2, xmm5              ; xmm2=(06 16 26 36 07 17 27 37)
+
+    movdqa      xmm1, xmm0              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm0, xmm6              ; xmm0=(00 10 20 30 40 50 60 70)=data0
+    punpckhqdq  xmm1, xmm6              ; xmm1=(01 11 21 31 41 51 61 71)=data1
+    movdqa      xmm5, xmm2              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm2, xmm3              ; xmm2=(06 16 26 36 46 56 66 76)=data6
+    punpckhqdq  xmm5, xmm3              ; xmm5=(07 17 27 37 47 57 67 77)=data7
+
+    movdqa      xmm6, xmm1
+    movdqa      xmm3, xmm0
+    psubw       xmm1, xmm2              ; xmm1=data1-data6=tmp6
+    psubw       xmm0, xmm5              ; xmm0=data0-data7=tmp7
+    paddw       xmm6, xmm2              ; xmm6=data1+data6=tmp1
+    paddw       xmm3, xmm5              ; xmm3=data0+data7=tmp0
+
+    movdqa      xmm2, XMMWORD [wk(2)]   ; xmm2=(42 52 62 72 43 53 63 73)
+    movdqa      xmm5, XMMWORD [wk(3)]   ; xmm5=(44 54 64 74 45 55 65 75)
+    movdqa      XMMWORD [wk(0)], xmm1   ; wk(0)=tmp6
+    movdqa      XMMWORD [wk(1)], xmm0   ; wk(1)=tmp7
+
+    movdqa      xmm1, xmm7              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm7, xmm2              ; xmm7=(02 12 22 32 42 52 62 72)=data2
+    punpckhqdq  xmm1, xmm2              ; xmm1=(03 13 23 33 43 53 63 73)=data3
+    movdqa      xmm0, xmm4              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm4, xmm5              ; xmm4=(04 14 24 34 44 54 64 74)=data4
+    punpckhqdq  xmm0, xmm5              ; xmm0=(05 15 25 35 45 55 65 75)=data5
+
+    movdqa      xmm2, xmm1
+    movdqa      xmm5, xmm7
+    paddw       xmm1, xmm4              ; xmm1=data3+data4=tmp3
+    paddw       xmm7, xmm0              ; xmm7=data2+data5=tmp2
+    psubw       xmm2, xmm4              ; xmm2=data3-data4=tmp4
+    psubw       xmm5, xmm0              ; xmm5=data2-data5=tmp5
+
+    ; -- Even part
+
+    movdqa      xmm4, xmm3
+    movdqa      xmm0, xmm6
+    paddw       xmm3, xmm1              ; xmm3=tmp10
+    paddw       xmm6, xmm7              ; xmm6=tmp11
+    psubw       xmm4, xmm1              ; xmm4=tmp13
+    psubw       xmm0, xmm7              ; xmm0=tmp12
+
+    movdqa      xmm1, xmm3
+    paddw       xmm3, xmm6              ; xmm3=tmp10+tmp11
+    psubw       xmm1, xmm6              ; xmm1=tmp10-tmp11
+
+    psllw       xmm3, PASS1_BITS        ; xmm3=data0
+    psllw       xmm1, PASS1_BITS        ; xmm1=data4
+
+    movdqa      XMMWORD [wk(2)], xmm3   ; wk(2)=data0
+    movdqa      XMMWORD [wk(3)], xmm1   ; wk(3)=data4
+
+    ; (Original)
+    ; z1 = (tmp12 + tmp13) * 0.541196100;
+    ; data2 = z1 + tmp13 * 0.765366865;
+    ; data6 = z1 + tmp12 * -1.847759065;
+    ;
+    ; (This implementation)
+    ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
+    ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
+
+    movdqa      xmm7, xmm4              ; xmm4=tmp13
+    movdqa      xmm6, xmm4
+    punpcklwd   xmm7, xmm0              ; xmm0=tmp12
+    punpckhwd   xmm6, xmm0
+    movdqa      xmm4, xmm7
+    movdqa      xmm0, xmm6
+    pmaddwd     xmm7, [GOTOFF(ebx,PW_F130_F054)]   ; xmm7=data2L
+    pmaddwd     xmm6, [GOTOFF(ebx,PW_F130_F054)]   ; xmm6=data2H
+    pmaddwd     xmm4, [GOTOFF(ebx,PW_F054_MF130)]  ; xmm4=data6L
+    pmaddwd     xmm0, [GOTOFF(ebx,PW_F054_MF130)]  ; xmm0=data6H
+
+    paddd       xmm7, [GOTOFF(ebx,PD_DESCALE_P1)]
+    paddd       xmm6, [GOTOFF(ebx,PD_DESCALE_P1)]
+    psrad       xmm7, DESCALE_P1
+    psrad       xmm6, DESCALE_P1
+    paddd       xmm4, [GOTOFF(ebx,PD_DESCALE_P1)]
+    paddd       xmm0, [GOTOFF(ebx,PD_DESCALE_P1)]
+    psrad       xmm4, DESCALE_P1
+    psrad       xmm0, DESCALE_P1
+
+    packssdw    xmm7, xmm6              ; xmm7=data2
+    packssdw    xmm4, xmm0              ; xmm4=data6
+
+    movdqa      XMMWORD [wk(4)], xmm7   ; wk(4)=data2
+    movdqa      XMMWORD [wk(5)], xmm4   ; wk(5)=data6
+
+    ; -- Odd part
+
+    movdqa      xmm3, XMMWORD [wk(0)]   ; xmm3=tmp6
+    movdqa      xmm1, XMMWORD [wk(1)]   ; xmm1=tmp7
+
+    movdqa      xmm6, xmm2              ; xmm2=tmp4
+    movdqa      xmm0, xmm5              ; xmm5=tmp5
+    paddw       xmm6, xmm3              ; xmm6=z3
+    paddw       xmm0, xmm1              ; xmm0=z4
+
+    ; (Original)
+    ; z5 = (z3 + z4) * 1.175875602;
+    ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+    ; z3 += z5;  z4 += z5;
+    ;
+    ; (This implementation)
+    ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+    ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+    movdqa      xmm7, xmm6
+    movdqa      xmm4, xmm6
+    punpcklwd   xmm7, xmm0
+    punpckhwd   xmm4, xmm0
+    movdqa      xmm6, xmm7
+    movdqa      xmm0, xmm4
+    pmaddwd     xmm7, [GOTOFF(ebx,PW_MF078_F117)]  ; xmm7=z3L
+    pmaddwd     xmm4, [GOTOFF(ebx,PW_MF078_F117)]  ; xmm4=z3H
+    pmaddwd     xmm6, [GOTOFF(ebx,PW_F117_F078)]   ; xmm6=z4L
+    pmaddwd     xmm0, [GOTOFF(ebx,PW_F117_F078)]   ; xmm0=z4H
+
+    movdqa      XMMWORD [wk(0)], xmm7   ; wk(0)=z3L
+    movdqa      XMMWORD [wk(1)], xmm4   ; wk(1)=z3H
+
+    ; (Original)
+    ; z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6;
+    ; tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869;
+    ; tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110;
+    ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+    ; data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4;
+    ; data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4;
+    ;
+    ; (This implementation)
+    ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
+    ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
+    ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
+    ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
+    ; data7 = tmp4 + z3;  data5 = tmp5 + z4;
+    ; data3 = tmp6 + z3;  data1 = tmp7 + z4;
+
+    movdqa      xmm7, xmm2
+    movdqa      xmm4, xmm2
+    punpcklwd   xmm7, xmm1
+    punpckhwd   xmm4, xmm1
+    movdqa      xmm2, xmm7
+    movdqa      xmm1, xmm4
+    pmaddwd     xmm7, [GOTOFF(ebx,PW_MF060_MF089)]  ; xmm7=tmp4L
+    pmaddwd     xmm4, [GOTOFF(ebx,PW_MF060_MF089)]  ; xmm4=tmp4H
+    pmaddwd     xmm2, [GOTOFF(ebx,PW_MF089_F060)]   ; xmm2=tmp7L
+    pmaddwd     xmm1, [GOTOFF(ebx,PW_MF089_F060)]   ; xmm1=tmp7H
+
+    paddd       xmm7, XMMWORD [wk(0)]   ; xmm7=data7L
+    paddd       xmm4, XMMWORD [wk(1)]   ; xmm4=data7H
+    paddd       xmm2, xmm6              ; xmm2=data1L
+    paddd       xmm1, xmm0              ; xmm1=data1H
+
+    paddd       xmm7, [GOTOFF(ebx,PD_DESCALE_P1)]
+    paddd       xmm4, [GOTOFF(ebx,PD_DESCALE_P1)]
+    psrad       xmm7, DESCALE_P1
+    psrad       xmm4, DESCALE_P1
+    paddd       xmm2, [GOTOFF(ebx,PD_DESCALE_P1)]
+    paddd       xmm1, [GOTOFF(ebx,PD_DESCALE_P1)]
+    psrad       xmm2, DESCALE_P1
+    psrad       xmm1, DESCALE_P1
+
+    packssdw    xmm7, xmm4              ; xmm7=data7
+    packssdw    xmm2, xmm1              ; xmm2=data1
+
+    movdqa      xmm4, xmm5
+    movdqa      xmm1, xmm5
+    punpcklwd   xmm4, xmm3
+    punpckhwd   xmm1, xmm3
+    movdqa      xmm5, xmm4
+    movdqa      xmm3, xmm1
+    pmaddwd     xmm4, [GOTOFF(ebx,PW_MF050_MF256)]  ; xmm4=tmp5L
+    pmaddwd     xmm1, [GOTOFF(ebx,PW_MF050_MF256)]  ; xmm1=tmp5H
+    pmaddwd     xmm5, [GOTOFF(ebx,PW_MF256_F050)]   ; xmm5=tmp6L
+    pmaddwd     xmm3, [GOTOFF(ebx,PW_MF256_F050)]   ; xmm3=tmp6H
+
+    paddd       xmm4, xmm6              ; xmm4=data5L
+    paddd       xmm1, xmm0              ; xmm1=data5H
+    paddd       xmm5, XMMWORD [wk(0)]   ; xmm5=data3L
+    paddd       xmm3, XMMWORD [wk(1)]   ; xmm3=data3H
+
+    paddd       xmm4, [GOTOFF(ebx,PD_DESCALE_P1)]
+    paddd       xmm1, [GOTOFF(ebx,PD_DESCALE_P1)]
+    psrad       xmm4, DESCALE_P1
+    psrad       xmm1, DESCALE_P1
+    paddd       xmm5, [GOTOFF(ebx,PD_DESCALE_P1)]
+    paddd       xmm3, [GOTOFF(ebx,PD_DESCALE_P1)]
+    psrad       xmm5, DESCALE_P1
+    psrad       xmm3, DESCALE_P1
+
+    packssdw    xmm4, xmm1              ; xmm4=data5
+    packssdw    xmm5, xmm3              ; xmm5=data3
+
+    ; ---- Pass 2: process columns.
+
+;   mov         edx, POINTER [data(eax)]  ; (DCTELEM *)
+
+    movdqa      xmm6, XMMWORD [wk(2)]   ; xmm6=col0
+    movdqa      xmm0, XMMWORD [wk(4)]   ; xmm0=col2
+
+    ; xmm6=(00 10 20 30 40 50 60 70), xmm0=(02 12 22 32 42 52 62 72)
+    ; xmm2=(01 11 21 31 41 51 61 71), xmm5=(03 13 23 33 43 53 63 73)
+
+    movdqa      xmm1, xmm6              ; transpose coefficients(phase 1)
+    punpcklwd   xmm6, xmm2              ; xmm6=(00 01 10 11 20 21 30 31)
+    punpckhwd   xmm1, xmm2              ; xmm1=(40 41 50 51 60 61 70 71)
+    movdqa      xmm3, xmm0              ; transpose coefficients(phase 1)
+    punpcklwd   xmm0, xmm5              ; xmm0=(02 03 12 13 22 23 32 33)
+    punpckhwd   xmm3, xmm5              ; xmm3=(42 43 52 53 62 63 72 73)
+
+    movdqa      xmm2, XMMWORD [wk(3)]   ; xmm2=col4
+    movdqa      xmm5, XMMWORD [wk(5)]   ; xmm5=col6
+
+    ; xmm2=(04 14 24 34 44 54 64 74), xmm5=(06 16 26 36 46 56 66 76)
+    ; xmm4=(05 15 25 35 45 55 65 75), xmm7=(07 17 27 37 47 57 67 77)
+
+    movdqa      XMMWORD [wk(0)], xmm0   ; wk(0)=(02 03 12 13 22 23 32 33)
+    movdqa      XMMWORD [wk(1)], xmm3   ; wk(1)=(42 43 52 53 62 63 72 73)
+
+    movdqa      xmm0, xmm2              ; transpose coefficients(phase 1)
+    punpcklwd   xmm2, xmm4              ; xmm2=(04 05 14 15 24 25 34 35)
+    punpckhwd   xmm0, xmm4              ; xmm0=(44 45 54 55 64 65 74 75)
+    movdqa      xmm3, xmm5              ; transpose coefficients(phase 1)
+    punpcklwd   xmm5, xmm7              ; xmm5=(06 07 16 17 26 27 36 37)
+    punpckhwd   xmm3, xmm7              ; xmm3=(46 47 56 57 66 67 76 77)
+
+    movdqa      xmm4, xmm2              ; transpose coefficients(phase 2)
+    punpckldq   xmm2, xmm5              ; xmm2=(04 05 06 07 14 15 16 17)
+    punpckhdq   xmm4, xmm5              ; xmm4=(24 25 26 27 34 35 36 37)
+    movdqa      xmm7, xmm0              ; transpose coefficients(phase 2)
+    punpckldq   xmm0, xmm3              ; xmm0=(44 45 46 47 54 55 56 57)
+    punpckhdq   xmm7, xmm3              ; xmm7=(64 65 66 67 74 75 76 77)
+
+    movdqa      xmm5, XMMWORD [wk(0)]   ; xmm5=(02 03 12 13 22 23 32 33)
+    movdqa      xmm3, XMMWORD [wk(1)]   ; xmm3=(42 43 52 53 62 63 72 73)
+    movdqa      XMMWORD [wk(2)], xmm4   ; wk(2)=(24 25 26 27 34 35 36 37)
+    movdqa      XMMWORD [wk(3)], xmm0   ; wk(3)=(44 45 46 47 54 55 56 57)
+
+    movdqa      xmm4, xmm6              ; transpose coefficients(phase 2)
+    punpckldq   xmm6, xmm5              ; xmm6=(00 01 02 03 10 11 12 13)
+    punpckhdq   xmm4, xmm5              ; xmm4=(20 21 22 23 30 31 32 33)
+    movdqa      xmm0, xmm1              ; transpose coefficients(phase 2)
+    punpckldq   xmm1, xmm3              ; xmm1=(40 41 42 43 50 51 52 53)
+    punpckhdq   xmm0, xmm3              ; xmm0=(60 61 62 63 70 71 72 73)
+
+    movdqa      xmm5, xmm6              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm6, xmm2              ; xmm6=(00 01 02 03 04 05 06 07)=data0
+    punpckhqdq  xmm5, xmm2              ; xmm5=(10 11 12 13 14 15 16 17)=data1
+    movdqa      xmm3, xmm0              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm0, xmm7              ; xmm0=(60 61 62 63 64 65 66 67)=data6
+    punpckhqdq  xmm3, xmm7              ; xmm3=(70 71 72 73 74 75 76 77)=data7
+
+    movdqa      xmm2, xmm5
+    movdqa      xmm7, xmm6
+    psubw       xmm5, xmm0              ; xmm5=data1-data6=tmp6
+    psubw       xmm6, xmm3              ; xmm6=data0-data7=tmp7
+    paddw       xmm2, xmm0              ; xmm2=data1+data6=tmp1
+    paddw       xmm7, xmm3              ; xmm7=data0+data7=tmp0
+
+    movdqa      xmm0, XMMWORD [wk(2)]   ; xmm0=(24 25 26 27 34 35 36 37)
+    movdqa      xmm3, XMMWORD [wk(3)]   ; xmm3=(44 45 46 47 54 55 56 57)
+    movdqa      XMMWORD [wk(0)], xmm5   ; wk(0)=tmp6
+    movdqa      XMMWORD [wk(1)], xmm6   ; wk(1)=tmp7
+
+    movdqa      xmm5, xmm4              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm4, xmm0              ; xmm4=(20 21 22 23 24 25 26 27)=data2
+    punpckhqdq  xmm5, xmm0              ; xmm5=(30 31 32 33 34 35 36 37)=data3
+    movdqa      xmm6, xmm1              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm1, xmm3              ; xmm1=(40 41 42 43 44 45 46 47)=data4
+    punpckhqdq  xmm6, xmm3              ; xmm6=(50 51 52 53 54 55 56 57)=data5
+
+    movdqa      xmm0, xmm5
+    movdqa      xmm3, xmm4
+    paddw       xmm5, xmm1              ; xmm5=data3+data4=tmp3
+    paddw       xmm4, xmm6              ; xmm4=data2+data5=tmp2
+    psubw       xmm0, xmm1              ; xmm0=data3-data4=tmp4
+    psubw       xmm3, xmm6              ; xmm3=data2-data5=tmp5
+
+    ; -- Even part
+
+    movdqa      xmm1, xmm7
+    movdqa      xmm6, xmm2
+    paddw       xmm7, xmm5              ; xmm7=tmp10
+    paddw       xmm2, xmm4              ; xmm2=tmp11
+    psubw       xmm1, xmm5              ; xmm1=tmp13
+    psubw       xmm6, xmm4              ; xmm6=tmp12
+
+    movdqa      xmm5, xmm7
+    paddw       xmm7, xmm2              ; xmm7=tmp10+tmp11
+    psubw       xmm5, xmm2              ; xmm5=tmp10-tmp11
+
+    paddw       xmm7, [GOTOFF(ebx,PW_DESCALE_P2X)]
+    paddw       xmm5, [GOTOFF(ebx,PW_DESCALE_P2X)]
+    psraw       xmm7, PASS1_BITS        ; xmm7=data0
+    psraw       xmm5, PASS1_BITS        ; xmm5=data4
+
+    movdqa      XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)], xmm7
+    movdqa      XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)], xmm5
+
+    ; (Original)
+    ; z1 = (tmp12 + tmp13) * 0.541196100;
+    ; data2 = z1 + tmp13 * 0.765366865;
+    ; data6 = z1 + tmp12 * -1.847759065;
+    ;
+    ; (This implementation)
+    ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
+    ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
+
+    movdqa      xmm4, xmm1              ; xmm1=tmp13
+    movdqa      xmm2, xmm1
+    punpcklwd   xmm4, xmm6              ; xmm6=tmp12
+    punpckhwd   xmm2, xmm6
+    movdqa      xmm1, xmm4
+    movdqa      xmm6, xmm2
+    pmaddwd     xmm4, [GOTOFF(ebx,PW_F130_F054)]   ; xmm4=data2L
+    pmaddwd     xmm2, [GOTOFF(ebx,PW_F130_F054)]   ; xmm2=data2H
+    pmaddwd     xmm1, [GOTOFF(ebx,PW_F054_MF130)]  ; xmm1=data6L
+    pmaddwd     xmm6, [GOTOFF(ebx,PW_F054_MF130)]  ; xmm6=data6H
+
+    paddd       xmm4, [GOTOFF(ebx,PD_DESCALE_P2)]
+    paddd       xmm2, [GOTOFF(ebx,PD_DESCALE_P2)]
+    psrad       xmm4, DESCALE_P2
+    psrad       xmm2, DESCALE_P2
+    paddd       xmm1, [GOTOFF(ebx,PD_DESCALE_P2)]
+    paddd       xmm6, [GOTOFF(ebx,PD_DESCALE_P2)]
+    psrad       xmm1, DESCALE_P2
+    psrad       xmm6, DESCALE_P2
+
+    packssdw    xmm4, xmm2              ; xmm4=data2
+    packssdw    xmm1, xmm6              ; xmm1=data6
+
+    movdqa      XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)], xmm4
+    movdqa      XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)], xmm1
+
+    ; -- Odd part
+
+    movdqa      xmm7, XMMWORD [wk(0)]   ; xmm7=tmp6
+    movdqa      xmm5, XMMWORD [wk(1)]   ; xmm5=tmp7
+
+    movdqa      xmm2, xmm0              ; xmm0=tmp4
+    movdqa      xmm6, xmm3              ; xmm3=tmp5
+    paddw       xmm2, xmm7              ; xmm2=z3
+    paddw       xmm6, xmm5              ; xmm6=z4
+
+    ; (Original)
+    ; z5 = (z3 + z4) * 1.175875602;
+    ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+    ; z3 += z5;  z4 += z5;
+    ;
+    ; (This implementation)
+    ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+    ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+    movdqa      xmm4, xmm2
+    movdqa      xmm1, xmm2
+    punpcklwd   xmm4, xmm6
+    punpckhwd   xmm1, xmm6
+    movdqa      xmm2, xmm4
+    movdqa      xmm6, xmm1
+    pmaddwd     xmm4, [GOTOFF(ebx,PW_MF078_F117)]  ; xmm4=z3L
+    pmaddwd     xmm1, [GOTOFF(ebx,PW_MF078_F117)]  ; xmm1=z3H
+    pmaddwd     xmm2, [GOTOFF(ebx,PW_F117_F078)]   ; xmm2=z4L
+    pmaddwd     xmm6, [GOTOFF(ebx,PW_F117_F078)]   ; xmm6=z4H
+
+    movdqa      XMMWORD [wk(0)], xmm4   ; wk(0)=z3L
+    movdqa      XMMWORD [wk(1)], xmm1   ; wk(1)=z3H
+
+    ; (Original)
+    ; z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6;
+    ; tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869;
+    ; tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110;
+    ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+    ; data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4;
+    ; data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4;
+    ;
+    ; (This implementation)
+    ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
+    ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
+    ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
+    ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
+    ; data7 = tmp4 + z3;  data5 = tmp5 + z4;
+    ; data3 = tmp6 + z3;  data1 = tmp7 + z4;
+
+    movdqa      xmm4, xmm0
+    movdqa      xmm1, xmm0
+    punpcklwd   xmm4, xmm5
+    punpckhwd   xmm1, xmm5
+    movdqa      xmm0, xmm4
+    movdqa      xmm5, xmm1
+    pmaddwd     xmm4, [GOTOFF(ebx,PW_MF060_MF089)]  ; xmm4=tmp4L
+    pmaddwd     xmm1, [GOTOFF(ebx,PW_MF060_MF089)]  ; xmm1=tmp4H
+    pmaddwd     xmm0, [GOTOFF(ebx,PW_MF089_F060)]   ; xmm0=tmp7L
+    pmaddwd     xmm5, [GOTOFF(ebx,PW_MF089_F060)]   ; xmm5=tmp7H
+
+    paddd       xmm4,  XMMWORD [wk(0)]  ; xmm4=data7L
+    paddd       xmm1,  XMMWORD [wk(1)]  ; xmm1=data7H
+    paddd       xmm0, xmm2              ; xmm0=data1L
+    paddd       xmm5, xmm6              ; xmm5=data1H
+
+    paddd       xmm4, [GOTOFF(ebx,PD_DESCALE_P2)]
+    paddd       xmm1, [GOTOFF(ebx,PD_DESCALE_P2)]
+    psrad       xmm4, DESCALE_P2
+    psrad       xmm1, DESCALE_P2
+    paddd       xmm0, [GOTOFF(ebx,PD_DESCALE_P2)]
+    paddd       xmm5, [GOTOFF(ebx,PD_DESCALE_P2)]
+    psrad       xmm0, DESCALE_P2
+    psrad       xmm5, DESCALE_P2
+
+    packssdw    xmm4, xmm1              ; xmm4=data7
+    packssdw    xmm0, xmm5              ; xmm0=data1
+
+    movdqa      XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)], xmm4
+    movdqa      XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)], xmm0
+
+    movdqa      xmm1, xmm3
+    movdqa      xmm5, xmm3
+    punpcklwd   xmm1, xmm7
+    punpckhwd   xmm5, xmm7
+    movdqa      xmm3, xmm1
+    movdqa      xmm7, xmm5
+    pmaddwd     xmm1, [GOTOFF(ebx,PW_MF050_MF256)]  ; xmm1=tmp5L
+    pmaddwd     xmm5, [GOTOFF(ebx,PW_MF050_MF256)]  ; xmm5=tmp5H
+    pmaddwd     xmm3, [GOTOFF(ebx,PW_MF256_F050)]   ; xmm3=tmp6L
+    pmaddwd     xmm7, [GOTOFF(ebx,PW_MF256_F050)]   ; xmm7=tmp6H
+
+    paddd       xmm1, xmm2              ; xmm1=data5L
+    paddd       xmm5, xmm6              ; xmm5=data5H
+    paddd       xmm3, XMMWORD [wk(0)]   ; xmm3=data3L
+    paddd       xmm7, XMMWORD [wk(1)]   ; xmm7=data3H
+
+    paddd       xmm1, [GOTOFF(ebx,PD_DESCALE_P2)]
+    paddd       xmm5, [GOTOFF(ebx,PD_DESCALE_P2)]
+    psrad       xmm1, DESCALE_P2
+    psrad       xmm5, DESCALE_P2
+    paddd       xmm3, [GOTOFF(ebx,PD_DESCALE_P2)]
+    paddd       xmm7, [GOTOFF(ebx,PD_DESCALE_P2)]
+    psrad       xmm3, DESCALE_P2
+    psrad       xmm7, DESCALE_P2
+
+    packssdw    xmm1, xmm5              ; xmm1=data5
+    packssdw    xmm3, xmm7              ; xmm3=data3
+
+    movdqa      XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)], xmm1
+    movdqa      XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)], xmm3
+
+;   pop         edi                     ; unused
+;   pop         esi                     ; unused
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; unused
+    poppic      ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/i386/jidctflt-3dn.asm b/media/libjpeg/simd/i386/jidctflt-3dn.asm
new file mode 100644
index 0000000000..87951910d8
--- /dev/null
+++ b/media/libjpeg/simd/i386/jidctflt-3dn.asm
@@ -0,0 +1,451 @@
+;
+; jidctflt.asm - floating-point IDCT (3DNow! & MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a floating-point implementation of the inverse DCT
+; (Discrete Cosine Transform). The following code is based directly on
+; the IJG's original jidctflt.c; see the jidctflt.c for more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_idct_float_3dnow)
+
+EXTN(jconst_idct_float_3dnow):
+
+PD_1_414        times 2 dd 1.414213562373095048801689
+PD_1_847        times 2 dd 1.847759065022573512256366
+PD_1_082        times 2 dd 1.082392200292393968799446
+PD_2_613        times 2 dd 2.613125929752753055713286
+PD_RNDINT_MAGIC times 2 dd 100663296.0  ; (float)(0x00C00000 << 3)
+PB_CENTERJSAMP  times 8 db CENTERJSAMPLE
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_float_3dnow(void *dct_table, JCOEFPTR coef_block,
+;                        JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define dct_table(b)   (b) + 8          ; void *dct_table
+%define coef_block(b)  (b) + 12         ; JCOEFPTR coef_block
+%define output_buf(b)  (b) + 16         ; JSAMPARRAY output_buf
+%define output_col(b)  (b) + 20         ; JDIMENSION output_col
+
+%define original_ebp   ebp + 0
+%define wk(i)          ebp - (WK_NUM - (i)) * SIZEOF_MMWORD
+                                        ; mmword wk[WK_NUM]
+%define WK_NUM         2
+%define workspace      wk(0) - DCTSIZE2 * SIZEOF_FAST_FLOAT
+                                        ; FAST_FLOAT workspace[DCTSIZE2]
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_idct_float_3dnow)
+
+EXTN(jsimd_idct_float_3dnow):
+    push        ebp
+    mov         eax, esp                    ; eax = original ebp
+    sub         esp, byte 4
+    and         esp, byte (-SIZEOF_MMWORD)  ; align to 64 bits
+    mov         [esp], eax
+    mov         ebp, esp                    ; ebp = aligned ebp
+    lea         esp, [workspace]
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+
+    ; ---- Pass 1: process columns from input, store into work array.
+
+;   mov         eax, [original_ebp]
+    mov         edx, POINTER [dct_table(eax)]    ; quantptr
+    mov         esi, JCOEFPTR [coef_block(eax)]  ; inptr
+    lea         edi, [workspace]                 ; FAST_FLOAT *wsptr
+    mov         ecx, DCTSIZE/2                   ; ctr
+    alignx      16, 7
+.columnloop:
+%ifndef NO_ZERO_COLUMN_TEST_FLOAT_3DNOW
+    mov         eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    or          eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    jnz         short .columnDCT
+
+    pushpic     ebx                     ; save GOT address
+    mov         ebx, dword [DWBLOCK(3,0,esi,SIZEOF_JCOEF)]
+    mov         eax, dword [DWBLOCK(4,0,esi,SIZEOF_JCOEF)]
+    or          ebx, dword [DWBLOCK(5,0,esi,SIZEOF_JCOEF)]
+    or          eax, dword [DWBLOCK(6,0,esi,SIZEOF_JCOEF)]
+    or          ebx, dword [DWBLOCK(7,0,esi,SIZEOF_JCOEF)]
+    or          eax, ebx
+    poppic      ebx                     ; restore GOT address
+    jnz         short .columnDCT
+
+    ; -- AC terms all zero
+
+    movd        mm0, dword [DWBLOCK(0,0,esi,SIZEOF_JCOEF)]
+
+    punpcklwd   mm0, mm0
+    psrad       mm0, (DWORD_BIT-WORD_BIT)
+    pi2fd       mm0, mm0
+
+    pfmul       mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+    movq        mm1, mm0
+    punpckldq   mm0, mm0
+    punpckhdq   mm1, mm1
+
+    movq        MMWORD [MMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], mm0
+    movq        MMWORD [MMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], mm0
+    movq        MMWORD [MMBLOCK(0,2,edi,SIZEOF_FAST_FLOAT)], mm0
+    movq        MMWORD [MMBLOCK(0,3,edi,SIZEOF_FAST_FLOAT)], mm0
+    movq        MMWORD [MMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], mm1
+    movq        MMWORD [MMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], mm1
+    movq        MMWORD [MMBLOCK(1,2,edi,SIZEOF_FAST_FLOAT)], mm1
+    movq        MMWORD [MMBLOCK(1,3,edi,SIZEOF_FAST_FLOAT)], mm1
+    jmp         near .nextcolumn
+    alignx      16, 7
+%endif
+.columnDCT:
+
+    ; -- Even part
+
+    movd        mm0, dword [DWBLOCK(0,0,esi,SIZEOF_JCOEF)]
+    movd        mm1, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    movd        mm2, dword [DWBLOCK(4,0,esi,SIZEOF_JCOEF)]
+    movd        mm3, dword [DWBLOCK(6,0,esi,SIZEOF_JCOEF)]
+
+    punpcklwd   mm0, mm0
+    punpcklwd   mm1, mm1
+    psrad       mm0, (DWORD_BIT-WORD_BIT)
+    psrad       mm1, (DWORD_BIT-WORD_BIT)
+    pi2fd       mm0, mm0
+    pi2fd       mm1, mm1
+
+    pfmul       mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+    pfmul       mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+    punpcklwd   mm2, mm2
+    punpcklwd   mm3, mm3
+    psrad       mm2, (DWORD_BIT-WORD_BIT)
+    psrad       mm3, (DWORD_BIT-WORD_BIT)
+    pi2fd       mm2, mm2
+    pi2fd       mm3, mm3
+
+    pfmul       mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+    pfmul       mm3, MMWORD [MMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+    movq        mm4, mm0
+    movq        mm5, mm1
+    pfsub       mm0, mm2                ; mm0=tmp11
+    pfsub       mm1, mm3
+    pfadd       mm4, mm2                ; mm4=tmp10
+    pfadd       mm5, mm3                ; mm5=tmp13
+
+    pfmul       mm1, [GOTOFF(ebx,PD_1_414)]
+    pfsub       mm1, mm5                ; mm1=tmp12
+
+    movq        mm6, mm4
+    movq        mm7, mm0
+    pfsub       mm4, mm5                ; mm4=tmp3
+    pfsub       mm0, mm1                ; mm0=tmp2
+    pfadd       mm6, mm5                ; mm6=tmp0
+    pfadd       mm7, mm1                ; mm7=tmp1
+
+    movq        MMWORD [wk(1)], mm4     ; tmp3
+    movq        MMWORD [wk(0)], mm0     ; tmp2
+
+    ; -- Odd part
+
+    movd        mm2, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    movd        mm3, dword [DWBLOCK(3,0,esi,SIZEOF_JCOEF)]
+    movd        mm5, dword [DWBLOCK(5,0,esi,SIZEOF_JCOEF)]
+    movd        mm1, dword [DWBLOCK(7,0,esi,SIZEOF_JCOEF)]
+
+    punpcklwd   mm2, mm2
+    punpcklwd   mm3, mm3
+    psrad       mm2, (DWORD_BIT-WORD_BIT)
+    psrad       mm3, (DWORD_BIT-WORD_BIT)
+    pi2fd       mm2, mm2
+    pi2fd       mm3, mm3
+
+    pfmul       mm2, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+    pfmul       mm3, MMWORD [MMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+    punpcklwd   mm5, mm5
+    punpcklwd   mm1, mm1
+    psrad       mm5, (DWORD_BIT-WORD_BIT)
+    psrad       mm1, (DWORD_BIT-WORD_BIT)
+    pi2fd       mm5, mm5
+    pi2fd       mm1, mm1
+
+    pfmul       mm5, MMWORD [MMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+    pfmul       mm1, MMWORD [MMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+    movq        mm4, mm2
+    movq        mm0, mm5
+    pfadd       mm2, mm1                ; mm2=z11
+    pfadd       mm5, mm3                ; mm5=z13
+    pfsub       mm4, mm1                ; mm4=z12
+    pfsub       mm0, mm3                ; mm0=z10
+
+    movq        mm1, mm2
+    pfsub       mm2, mm5
+    pfadd       mm1, mm5                ; mm1=tmp7
+
+    pfmul       mm2, [GOTOFF(ebx,PD_1_414)]  ; mm2=tmp11
+
+    movq        mm3, mm0
+    pfadd       mm0, mm4
+    pfmul       mm0, [GOTOFF(ebx,PD_1_847)]  ; mm0=z5
+    pfmul       mm3, [GOTOFF(ebx,PD_2_613)]  ; mm3=(z10 * 2.613125930)
+    pfmul       mm4, [GOTOFF(ebx,PD_1_082)]  ; mm4=(z12 * 1.082392200)
+    pfsubr      mm3, mm0                     ; mm3=tmp12
+    pfsub       mm4, mm0                     ; mm4=tmp10
+
+    ; -- Final output stage
+
+    pfsub       mm3, mm1                ; mm3=tmp6
+    movq        mm5, mm6
+    movq        mm0, mm7
+    pfadd       mm6, mm1                ; mm6=data0=(00 01)
+    pfadd       mm7, mm3                ; mm7=data1=(10 11)
+    pfsub       mm5, mm1                ; mm5=data7=(70 71)
+    pfsub       mm0, mm3                ; mm0=data6=(60 61)
+    pfsub       mm2, mm3                ; mm2=tmp5
+
+    movq        mm1, mm6                ; transpose coefficients
+    punpckldq   mm6, mm7                ; mm6=(00 10)
+    punpckhdq   mm1, mm7                ; mm1=(01 11)
+    movq        mm3, mm0                ; transpose coefficients
+    punpckldq   mm0, mm5                ; mm0=(60 70)
+    punpckhdq   mm3, mm5                ; mm3=(61 71)
+
+    movq        MMWORD [MMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], mm6
+    movq        MMWORD [MMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], mm1
+    movq        MMWORD [MMBLOCK(0,3,edi,SIZEOF_FAST_FLOAT)], mm0
+    movq        MMWORD [MMBLOCK(1,3,edi,SIZEOF_FAST_FLOAT)], mm3
+
+    movq        mm7, MMWORD [wk(0)]     ; mm7=tmp2
+    movq        mm5, MMWORD [wk(1)]     ; mm5=tmp3
+
+    pfadd       mm4, mm2                ; mm4=tmp4
+    movq        mm6, mm7
+    movq        mm1, mm5
+    pfadd       mm7, mm2                ; mm7=data2=(20 21)
+    pfadd       mm5, mm4                ; mm5=data4=(40 41)
+    pfsub       mm6, mm2                ; mm6=data5=(50 51)
+    pfsub       mm1, mm4                ; mm1=data3=(30 31)
+
+    movq        mm0, mm7                ; transpose coefficients
+    punpckldq   mm7, mm1                ; mm7=(20 30)
+    punpckhdq   mm0, mm1                ; mm0=(21 31)
+    movq        mm3, mm5                ; transpose coefficients
+    punpckldq   mm5, mm6                ; mm5=(40 50)
+    punpckhdq   mm3, mm6                ; mm3=(41 51)
+
+    movq        MMWORD [MMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], mm7
+    movq        MMWORD [MMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], mm0
+    movq        MMWORD [MMBLOCK(0,2,edi,SIZEOF_FAST_FLOAT)], mm5
+    movq        MMWORD [MMBLOCK(1,2,edi,SIZEOF_FAST_FLOAT)], mm3
+
+.nextcolumn:
+    add         esi, byte 2*SIZEOF_JCOEF               ; coef_block
+    add         edx, byte 2*SIZEOF_FLOAT_MULT_TYPE     ; quantptr
+    add         edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT  ; wsptr
+    dec         ecx                                    ; ctr
+    jnz         near .columnloop
+
+    ; -- Prefetch the next coefficient block
+
+    prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
+    prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
+    prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
+    prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
+
+    ; ---- Pass 2: process rows from work array, store into output array.
+
+    mov         eax, [original_ebp]
+    lea         esi, [workspace]                   ; FAST_FLOAT *wsptr
+    mov         edi, JSAMPARRAY [output_buf(eax)]  ; (JSAMPROW *)
+    mov         eax, JDIMENSION [output_col(eax)]
+    mov         ecx, DCTSIZE/2                     ; ctr
+    alignx      16, 7
+.rowloop:
+
+    ; -- Even part
+
+    movq        mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
+    movq        mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)]
+    movq        mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)]
+    movq        mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)]
+
+    movq        mm4, mm0
+    movq        mm5, mm1
+    pfsub       mm0, mm2                ; mm0=tmp11
+    pfsub       mm1, mm3
+    pfadd       mm4, mm2                ; mm4=tmp10
+    pfadd       mm5, mm3                ; mm5=tmp13
+
+    pfmul       mm1, [GOTOFF(ebx,PD_1_414)]
+    pfsub       mm1, mm5                ; mm1=tmp12
+
+    movq        mm6, mm4
+    movq        mm7, mm0
+    pfsub       mm4, mm5                ; mm4=tmp3
+    pfsub       mm0, mm1                ; mm0=tmp2
+    pfadd       mm6, mm5                ; mm6=tmp0
+    pfadd       mm7, mm1                ; mm7=tmp1
+
+    movq        MMWORD [wk(1)], mm4     ; tmp3
+    movq        MMWORD [wk(0)], mm0     ; tmp2
+
+    ; -- Odd part
+
+    movq        mm2, MMWORD [MMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
+    movq        mm3, MMWORD [MMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)]
+    movq        mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)]
+    movq        mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)]
+
+    movq        mm4, mm2
+    movq        mm0, mm5
+    pfadd       mm2, mm1                ; mm2=z11
+    pfadd       mm5, mm3                ; mm5=z13
+    pfsub       mm4, mm1                ; mm4=z12
+    pfsub       mm0, mm3                ; mm0=z10
+
+    movq        mm1, mm2
+    pfsub       mm2, mm5
+    pfadd       mm1, mm5                ; mm1=tmp7
+
+    pfmul       mm2, [GOTOFF(ebx,PD_1_414)]  ; mm2=tmp11
+
+    movq        mm3, mm0
+    pfadd       mm0, mm4
+    pfmul       mm0, [GOTOFF(ebx,PD_1_847)]  ; mm0=z5
+    pfmul       mm3, [GOTOFF(ebx,PD_2_613)]  ; mm3=(z10 * 2.613125930)
+    pfmul       mm4, [GOTOFF(ebx,PD_1_082)]  ; mm4=(z12 * 1.082392200)
+    pfsubr      mm3, mm0                     ; mm3=tmp12
+    pfsub       mm4, mm0                     ; mm4=tmp10
+
+    ; -- Final output stage
+
+    pfsub       mm3, mm1                ; mm3=tmp6
+    movq        mm5, mm6
+    movq        mm0, mm7
+    pfadd       mm6, mm1                ; mm6=data0=(00 10)
+    pfadd       mm7, mm3                ; mm7=data1=(01 11)
+    pfsub       mm5, mm1                ; mm5=data7=(07 17)
+    pfsub       mm0, mm3                ; mm0=data6=(06 16)
+    pfsub       mm2, mm3                ; mm2=tmp5
+
+    movq        mm1, [GOTOFF(ebx,PD_RNDINT_MAGIC)]  ; mm1=[PD_RNDINT_MAGIC]
+    pcmpeqd     mm3, mm3
+    psrld       mm3, WORD_BIT           ; mm3={0xFFFF 0x0000 0xFFFF 0x0000}
+
+    pfadd       mm6, mm1                ; mm6=roundint(data0/8)=(00 ** 10 **)
+    pfadd       mm7, mm1                ; mm7=roundint(data1/8)=(01 ** 11 **)
+    pfadd       mm0, mm1                ; mm0=roundint(data6/8)=(06 ** 16 **)
+    pfadd       mm5, mm1                ; mm5=roundint(data7/8)=(07 ** 17 **)
+
+    pand        mm6, mm3                ; mm6=(00 -- 10 --)
+    pslld       mm7, WORD_BIT           ; mm7=(-- 01 -- 11)
+    pand        mm0, mm3                ; mm0=(06 -- 16 --)
+    pslld       mm5, WORD_BIT           ; mm5=(-- 07 -- 17)
+    por         mm6, mm7                ; mm6=(00 01 10 11)
+    por         mm0, mm5                ; mm0=(06 07 16 17)
+
+    movq        mm1, MMWORD [wk(0)]     ; mm1=tmp2
+    movq        mm3, MMWORD [wk(1)]     ; mm3=tmp3
+
+    pfadd       mm4, mm2                ; mm4=tmp4
+    movq        mm7, mm1
+    movq        mm5, mm3
+    pfadd       mm1, mm2                ; mm1=data2=(02 12)
+    pfadd       mm3, mm4                ; mm3=data4=(04 14)
+    pfsub       mm7, mm2                ; mm7=data5=(05 15)
+    pfsub       mm5, mm4                ; mm5=data3=(03 13)
+
+    movq        mm2, [GOTOFF(ebx,PD_RNDINT_MAGIC)]  ; mm2=[PD_RNDINT_MAGIC]
+    pcmpeqd     mm4, mm4
+    psrld       mm4, WORD_BIT           ; mm4={0xFFFF 0x0000 0xFFFF 0x0000}
+
+    pfadd       mm3, mm2                ; mm3=roundint(data4/8)=(04 ** 14 **)
+    pfadd       mm7, mm2                ; mm7=roundint(data5/8)=(05 ** 15 **)
+    pfadd       mm1, mm2                ; mm1=roundint(data2/8)=(02 ** 12 **)
+    pfadd       mm5, mm2                ; mm5=roundint(data3/8)=(03 ** 13 **)
+
+    pand        mm3, mm4                ; mm3=(04 -- 14 --)
+    pslld       mm7, WORD_BIT           ; mm7=(-- 05 -- 15)
+    pand        mm1, mm4                ; mm1=(02 -- 12 --)
+    pslld       mm5, WORD_BIT           ; mm5=(-- 03 -- 13)
+    por         mm3, mm7                ; mm3=(04 05 14 15)
+    por         mm1, mm5                ; mm1=(02 03 12 13)
+
+    movq        mm2, [GOTOFF(ebx,PB_CENTERJSAMP)]  ; mm2=[PB_CENTERJSAMP]
+
+    packsswb    mm6, mm3                ; mm6=(00 01 10 11 04 05 14 15)
+    packsswb    mm1, mm0                ; mm1=(02 03 12 13 06 07 16 17)
+    paddb       mm6, mm2
+    paddb       mm1, mm2
+
+    movq        mm4, mm6                ; transpose coefficients(phase 2)
+    punpcklwd   mm6, mm1                ; mm6=(00 01 02 03 10 11 12 13)
+    punpckhwd   mm4, mm1                ; mm4=(04 05 06 07 14 15 16 17)
+
+    movq        mm7, mm6                ; transpose coefficients(phase 3)
+    punpckldq   mm6, mm4                ; mm6=(00 01 02 03 04 05 06 07)
+    punpckhdq   mm7, mm4                ; mm7=(10 11 12 13 14 15 16 17)
+
+    pushpic     ebx                     ; save GOT address
+
+    mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+    mov         ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+    movq        MMWORD [edx+eax*SIZEOF_JSAMPLE], mm6
+    movq        MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm7
+
+    poppic      ebx                     ; restore GOT address
+
+    add         esi, byte 2*SIZEOF_FAST_FLOAT  ; wsptr
+    add         edi, byte 2*SIZEOF_JSAMPROW
+    dec         ecx                            ; ctr
+    jnz         near .rowloop
+
+    femms                               ; empty MMX/3DNow! state
+
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/i386/jidctflt-sse.asm b/media/libjpeg/simd/i386/jidctflt-sse.asm
new file mode 100644
index 0000000000..b27ecfdf46
--- /dev/null
+++ b/media/libjpeg/simd/i386/jidctflt-sse.asm
@@ -0,0 +1,571 @@
+;
+; jidctflt.asm - floating-point IDCT (SSE & MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a floating-point implementation of the inverse DCT
+; (Discrete Cosine Transform). The following code is based directly on
+; the IJG's original jidctflt.c; see the jidctflt.c for more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%macro unpcklps2 2  ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
+    shufps      %1, %2, 0x44
+%endmacro
+
+%macro unpckhps2 2  ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
+    shufps      %1, %2, 0xEE
+%endmacro
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_idct_float_sse)
+
+EXTN(jconst_idct_float_sse):
+
+PD_1_414       times 4 dd  1.414213562373095048801689
+PD_1_847       times 4 dd  1.847759065022573512256366
+PD_1_082       times 4 dd  1.082392200292393968799446
+PD_M2_613      times 4 dd -2.613125929752753055713286
+PD_0_125       times 4 dd  0.125        ; 1/8
+PB_CENTERJSAMP times 8 db  CENTERJSAMPLE
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_float_sse(void *dct_table, JCOEFPTR coef_block,
+;                      JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define dct_table(b)   (b) + 8          ; void *dct_table
+%define coef_block(b)  (b) + 12         ; JCOEFPTR coef_block
+%define output_buf(b)  (b) + 16         ; JSAMPARRAY output_buf
+%define output_col(b)  (b) + 20         ; JDIMENSION output_col
+
+%define original_ebp   ebp + 0
+%define wk(i)          ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+                                        ; xmmword wk[WK_NUM]
+%define WK_NUM         2
+%define workspace      wk(0) - DCTSIZE2 * SIZEOF_FAST_FLOAT
+                                        ; FAST_FLOAT workspace[DCTSIZE2]
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_idct_float_sse)
+
+EXTN(jsimd_idct_float_sse):
+    push        ebp
+    mov         eax, esp                     ; eax = original ebp
+    sub         esp, byte 4
+    and         esp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [esp], eax
+    mov         ebp, esp                     ; ebp = aligned ebp
+    lea         esp, [workspace]
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+
+    ; ---- Pass 1: process columns from input, store into work array.
+
+;   mov         eax, [original_ebp]
+    mov         edx, POINTER [dct_table(eax)]    ; quantptr
+    mov         esi, JCOEFPTR [coef_block(eax)]  ; inptr
+    lea         edi, [workspace]                 ; FAST_FLOAT *wsptr
+    mov         ecx, DCTSIZE/4                   ; ctr
+    alignx      16, 7
+.columnloop:
+%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
+    mov         eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    or          eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    jnz         near .columnDCT
+
+    movq        mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    movq        mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    por         mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+    por         mm1, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+    por         mm0, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+    por         mm1, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+    por         mm0, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+    por         mm1, mm0
+    packsswb    mm1, mm1
+    movd        eax, mm1
+    test        eax, eax
+    jnz         short .columnDCT
+
+    ; -- AC terms all zero
+
+    movq        mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+
+    punpckhwd   mm1, mm0                   ; mm1=(** 02 ** 03)
+    punpcklwd   mm0, mm0                   ; mm0=(00 00 01 01)
+    psrad       mm1, (DWORD_BIT-WORD_BIT)  ; mm1=in0H=(02 03)
+    psrad       mm0, (DWORD_BIT-WORD_BIT)  ; mm0=in0L=(00 01)
+    cvtpi2ps    xmm3, mm1                  ; xmm3=(02 03 ** **)
+    cvtpi2ps    xmm0, mm0                  ; xmm0=(00 01 ** **)
+    movlhps     xmm0, xmm3                 ; xmm0=in0=(00 01 02 03)
+
+    mulps       xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+    movaps      xmm1, xmm0
+    movaps      xmm2, xmm0
+    movaps      xmm3, xmm0
+
+    shufps      xmm0, xmm0, 0x00        ; xmm0=(00 00 00 00)
+    shufps      xmm1, xmm1, 0x55        ; xmm1=(01 01 01 01)
+    shufps      xmm2, xmm2, 0xAA        ; xmm2=(02 02 02 02)
+    shufps      xmm3, xmm3, 0xFF        ; xmm3=(03 03 03 03)
+
+    movaps      XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0
+    movaps      XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0
+    movaps      XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm1
+    movaps      XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1
+    movaps      XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm2
+    movaps      XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm2
+    movaps      XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm3
+    movaps      XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
+    jmp         near .nextcolumn
+    alignx      16, 7
+%endif
+.columnDCT:
+
+    ; -- Even part
+
+    movq        mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+    movq        mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    movq        mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+    movq        mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+
+    punpckhwd   mm4, mm0                ; mm4=(** 02 ** 03)
+    punpcklwd   mm0, mm0                ; mm0=(00 00 01 01)
+    punpckhwd   mm5, mm1                ; mm5=(** 22 ** 23)
+    punpcklwd   mm1, mm1                ; mm1=(20 20 21 21)
+
+    psrad       mm4, (DWORD_BIT-WORD_BIT)  ; mm4=in0H=(02 03)
+    psrad       mm0, (DWORD_BIT-WORD_BIT)  ; mm0=in0L=(00 01)
+    cvtpi2ps    xmm4, mm4                  ; xmm4=(02 03 ** **)
+    cvtpi2ps    xmm0, mm0                  ; xmm0=(00 01 ** **)
+    psrad       mm5, (DWORD_BIT-WORD_BIT)  ; mm5=in2H=(22 23)
+    psrad       mm1, (DWORD_BIT-WORD_BIT)  ; mm1=in2L=(20 21)
+    cvtpi2ps    xmm5, mm5                  ; xmm5=(22 23 ** **)
+    cvtpi2ps    xmm1, mm1                  ; xmm1=(20 21 ** **)
+
+    punpckhwd   mm6, mm2                ; mm6=(** 42 ** 43)
+    punpcklwd   mm2, mm2                ; mm2=(40 40 41 41)
+    punpckhwd   mm7, mm3                ; mm7=(** 62 ** 63)
+    punpcklwd   mm3, mm3                ; mm3=(60 60 61 61)
+
+    psrad       mm6, (DWORD_BIT-WORD_BIT)  ; mm6=in4H=(42 43)
+    psrad       mm2, (DWORD_BIT-WORD_BIT)  ; mm2=in4L=(40 41)
+    cvtpi2ps    xmm6, mm6                  ; xmm6=(42 43 ** **)
+    cvtpi2ps    xmm2, mm2                  ; xmm2=(40 41 ** **)
+    psrad       mm7, (DWORD_BIT-WORD_BIT)  ; mm7=in6H=(62 63)
+    psrad       mm3, (DWORD_BIT-WORD_BIT)  ; mm3=in6L=(60 61)
+    cvtpi2ps    xmm7, mm7                  ; xmm7=(62 63 ** **)
+    cvtpi2ps    xmm3, mm3                  ; xmm3=(60 61 ** **)
+
+    movlhps     xmm0, xmm4              ; xmm0=in0=(00 01 02 03)
+    movlhps     xmm1, xmm5              ; xmm1=in2=(20 21 22 23)
+    mulps       xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+    mulps       xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+    movlhps     xmm2, xmm6              ; xmm2=in4=(40 41 42 43)
+    movlhps     xmm3, xmm7              ; xmm3=in6=(60 61 62 63)
+    mulps       xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+    mulps       xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+    movaps      xmm4, xmm0
+    movaps      xmm5, xmm1
+    subps       xmm0, xmm2              ; xmm0=tmp11
+    subps       xmm1, xmm3
+    addps       xmm4, xmm2              ; xmm4=tmp10
+    addps       xmm5, xmm3              ; xmm5=tmp13
+
+    mulps       xmm1, [GOTOFF(ebx,PD_1_414)]
+    subps       xmm1, xmm5              ; xmm1=tmp12
+
+    movaps      xmm6, xmm4
+    movaps      xmm7, xmm0
+    subps       xmm4, xmm5              ; xmm4=tmp3
+    subps       xmm0, xmm1              ; xmm0=tmp2
+    addps       xmm6, xmm5              ; xmm6=tmp0
+    addps       xmm7, xmm1              ; xmm7=tmp1
+
+    movaps      XMMWORD [wk(1)], xmm4   ; tmp3
+    movaps      XMMWORD [wk(0)], xmm0   ; tmp2
+
+    ; -- Odd part
+
+    movq        mm4, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    movq        mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+    movq        mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+    movq        mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+
+    punpckhwd   mm6, mm4                ; mm6=(** 12 ** 13)
+    punpcklwd   mm4, mm4                ; mm4=(10 10 11 11)
+    punpckhwd   mm2, mm0                ; mm2=(** 32 ** 33)
+    punpcklwd   mm0, mm0                ; mm0=(30 30 31 31)
+
+    psrad       mm6, (DWORD_BIT-WORD_BIT)  ; mm6=in1H=(12 13)
+    psrad       mm4, (DWORD_BIT-WORD_BIT)  ; mm4=in1L=(10 11)
+    cvtpi2ps    xmm4, mm6                  ; xmm4=(12 13 ** **)
+    cvtpi2ps    xmm2, mm4                  ; xmm2=(10 11 ** **)
+    psrad       mm2, (DWORD_BIT-WORD_BIT)  ; mm2=in3H=(32 33)
+    psrad       mm0, (DWORD_BIT-WORD_BIT)  ; mm0=in3L=(30 31)
+    cvtpi2ps    xmm0, mm2                  ; xmm0=(32 33 ** **)
+    cvtpi2ps    xmm3, mm0                  ; xmm3=(30 31 ** **)
+
+    punpckhwd   mm7, mm5                ; mm7=(** 52 ** 53)
+    punpcklwd   mm5, mm5                ; mm5=(50 50 51 51)
+    punpckhwd   mm3, mm1                ; mm3=(** 72 ** 73)
+    punpcklwd   mm1, mm1                ; mm1=(70 70 71 71)
+
+    movlhps     xmm2, xmm4              ; xmm2=in1=(10 11 12 13)
+    movlhps     xmm3, xmm0              ; xmm3=in3=(30 31 32 33)
+
+    psrad       mm7, (DWORD_BIT-WORD_BIT)  ; mm7=in5H=(52 53)
+    psrad       mm5, (DWORD_BIT-WORD_BIT)  ; mm5=in5L=(50 51)
+    cvtpi2ps    xmm4, mm7                  ; xmm4=(52 53 ** **)
+    cvtpi2ps    xmm5, mm5                  ; xmm5=(50 51 ** **)
+    psrad       mm3, (DWORD_BIT-WORD_BIT)  ; mm3=in7H=(72 73)
+    psrad       mm1, (DWORD_BIT-WORD_BIT)  ; mm1=in7L=(70 71)
+    cvtpi2ps    xmm0, mm3                  ; xmm0=(72 73 ** **)
+    cvtpi2ps    xmm1, mm1                  ; xmm1=(70 71 ** **)
+
+    mulps       xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+    mulps       xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+    movlhps     xmm5, xmm4              ; xmm5=in5=(50 51 52 53)
+    movlhps     xmm1, xmm0              ; xmm1=in7=(70 71 72 73)
+    mulps       xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+    mulps       xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+    movaps      xmm4, xmm2
+    movaps      xmm0, xmm5
+    addps       xmm2, xmm1              ; xmm2=z11
+    addps       xmm5, xmm3              ; xmm5=z13
+    subps       xmm4, xmm1              ; xmm4=z12
+    subps       xmm0, xmm3              ; xmm0=z10
+
+    movaps      xmm1, xmm2
+    subps       xmm2, xmm5
+    addps       xmm1, xmm5              ; xmm1=tmp7
+
+    mulps       xmm2, [GOTOFF(ebx,PD_1_414)]  ; xmm2=tmp11
+
+    movaps      xmm3, xmm0
+    addps       xmm0, xmm4
+    mulps       xmm0, [GOTOFF(ebx,PD_1_847)]   ; xmm0=z5
+    mulps       xmm3, [GOTOFF(ebx,PD_M2_613)]  ; xmm3=(z10 * -2.613125930)
+    mulps       xmm4, [GOTOFF(ebx,PD_1_082)]   ; xmm4=(z12 * 1.082392200)
+    addps       xmm3, xmm0                     ; xmm3=tmp12
+    subps       xmm4, xmm0                     ; xmm4=tmp10
+
+    ; -- Final output stage
+
+    subps       xmm3, xmm1              ; xmm3=tmp6
+    movaps      xmm5, xmm6
+    movaps      xmm0, xmm7
+    addps       xmm6, xmm1              ; xmm6=data0=(00 01 02 03)
+    addps       xmm7, xmm3              ; xmm7=data1=(10 11 12 13)
+    subps       xmm5, xmm1              ; xmm5=data7=(70 71 72 73)
+    subps       xmm0, xmm3              ; xmm0=data6=(60 61 62 63)
+    subps       xmm2, xmm3              ; xmm2=tmp5
+
+    movaps      xmm1, xmm6              ; transpose coefficients(phase 1)
+    unpcklps    xmm6, xmm7              ; xmm6=(00 10 01 11)
+    unpckhps    xmm1, xmm7              ; xmm1=(02 12 03 13)
+    movaps      xmm3, xmm0              ; transpose coefficients(phase 1)
+    unpcklps    xmm0, xmm5              ; xmm0=(60 70 61 71)
+    unpckhps    xmm3, xmm5              ; xmm3=(62 72 63 73)
+
+    movaps      xmm7, XMMWORD [wk(0)]   ; xmm7=tmp2
+    movaps      xmm5, XMMWORD [wk(1)]   ; xmm5=tmp3
+
+    movaps      XMMWORD [wk(0)], xmm0   ; wk(0)=(60 70 61 71)
+    movaps      XMMWORD [wk(1)], xmm3   ; wk(1)=(62 72 63 73)
+
+    addps       xmm4, xmm2              ; xmm4=tmp4
+    movaps      xmm0, xmm7
+    movaps      xmm3, xmm5
+    addps       xmm7, xmm2              ; xmm7=data2=(20 21 22 23)
+    addps       xmm5, xmm4              ; xmm5=data4=(40 41 42 43)
+    subps       xmm0, xmm2              ; xmm0=data5=(50 51 52 53)
+    subps       xmm3, xmm4              ; xmm3=data3=(30 31 32 33)
+
+    movaps      xmm2, xmm7              ; transpose coefficients(phase 1)
+    unpcklps    xmm7, xmm3              ; xmm7=(20 30 21 31)
+    unpckhps    xmm2, xmm3              ; xmm2=(22 32 23 33)
+    movaps      xmm4, xmm5              ; transpose coefficients(phase 1)
+    unpcklps    xmm5, xmm0              ; xmm5=(40 50 41 51)
+    unpckhps    xmm4, xmm0              ; xmm4=(42 52 43 53)
+
+    movaps      xmm3, xmm6              ; transpose coefficients(phase 2)
+    unpcklps2   xmm6, xmm7              ; xmm6=(00 10 20 30)
+    unpckhps2   xmm3, xmm7              ; xmm3=(01 11 21 31)
+    movaps      xmm0, xmm1              ; transpose coefficients(phase 2)
+    unpcklps2   xmm1, xmm2              ; xmm1=(02 12 22 32)
+    unpckhps2   xmm0, xmm2              ; xmm0=(03 13 23 33)
+
+    movaps      xmm7, XMMWORD [wk(0)]   ; xmm7=(60 70 61 71)
+    movaps      xmm2, XMMWORD [wk(1)]   ; xmm2=(62 72 63 73)
+
+    movaps      XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm6
+    movaps      XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3
+    movaps      XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm1
+    movaps      XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm0
+
+    movaps      xmm6, xmm5              ; transpose coefficients(phase 2)
+    unpcklps2   xmm5, xmm7              ; xmm5=(40 50 60 70)
+    unpckhps2   xmm6, xmm7              ; xmm6=(41 51 61 71)
+    movaps      xmm3, xmm4              ; transpose coefficients(phase 2)
+    unpcklps2   xmm4, xmm2              ; xmm4=(42 52 62 72)
+    unpckhps2   xmm3, xmm2              ; xmm3=(43 53 63 73)
+
+    movaps      XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm5
+    movaps      XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6
+    movaps      XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm4
+    movaps      XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
+
+.nextcolumn:
+    add         esi, byte 4*SIZEOF_JCOEF               ; coef_block
+    add         edx, byte 4*SIZEOF_FLOAT_MULT_TYPE     ; quantptr
+    add         edi,      4*DCTSIZE*SIZEOF_FAST_FLOAT  ; wsptr
+    dec         ecx                                    ; ctr
+    jnz         near .columnloop
+
+    ; -- Prefetch the next coefficient block
+
+    prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
+    prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
+    prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
+    prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
+
+    ; ---- Pass 2: process rows from work array, store into output array.
+
+    mov         eax, [original_ebp]
+    lea         esi, [workspace]                   ; FAST_FLOAT *wsptr
+    mov         edi, JSAMPARRAY [output_buf(eax)]  ; (JSAMPROW *)
+    mov         eax, JDIMENSION [output_col(eax)]
+    mov         ecx, DCTSIZE/4                     ; ctr
+    alignx      16, 7
+.rowloop:
+
+    ; -- Even part
+
+    movaps      xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
+    movaps      xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)]
+    movaps      xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)]
+    movaps      xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)]
+
+    movaps      xmm4, xmm0
+    movaps      xmm5, xmm1
+    subps       xmm0, xmm2              ; xmm0=tmp11
+    subps       xmm1, xmm3
+    addps       xmm4, xmm2              ; xmm4=tmp10
+    addps       xmm5, xmm3              ; xmm5=tmp13
+
+    mulps       xmm1, [GOTOFF(ebx,PD_1_414)]
+    subps       xmm1, xmm5              ; xmm1=tmp12
+
+    movaps      xmm6, xmm4
+    movaps      xmm7, xmm0
+    subps       xmm4, xmm5              ; xmm4=tmp3
+    subps       xmm0, xmm1              ; xmm0=tmp2
+    addps       xmm6, xmm5              ; xmm6=tmp0
+    addps       xmm7, xmm1              ; xmm7=tmp1
+
+    movaps      XMMWORD [wk(1)], xmm4   ; tmp3
+    movaps      XMMWORD [wk(0)], xmm0   ; tmp2
+
+    ; -- Odd part
+
+    movaps      xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
+    movaps      xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)]
+    movaps      xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)]
+    movaps      xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)]
+
+    movaps      xmm4, xmm2
+    movaps      xmm0, xmm5
+    addps       xmm2, xmm1              ; xmm2=z11
+    addps       xmm5, xmm3              ; xmm5=z13
+    subps       xmm4, xmm1              ; xmm4=z12
+    subps       xmm0, xmm3              ; xmm0=z10
+
+    movaps      xmm1, xmm2
+    subps       xmm2, xmm5
+    addps       xmm1, xmm5              ; xmm1=tmp7
+
+    mulps       xmm2, [GOTOFF(ebx,PD_1_414)]  ; xmm2=tmp11
+
+    movaps      xmm3, xmm0
+    addps       xmm0, xmm4
+    mulps       xmm0, [GOTOFF(ebx,PD_1_847)]   ; xmm0=z5
+    mulps       xmm3, [GOTOFF(ebx,PD_M2_613)]  ; xmm3=(z10 * -2.613125930)
+    mulps       xmm4, [GOTOFF(ebx,PD_1_082)]   ; xmm4=(z12 * 1.082392200)
+    addps       xmm3, xmm0                     ; xmm3=tmp12
+    subps       xmm4, xmm0                     ; xmm4=tmp10
+
+    ; -- Final output stage
+
+    subps       xmm3, xmm1              ; xmm3=tmp6
+    movaps      xmm5, xmm6
+    movaps      xmm0, xmm7
+    addps       xmm6, xmm1              ; xmm6=data0=(00 10 20 30)
+    addps       xmm7, xmm3              ; xmm7=data1=(01 11 21 31)
+    subps       xmm5, xmm1              ; xmm5=data7=(07 17 27 37)
+    subps       xmm0, xmm3              ; xmm0=data6=(06 16 26 36)
+    subps       xmm2, xmm3              ; xmm2=tmp5
+
+    movaps      xmm1, [GOTOFF(ebx,PD_0_125)]  ; xmm1=[PD_0_125]
+
+    mulps       xmm6, xmm1              ; descale(1/8)
+    mulps       xmm7, xmm1              ; descale(1/8)
+    mulps       xmm5, xmm1              ; descale(1/8)
+    mulps       xmm0, xmm1              ; descale(1/8)
+
+    movhlps     xmm3, xmm6
+    movhlps     xmm1, xmm7
+    cvtps2pi    mm0, xmm6               ; round to int32, mm0=data0L=(00 10)
+    cvtps2pi    mm1, xmm7               ; round to int32, mm1=data1L=(01 11)
+    cvtps2pi    mm2, xmm3               ; round to int32, mm2=data0H=(20 30)
+    cvtps2pi    mm3, xmm1               ; round to int32, mm3=data1H=(21 31)
+    packssdw    mm0, mm2                ; mm0=data0=(00 10 20 30)
+    packssdw    mm1, mm3                ; mm1=data1=(01 11 21 31)
+
+    movhlps     xmm6, xmm5
+    movhlps     xmm7, xmm0
+    cvtps2pi    mm4, xmm5               ; round to int32, mm4=data7L=(07 17)
+    cvtps2pi    mm5, xmm0               ; round to int32, mm5=data6L=(06 16)
+    cvtps2pi    mm6, xmm6               ; round to int32, mm6=data7H=(27 37)
+    cvtps2pi    mm7, xmm7               ; round to int32, mm7=data6H=(26 36)
+    packssdw    mm4, mm6                ; mm4=data7=(07 17 27 37)
+    packssdw    mm5, mm7                ; mm5=data6=(06 16 26 36)
+
+    packsswb    mm0, mm5                ; mm0=(00 10 20 30 06 16 26 36)
+    packsswb    mm1, mm4                ; mm1=(01 11 21 31 07 17 27 37)
+
+    movaps      xmm3, XMMWORD [wk(0)]   ; xmm3=tmp2
+    movaps      xmm1, XMMWORD [wk(1)]   ; xmm1=tmp3
+
+    movaps      xmm6, [GOTOFF(ebx,PD_0_125)]  ; xmm6=[PD_0_125]
+
+    addps       xmm4, xmm2              ; xmm4=tmp4
+    movaps      xmm5, xmm3
+    movaps      xmm0, xmm1
+    addps       xmm3, xmm2              ; xmm3=data2=(02 12 22 32)
+    addps       xmm1, xmm4              ; xmm1=data4=(04 14 24 34)
+    subps       xmm5, xmm2              ; xmm5=data5=(05 15 25 35)
+    subps       xmm0, xmm4              ; xmm0=data3=(03 13 23 33)
+
+    mulps       xmm3, xmm6              ; descale(1/8)
+    mulps       xmm1, xmm6              ; descale(1/8)
+    mulps       xmm5, xmm6              ; descale(1/8)
+    mulps       xmm0, xmm6              ; descale(1/8)
+
+    movhlps     xmm7, xmm3
+    movhlps     xmm2, xmm1
+    cvtps2pi    mm2, xmm3               ; round to int32, mm2=data2L=(02 12)
+    cvtps2pi    mm3, xmm1               ; round to int32, mm3=data4L=(04 14)
+    cvtps2pi    mm6, xmm7               ; round to int32, mm6=data2H=(22 32)
+    cvtps2pi    mm7, xmm2               ; round to int32, mm7=data4H=(24 34)
+    packssdw    mm2, mm6                ; mm2=data2=(02 12 22 32)
+    packssdw    mm3, mm7                ; mm3=data4=(04 14 24 34)
+
+    movhlps     xmm4, xmm5
+    movhlps     xmm6, xmm0
+    cvtps2pi    mm5, xmm5               ; round to int32, mm5=data5L=(05 15)
+    cvtps2pi    mm4, xmm0               ; round to int32, mm4=data3L=(03 13)
+    cvtps2pi    mm6, xmm4               ; round to int32, mm6=data5H=(25 35)
+    cvtps2pi    mm7, xmm6               ; round to int32, mm7=data3H=(23 33)
+    packssdw    mm5, mm6                ; mm5=data5=(05 15 25 35)
+    packssdw    mm4, mm7                ; mm4=data3=(03 13 23 33)
+
+    movq        mm6, [GOTOFF(ebx,PB_CENTERJSAMP)]  ; mm6=[PB_CENTERJSAMP]
+
+    packsswb    mm2, mm3                ; mm2=(02 12 22 32 04 14 24 34)
+    packsswb    mm4, mm5                ; mm4=(03 13 23 33 05 15 25 35)
+
+    paddb       mm0, mm6
+    paddb       mm1, mm6
+    paddb       mm2, mm6
+    paddb       mm4, mm6
+
+    movq        mm7, mm0                ; transpose coefficients(phase 1)
+    punpcklbw   mm0, mm1                ; mm0=(00 01 10 11 20 21 30 31)
+    punpckhbw   mm7, mm1                ; mm7=(06 07 16 17 26 27 36 37)
+    movq        mm3, mm2                ; transpose coefficients(phase 1)
+    punpcklbw   mm2, mm4                ; mm2=(02 03 12 13 22 23 32 33)
+    punpckhbw   mm3, mm4                ; mm3=(04 05 14 15 24 25 34 35)
+
+    movq        mm5, mm0                ; transpose coefficients(phase 2)
+    punpcklwd   mm0, mm2                ; mm0=(00 01 02 03 10 11 12 13)
+    punpckhwd   mm5, mm2                ; mm5=(20 21 22 23 30 31 32 33)
+    movq        mm6, mm3                ; transpose coefficients(phase 2)
+    punpcklwd   mm3, mm7                ; mm3=(04 05 06 07 14 15 16 17)
+    punpckhwd   mm6, mm7                ; mm6=(24 25 26 27 34 35 36 37)
+
+    movq        mm1, mm0                ; transpose coefficients(phase 3)
+    punpckldq   mm0, mm3                ; mm0=(00 01 02 03 04 05 06 07)
+    punpckhdq   mm1, mm3                ; mm1=(10 11 12 13 14 15 16 17)
+    movq        mm4, mm5                ; transpose coefficients(phase 3)
+    punpckldq   mm5, mm6                ; mm5=(20 21 22 23 24 25 26 27)
+    punpckhdq   mm4, mm6                ; mm4=(30 31 32 33 34 35 36 37)
+
+    pushpic     ebx                     ; save GOT address
+
+    mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+    mov         ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+    movq        MMWORD [edx+eax*SIZEOF_JSAMPLE], mm0
+    movq        MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm1
+    mov         edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
+    mov         ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
+    movq        MMWORD [edx+eax*SIZEOF_JSAMPLE], mm5
+    movq        MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm4
+
+    poppic      ebx                     ; restore GOT address
+
+    add         esi, byte 4*SIZEOF_FAST_FLOAT  ; wsptr
+    add         edi, byte 4*SIZEOF_JSAMPROW
+    dec         ecx                            ; ctr
+    jnz         near .rowloop
+
+    emms                                ; empty MMX state
+
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/i386/jidctflt-sse2.asm b/media/libjpeg/simd/i386/jidctflt-sse2.asm
new file mode 100644
index 0000000000..c646eaef76
--- /dev/null
+++ b/media/libjpeg/simd/i386/jidctflt-sse2.asm
@@ -0,0 +1,497 @@
+;
+; jidctflt.asm - floating-point IDCT (SSE & SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a floating-point implementation of the inverse DCT
+; (Discrete Cosine Transform). The following code is based directly on
+; the IJG's original jidctflt.c; see the jidctflt.c for more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%macro unpcklps2 2  ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
+    shufps      %1, %2, 0x44
+%endmacro
+
+%macro unpckhps2 2  ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
+    shufps      %1, %2, 0xEE
+%endmacro
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_idct_float_sse2)
+
+EXTN(jconst_idct_float_sse2):
+
+PD_1_414        times 4  dd  1.414213562373095048801689
+PD_1_847        times 4  dd  1.847759065022573512256366
+PD_1_082        times 4  dd  1.082392200292393968799446
+PD_M2_613       times 4  dd -2.613125929752753055713286
+PD_RNDINT_MAGIC times 4  dd  100663296.0  ; (float)(0x00C00000 << 3)
+PB_CENTERJSAMP  times 16 db  CENTERJSAMPLE
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_float_sse2(void *dct_table, JCOEFPTR coef_block,
+;                       JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define dct_table(b)   (b) + 8          ; void *dct_table
+%define coef_block(b)  (b) + 12         ; JCOEFPTR coef_block
+%define output_buf(b)  (b) + 16         ; JSAMPARRAY output_buf
+%define output_col(b)  (b) + 20         ; JDIMENSION output_col
+
+%define original_ebp   ebp + 0
+%define wk(i)          ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+                                        ; xmmword wk[WK_NUM]
+%define WK_NUM         2
+%define workspace      wk(0) - DCTSIZE2 * SIZEOF_FAST_FLOAT
+                                        ; FAST_FLOAT workspace[DCTSIZE2]
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_idct_float_sse2)
+
+EXTN(jsimd_idct_float_sse2):
+    push        ebp
+    mov         eax, esp                     ; eax = original ebp
+    sub         esp, byte 4
+    and         esp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [esp], eax
+    mov         ebp, esp                     ; ebp = aligned ebp
+    lea         esp, [workspace]
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+
+    ; ---- Pass 1: process columns from input, store into work array.
+
+;   mov         eax, [original_ebp]
+    mov         edx, POINTER [dct_table(eax)]    ; quantptr
+    mov         esi, JCOEFPTR [coef_block(eax)]  ; inptr
+    lea         edi, [workspace]                 ; FAST_FLOAT *wsptr
+    mov         ecx, DCTSIZE/4                   ; ctr
+    alignx      16, 7
+.columnloop:
+%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
+    mov         eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    or          eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    jnz         near .columnDCT
+
+    movq        xmm1, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    movq        xmm2, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    movq        xmm3, XMM_MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+    movq        xmm4, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+    movq        xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+    movq        xmm6, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+    movq        xmm7, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+    por         xmm1, xmm2
+    por         xmm3, xmm4
+    por         xmm5, xmm6
+    por         xmm1, xmm3
+    por         xmm5, xmm7
+    por         xmm1, xmm5
+    packsswb    xmm1, xmm1
+    movd        eax, xmm1
+    test        eax, eax
+    jnz         short .columnDCT
+
+    ; -- AC terms all zero
+
+    movq        xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+
+    punpcklwd   xmm0, xmm0                  ; xmm0=(00 00 01 01 02 02 03 03)
+    psrad       xmm0, (DWORD_BIT-WORD_BIT)  ; xmm0=in0=(00 01 02 03)
+    cvtdq2ps    xmm0, xmm0                  ; xmm0=in0=(00 01 02 03)
+
+    mulps       xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+    movaps      xmm1, xmm0
+    movaps      xmm2, xmm0
+    movaps      xmm3, xmm0
+
+    shufps      xmm0, xmm0, 0x00        ; xmm0=(00 00 00 00)
+    shufps      xmm1, xmm1, 0x55        ; xmm1=(01 01 01 01)
+    shufps      xmm2, xmm2, 0xAA        ; xmm2=(02 02 02 02)
+    shufps      xmm3, xmm3, 0xFF        ; xmm3=(03 03 03 03)
+
+    movaps      XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0
+    movaps      XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0
+    movaps      XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm1
+    movaps      XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1
+    movaps      XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm2
+    movaps      XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm2
+    movaps      XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm3
+    movaps      XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
+    jmp         near .nextcolumn
+    alignx      16, 7
+%endif
+.columnDCT:
+
+    ; -- Even part
+
+    movq        xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+    movq        xmm1, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    movq        xmm2, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+    movq        xmm3, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+
+    punpcklwd   xmm0, xmm0                  ; xmm0=(00 00 01 01 02 02 03 03)
+    punpcklwd   xmm1, xmm1                  ; xmm1=(20 20 21 21 22 22 23 23)
+    psrad       xmm0, (DWORD_BIT-WORD_BIT)  ; xmm0=in0=(00 01 02 03)
+    psrad       xmm1, (DWORD_BIT-WORD_BIT)  ; xmm1=in2=(20 21 22 23)
+    cvtdq2ps    xmm0, xmm0                  ; xmm0=in0=(00 01 02 03)
+    cvtdq2ps    xmm1, xmm1                  ; xmm1=in2=(20 21 22 23)
+
+    punpcklwd   xmm2, xmm2                  ; xmm2=(40 40 41 41 42 42 43 43)
+    punpcklwd   xmm3, xmm3                  ; xmm3=(60 60 61 61 62 62 63 63)
+    psrad       xmm2, (DWORD_BIT-WORD_BIT)  ; xmm2=in4=(40 41 42 43)
+    psrad       xmm3, (DWORD_BIT-WORD_BIT)  ; xmm3=in6=(60 61 62 63)
+    cvtdq2ps    xmm2, xmm2                  ; xmm2=in4=(40 41 42 43)
+    cvtdq2ps    xmm3, xmm3                  ; xmm3=in6=(60 61 62 63)
+
+    mulps       xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+    mulps       xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+    mulps       xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+    mulps       xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+    movaps      xmm4, xmm0
+    movaps      xmm5, xmm1
+    subps       xmm0, xmm2              ; xmm0=tmp11
+    subps       xmm1, xmm3
+    addps       xmm4, xmm2              ; xmm4=tmp10
+    addps       xmm5, xmm3              ; xmm5=tmp13
+
+    mulps       xmm1, [GOTOFF(ebx,PD_1_414)]
+    subps       xmm1, xmm5              ; xmm1=tmp12
+
+    movaps      xmm6, xmm4
+    movaps      xmm7, xmm0
+    subps       xmm4, xmm5              ; xmm4=tmp3
+    subps       xmm0, xmm1              ; xmm0=tmp2
+    addps       xmm6, xmm5              ; xmm6=tmp0
+    addps       xmm7, xmm1              ; xmm7=tmp1
+
+    movaps      XMMWORD [wk(1)], xmm4   ; tmp3
+    movaps      XMMWORD [wk(0)], xmm0   ; tmp2
+
+    ; -- Odd part
+
+    movq        xmm2, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    movq        xmm3, XMM_MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+    movq        xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+    movq        xmm1, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+
+    punpcklwd   xmm2, xmm2                  ; xmm2=(10 10 11 11 12 12 13 13)
+    punpcklwd   xmm3, xmm3                  ; xmm3=(30 30 31 31 32 32 33 33)
+    psrad       xmm2, (DWORD_BIT-WORD_BIT)  ; xmm2=in1=(10 11 12 13)
+    psrad       xmm3, (DWORD_BIT-WORD_BIT)  ; xmm3=in3=(30 31 32 33)
+    cvtdq2ps    xmm2, xmm2                  ; xmm2=in1=(10 11 12 13)
+    cvtdq2ps    xmm3, xmm3                  ; xmm3=in3=(30 31 32 33)
+
+    punpcklwd   xmm5, xmm5                  ; xmm5=(50 50 51 51 52 52 53 53)
+    punpcklwd   xmm1, xmm1                  ; xmm1=(70 70 71 71 72 72 73 73)
+    psrad       xmm5, (DWORD_BIT-WORD_BIT)  ; xmm5=in5=(50 51 52 53)
+    psrad       xmm1, (DWORD_BIT-WORD_BIT)  ; xmm1=in7=(70 71 72 73)
+    cvtdq2ps    xmm5, xmm5                  ; xmm5=in5=(50 51 52 53)
+    cvtdq2ps    xmm1, xmm1                  ; xmm1=in7=(70 71 72 73)
+
+    mulps       xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+    mulps       xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+    mulps       xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+    mulps       xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+    movaps      xmm4, xmm2
+    movaps      xmm0, xmm5
+    addps       xmm2, xmm1              ; xmm2=z11
+    addps       xmm5, xmm3              ; xmm5=z13
+    subps       xmm4, xmm1              ; xmm4=z12
+    subps       xmm0, xmm3              ; xmm0=z10
+
+    movaps      xmm1, xmm2
+    subps       xmm2, xmm5
+    addps       xmm1, xmm5              ; xmm1=tmp7
+
+    mulps       xmm2, [GOTOFF(ebx,PD_1_414)]  ; xmm2=tmp11
+
+    movaps      xmm3, xmm0
+    addps       xmm0, xmm4
+    mulps       xmm0, [GOTOFF(ebx,PD_1_847)]   ; xmm0=z5
+    mulps       xmm3, [GOTOFF(ebx,PD_M2_613)]  ; xmm3=(z10 * -2.613125930)
+    mulps       xmm4, [GOTOFF(ebx,PD_1_082)]   ; xmm4=(z12 * 1.082392200)
+    addps       xmm3, xmm0                     ; xmm3=tmp12
+    subps       xmm4, xmm0                     ; xmm4=tmp10
+
+    ; -- Final output stage
+
+    subps       xmm3, xmm1              ; xmm3=tmp6
+    movaps      xmm5, xmm6
+    movaps      xmm0, xmm7
+    addps       xmm6, xmm1              ; xmm6=data0=(00 01 02 03)
+    addps       xmm7, xmm3              ; xmm7=data1=(10 11 12 13)
+    subps       xmm5, xmm1              ; xmm5=data7=(70 71 72 73)
+    subps       xmm0, xmm3              ; xmm0=data6=(60 61 62 63)
+    subps       xmm2, xmm3              ; xmm2=tmp5
+
+    movaps      xmm1, xmm6              ; transpose coefficients(phase 1)
+    unpcklps    xmm6, xmm7              ; xmm6=(00 10 01 11)
+    unpckhps    xmm1, xmm7              ; xmm1=(02 12 03 13)
+    movaps      xmm3, xmm0              ; transpose coefficients(phase 1)
+    unpcklps    xmm0, xmm5              ; xmm0=(60 70 61 71)
+    unpckhps    xmm3, xmm5              ; xmm3=(62 72 63 73)
+
+    movaps      xmm7, XMMWORD [wk(0)]   ; xmm7=tmp2
+    movaps      xmm5, XMMWORD [wk(1)]   ; xmm5=tmp3
+
+    movaps      XMMWORD [wk(0)], xmm0   ; wk(0)=(60 70 61 71)
+    movaps      XMMWORD [wk(1)], xmm3   ; wk(1)=(62 72 63 73)
+
+    addps       xmm4, xmm2              ; xmm4=tmp4
+    movaps      xmm0, xmm7
+    movaps      xmm3, xmm5
+    addps       xmm7, xmm2              ; xmm7=data2=(20 21 22 23)
+    addps       xmm5, xmm4              ; xmm5=data4=(40 41 42 43)
+    subps       xmm0, xmm2              ; xmm0=data5=(50 51 52 53)
+    subps       xmm3, xmm4              ; xmm3=data3=(30 31 32 33)
+
+    movaps      xmm2, xmm7              ; transpose coefficients(phase 1)
+    unpcklps    xmm7, xmm3              ; xmm7=(20 30 21 31)
+    unpckhps    xmm2, xmm3              ; xmm2=(22 32 23 33)
+    movaps      xmm4, xmm5              ; transpose coefficients(phase 1)
+    unpcklps    xmm5, xmm0              ; xmm5=(40 50 41 51)
+    unpckhps    xmm4, xmm0              ; xmm4=(42 52 43 53)
+
+    movaps      xmm3, xmm6              ; transpose coefficients(phase 2)
+    unpcklps2   xmm6, xmm7              ; xmm6=(00 10 20 30)
+    unpckhps2   xmm3, xmm7              ; xmm3=(01 11 21 31)
+    movaps      xmm0, xmm1              ; transpose coefficients(phase 2)
+    unpcklps2   xmm1, xmm2              ; xmm1=(02 12 22 32)
+    unpckhps2   xmm0, xmm2              ; xmm0=(03 13 23 33)
+
+    movaps      xmm7, XMMWORD [wk(0)]   ; xmm7=(60 70 61 71)
+    movaps      xmm2, XMMWORD [wk(1)]   ; xmm2=(62 72 63 73)
+
+    movaps      XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm6
+    movaps      XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3
+    movaps      XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm1
+    movaps      XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm0
+
+    movaps      xmm6, xmm5              ; transpose coefficients(phase 2)
+    unpcklps2   xmm5, xmm7              ; xmm5=(40 50 60 70)
+    unpckhps2   xmm6, xmm7              ; xmm6=(41 51 61 71)
+    movaps      xmm3, xmm4              ; transpose coefficients(phase 2)
+    unpcklps2   xmm4, xmm2              ; xmm4=(42 52 62 72)
+    unpckhps2   xmm3, xmm2              ; xmm3=(43 53 63 73)
+
+    movaps      XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm5
+    movaps      XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6
+    movaps      XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm4
+    movaps      XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
+
+.nextcolumn:
+    add         esi, byte 4*SIZEOF_JCOEF               ; coef_block
+    add         edx, byte 4*SIZEOF_FLOAT_MULT_TYPE     ; quantptr
+    add         edi,      4*DCTSIZE*SIZEOF_FAST_FLOAT  ; wsptr
+    dec         ecx                                    ; ctr
+    jnz         near .columnloop
+
+    ; -- Prefetch the next coefficient block
+
+    prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
+    prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
+    prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
+    prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
+
+    ; ---- Pass 2: process rows from work array, store into output array.
+
+    mov         eax, [original_ebp]
+    lea         esi, [workspace]                   ; FAST_FLOAT *wsptr
+    mov         edi, JSAMPARRAY [output_buf(eax)]  ; (JSAMPROW *)
+    mov         eax, JDIMENSION [output_col(eax)]
+    mov         ecx, DCTSIZE/4                     ; ctr
+    alignx      16, 7
+.rowloop:
+
+    ; -- Even part
+
+    movaps      xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
+    movaps      xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)]
+    movaps      xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)]
+    movaps      xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)]
+
+    movaps      xmm4, xmm0
+    movaps      xmm5, xmm1
+    subps       xmm0, xmm2              ; xmm0=tmp11
+    subps       xmm1, xmm3
+    addps       xmm4, xmm2              ; xmm4=tmp10
+    addps       xmm5, xmm3              ; xmm5=tmp13
+
+    mulps       xmm1, [GOTOFF(ebx,PD_1_414)]
+    subps       xmm1, xmm5              ; xmm1=tmp12
+
+    movaps      xmm6, xmm4
+    movaps      xmm7, xmm0
+    subps       xmm4, xmm5              ; xmm4=tmp3
+    subps       xmm0, xmm1              ; xmm0=tmp2
+    addps       xmm6, xmm5              ; xmm6=tmp0
+    addps       xmm7, xmm1              ; xmm7=tmp1
+
+    movaps      XMMWORD [wk(1)], xmm4   ; tmp3
+    movaps      XMMWORD [wk(0)], xmm0   ; tmp2
+
+    ; -- Odd part
+
+    movaps      xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
+    movaps      xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)]
+    movaps      xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)]
+    movaps      xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)]
+
+    movaps      xmm4, xmm2
+    movaps      xmm0, xmm5
+    addps       xmm2, xmm1              ; xmm2=z11
+    addps       xmm5, xmm3              ; xmm5=z13
+    subps       xmm4, xmm1              ; xmm4=z12
+    subps       xmm0, xmm3              ; xmm0=z10
+
+    movaps      xmm1, xmm2
+    subps       xmm2, xmm5
+    addps       xmm1, xmm5              ; xmm1=tmp7
+
+    mulps       xmm2, [GOTOFF(ebx,PD_1_414)]  ; xmm2=tmp11
+
+    movaps      xmm3, xmm0
+    addps       xmm0, xmm4
+    mulps       xmm0, [GOTOFF(ebx,PD_1_847)]   ; xmm0=z5
+    mulps       xmm3, [GOTOFF(ebx,PD_M2_613)]  ; xmm3=(z10 * -2.613125930)
+    mulps       xmm4, [GOTOFF(ebx,PD_1_082)]   ; xmm4=(z12 * 1.082392200)
+    addps       xmm3, xmm0                     ; xmm3=tmp12
+    subps       xmm4, xmm0                     ; xmm4=tmp10
+
+    ; -- Final output stage
+
+    subps       xmm3, xmm1              ; xmm3=tmp6
+    movaps      xmm5, xmm6
+    movaps      xmm0, xmm7
+    addps       xmm6, xmm1              ; xmm6=data0=(00 10 20 30)
+    addps       xmm7, xmm3              ; xmm7=data1=(01 11 21 31)
+    subps       xmm5, xmm1              ; xmm5=data7=(07 17 27 37)
+    subps       xmm0, xmm3              ; xmm0=data6=(06 16 26 36)
+    subps       xmm2, xmm3              ; xmm2=tmp5
+
+    movaps      xmm1, [GOTOFF(ebx,PD_RNDINT_MAGIC)]  ; xmm1=[PD_RNDINT_MAGIC]
+    pcmpeqd     xmm3, xmm3
+    psrld       xmm3, WORD_BIT          ; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..}
+
+    addps       xmm6, xmm1              ; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **)
+    addps       xmm7, xmm1              ; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **)
+    addps       xmm0, xmm1              ; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **)
+    addps       xmm5, xmm1              ; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **)
+
+    pand        xmm6, xmm3              ; xmm6=(00 -- 10 -- 20 -- 30 --)
+    pslld       xmm7, WORD_BIT          ; xmm7=(-- 01 -- 11 -- 21 -- 31)
+    pand        xmm0, xmm3              ; xmm0=(06 -- 16 -- 26 -- 36 --)
+    pslld       xmm5, WORD_BIT          ; xmm5=(-- 07 -- 17 -- 27 -- 37)
+    por         xmm6, xmm7              ; xmm6=(00 01 10 11 20 21 30 31)
+    por         xmm0, xmm5              ; xmm0=(06 07 16 17 26 27 36 37)
+
+    movaps      xmm1, XMMWORD [wk(0)]   ; xmm1=tmp2
+    movaps      xmm3, XMMWORD [wk(1)]   ; xmm3=tmp3
+
+    addps       xmm4, xmm2              ; xmm4=tmp4
+    movaps      xmm7, xmm1
+    movaps      xmm5, xmm3
+    addps       xmm1, xmm2              ; xmm1=data2=(02 12 22 32)
+    addps       xmm3, xmm4              ; xmm3=data4=(04 14 24 34)
+    subps       xmm7, xmm2              ; xmm7=data5=(05 15 25 35)
+    subps       xmm5, xmm4              ; xmm5=data3=(03 13 23 33)
+
+    movaps      xmm2, [GOTOFF(ebx,PD_RNDINT_MAGIC)]  ; xmm2=[PD_RNDINT_MAGIC]
+    pcmpeqd     xmm4, xmm4
+    psrld       xmm4, WORD_BIT          ; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..}
+
+    addps       xmm3, xmm2              ; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **)
+    addps       xmm7, xmm2              ; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **)
+    addps       xmm1, xmm2              ; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **)
+    addps       xmm5, xmm2              ; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **)
+
+    pand        xmm3, xmm4              ; xmm3=(04 -- 14 -- 24 -- 34 --)
+    pslld       xmm7, WORD_BIT          ; xmm7=(-- 05 -- 15 -- 25 -- 35)
+    pand        xmm1, xmm4              ; xmm1=(02 -- 12 -- 22 -- 32 --)
+    pslld       xmm5, WORD_BIT          ; xmm5=(-- 03 -- 13 -- 23 -- 33)
+    por         xmm3, xmm7              ; xmm3=(04 05 14 15 24 25 34 35)
+    por         xmm1, xmm5              ; xmm1=(02 03 12 13 22 23 32 33)
+
+    movdqa      xmm2, [GOTOFF(ebx,PB_CENTERJSAMP)]  ; xmm2=[PB_CENTERJSAMP]
+
+    packsswb    xmm6, xmm3        ; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35)
+    packsswb    xmm1, xmm0        ; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37)
+    paddb       xmm6, xmm2
+    paddb       xmm1, xmm2
+
+    movdqa      xmm4, xmm6        ; transpose coefficients(phase 2)
+    punpcklwd   xmm6, xmm1        ; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
+    punpckhwd   xmm4, xmm1        ; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
+
+    movdqa      xmm7, xmm6        ; transpose coefficients(phase 3)
+    punpckldq   xmm6, xmm4        ; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
+    punpckhdq   xmm7, xmm4        ; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
+
+    pshufd      xmm5, xmm6, 0x4E  ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
+    pshufd      xmm3, xmm7, 0x4E  ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
+
+    pushpic     ebx                     ; save GOT address
+
+    mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+    mov         ebx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
+    movq        XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6
+    movq        XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm7
+    mov         edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+    mov         ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
+    movq        XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm5
+    movq        XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm3
+
+    poppic      ebx                     ; restore GOT address
+
+    add         esi, byte 4*SIZEOF_FAST_FLOAT  ; wsptr
+    add         edi, byte 4*SIZEOF_JSAMPROW
+    dec         ecx                            ; ctr
+    jnz         near .rowloop
+
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/i386/jidctfst-mmx.asm b/media/libjpeg/simd/i386/jidctfst-mmx.asm
new file mode 100644
index 0000000000..24622d4369
--- /dev/null
+++ b/media/libjpeg/simd/i386/jidctfst-mmx.asm
@@ -0,0 +1,499 @@
+;
+; jidctfst.asm - fast integer IDCT (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a fast, not so accurate integer implementation of
+; the inverse DCT (Discrete Cosine Transform). The following code is
+; based directly on the IJG's original jidctfst.c; see the jidctfst.c
+; for more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS  8  ; 14 is also OK.
+%define PASS1_BITS  2
+
+%if IFAST_SCALE_BITS != PASS1_BITS
+%error "'IFAST_SCALE_BITS' must be equal to 'PASS1_BITS'."
+%endif
+
+%if CONST_BITS == 8
+F_1_082 equ 277              ; FIX(1.082392200)
+F_1_414 equ 362              ; FIX(1.414213562)
+F_1_847 equ 473              ; FIX(1.847759065)
+F_2_613 equ 669              ; FIX(2.613125930)
+F_1_613 equ (F_2_613 - 256)  ; FIX(2.613125930) - FIX(1)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x, n)  (((x) + (1 << ((n) - 1))) >> (n))
+F_1_082 equ DESCALE(1162209775, 30 - CONST_BITS)  ; FIX(1.082392200)
+F_1_414 equ DESCALE(1518500249, 30 - CONST_BITS)  ; FIX(1.414213562)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS)  ; FIX(1.847759065)
+F_2_613 equ DESCALE(2805822602, 30 - CONST_BITS)  ; FIX(2.613125930)
+F_1_613 equ (F_2_613 - (1 << CONST_BITS))       ; FIX(2.613125930) - FIX(1)
+%endif
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
+; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
+
+%define PRE_MULTIPLY_SCALE_BITS  2
+%define CONST_SHIFT              (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
+
+    alignz      32
+    GLOBAL_DATA(jconst_idct_ifast_mmx)
+
+EXTN(jconst_idct_ifast_mmx):
+
+PW_F1414       times 4 dw  F_1_414 << CONST_SHIFT
+PW_F1847       times 4 dw  F_1_847 << CONST_SHIFT
+PW_MF1613      times 4 dw -F_1_613 << CONST_SHIFT
+PW_F1082       times 4 dw  F_1_082 << CONST_SHIFT
+PB_CENTERJSAMP times 8 db  CENTERJSAMPLE
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_ifast_mmx(void *dct_table, JCOEFPTR coef_block,
+;                      JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define dct_table(b)   (b) + 8          ; jpeg_component_info *compptr
+%define coef_block(b)  (b) + 12         ; JCOEFPTR coef_block
+%define output_buf(b)  (b) + 16         ; JSAMPARRAY output_buf
+%define output_col(b)  (b) + 20         ; JDIMENSION output_col
+
+%define original_ebp   ebp + 0
+%define wk(i)          ebp - (WK_NUM - (i)) * SIZEOF_MMWORD
+                                        ; mmword wk[WK_NUM]
+%define WK_NUM         2
+%define workspace      wk(0) - DCTSIZE2 * SIZEOF_JCOEF
+                                        ; JCOEF workspace[DCTSIZE2]
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_idct_ifast_mmx)
+
+EXTN(jsimd_idct_ifast_mmx):
+    push        ebp
+    mov         eax, esp                    ; eax = original ebp
+    sub         esp, byte 4
+    and         esp, byte (-SIZEOF_MMWORD)  ; align to 64 bits
+    mov         [esp], eax
+    mov         ebp, esp                    ; ebp = aligned ebp
+    lea         esp, [workspace]
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+
+    ; ---- Pass 1: process columns from input, store into work array.
+
+;   mov         eax, [original_ebp]
+    mov         edx, POINTER [dct_table(eax)]    ; quantptr
+    mov         esi, JCOEFPTR [coef_block(eax)]  ; inptr
+    lea         edi, [workspace]                 ; JCOEF *wsptr
+    mov         ecx, DCTSIZE/4                   ; ctr
+    alignx      16, 7
+.columnloop:
+%ifndef NO_ZERO_COLUMN_TEST_IFAST_MMX
+    mov         eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    or          eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    jnz         short .columnDCT
+
+    movq        mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    movq        mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    por         mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+    por         mm1, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+    por         mm0, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+    por         mm1, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+    por         mm0, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+    por         mm1, mm0
+    packsswb    mm1, mm1
+    movd        eax, mm1
+    test        eax, eax
+    jnz         short .columnDCT
+
+    ; -- AC terms all zero
+
+    movq        mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+    pmullw      mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+
+    movq        mm2, mm0                ; mm0=in0=(00 01 02 03)
+    punpcklwd   mm0, mm0                ; mm0=(00 00 01 01)
+    punpckhwd   mm2, mm2                ; mm2=(02 02 03 03)
+
+    movq        mm1, mm0
+    punpckldq   mm0, mm0                ; mm0=(00 00 00 00)
+    punpckhdq   mm1, mm1                ; mm1=(01 01 01 01)
+    movq        mm3, mm2
+    punpckldq   mm2, mm2                ; mm2=(02 02 02 02)
+    punpckhdq   mm3, mm3                ; mm3=(03 03 03 03)
+
+    movq        MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
+    movq        MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm0
+    movq        MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1
+    movq        MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm1
+    movq        MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2
+    movq        MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm2
+    movq        MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
+    movq        MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm3
+    jmp         near .nextcolumn
+    alignx      16, 7
+%endif
+.columnDCT:
+
+    ; -- Even part
+
+    movq        mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+    movq        mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    pmullw      mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+    pmullw      mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+    movq        mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+    movq        mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+    pmullw      mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+    pmullw      mm3, MMWORD [MMBLOCK(6,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+
+    movq        mm4, mm0
+    movq        mm5, mm1
+    psubw       mm0, mm2                ; mm0=tmp11
+    psubw       mm1, mm3
+    paddw       mm4, mm2                ; mm4=tmp10
+    paddw       mm5, mm3                ; mm5=tmp13
+
+    psllw       mm1, PRE_MULTIPLY_SCALE_BITS
+    pmulhw      mm1, [GOTOFF(ebx,PW_F1414)]
+    psubw       mm1, mm5                ; mm1=tmp12
+
+    movq        mm6, mm4
+    movq        mm7, mm0
+    psubw       mm4, mm5                ; mm4=tmp3
+    psubw       mm0, mm1                ; mm0=tmp2
+    paddw       mm6, mm5                ; mm6=tmp0
+    paddw       mm7, mm1                ; mm7=tmp1
+
+    movq        MMWORD [wk(1)], mm4     ; wk(1)=tmp3
+    movq        MMWORD [wk(0)], mm0     ; wk(0)=tmp2
+
+    ; -- Odd part
+
+    movq        mm2, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    movq        mm3, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+    pmullw      mm2, MMWORD [MMBLOCK(1,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+    pmullw      mm3, MMWORD [MMBLOCK(3,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+    movq        mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+    movq        mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+    pmullw      mm5, MMWORD [MMBLOCK(5,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+    pmullw      mm1, MMWORD [MMBLOCK(7,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+
+    movq        mm4, mm2
+    movq        mm0, mm5
+    psubw       mm2, mm1                ; mm2=z12
+    psubw       mm5, mm3                ; mm5=z10
+    paddw       mm4, mm1                ; mm4=z11
+    paddw       mm0, mm3                ; mm0=z13
+
+    movq        mm1, mm5                ; mm1=z10(unscaled)
+    psllw       mm2, PRE_MULTIPLY_SCALE_BITS
+    psllw       mm5, PRE_MULTIPLY_SCALE_BITS
+
+    movq        mm3, mm4
+    psubw       mm4, mm0
+    paddw       mm3, mm0                ; mm3=tmp7
+
+    psllw       mm4, PRE_MULTIPLY_SCALE_BITS
+    pmulhw      mm4, [GOTOFF(ebx,PW_F1414)]  ; mm4=tmp11
+
+    ; To avoid overflow...
+    ;
+    ; (Original)
+    ; tmp12 = -2.613125930 * z10 + z5;
+    ;
+    ; (This implementation)
+    ; tmp12 = (-1.613125930 - 1) * z10 + z5;
+    ;       = -1.613125930 * z10 - z10 + z5;
+
+    movq        mm0, mm5
+    paddw       mm5, mm2
+    pmulhw      mm5, [GOTOFF(ebx,PW_F1847)]   ; mm5=z5
+    pmulhw      mm0, [GOTOFF(ebx,PW_MF1613)]
+    pmulhw      mm2, [GOTOFF(ebx,PW_F1082)]
+    psubw       mm0, mm1
+    psubw       mm2, mm5                ; mm2=tmp10
+    paddw       mm0, mm5                ; mm0=tmp12
+
+    ; -- Final output stage
+
+    psubw       mm0, mm3                ; mm0=tmp6
+    movq        mm1, mm6
+    movq        mm5, mm7
+    paddw       mm6, mm3                ; mm6=data0=(00 01 02 03)
+    paddw       mm7, mm0                ; mm7=data1=(10 11 12 13)
+    psubw       mm1, mm3                ; mm1=data7=(70 71 72 73)
+    psubw       mm5, mm0                ; mm5=data6=(60 61 62 63)
+    psubw       mm4, mm0                ; mm4=tmp5
+
+    movq        mm3, mm6                ; transpose coefficients(phase 1)
+    punpcklwd   mm6, mm7                ; mm6=(00 10 01 11)
+    punpckhwd   mm3, mm7                ; mm3=(02 12 03 13)
+    movq        mm0, mm5                ; transpose coefficients(phase 1)
+    punpcklwd   mm5, mm1                ; mm5=(60 70 61 71)
+    punpckhwd   mm0, mm1                ; mm0=(62 72 63 73)
+
+    movq        mm7, MMWORD [wk(0)]     ; mm7=tmp2
+    movq        mm1, MMWORD [wk(1)]     ; mm1=tmp3
+
+    movq        MMWORD [wk(0)], mm5     ; wk(0)=(60 70 61 71)
+    movq        MMWORD [wk(1)], mm0     ; wk(1)=(62 72 63 73)
+
+    paddw       mm2, mm4                ; mm2=tmp4
+    movq        mm5, mm7
+    movq        mm0, mm1
+    paddw       mm7, mm4                ; mm7=data2=(20 21 22 23)
+    paddw       mm1, mm2                ; mm1=data4=(40 41 42 43)
+    psubw       mm5, mm4                ; mm5=data5=(50 51 52 53)
+    psubw       mm0, mm2                ; mm0=data3=(30 31 32 33)
+
+    movq        mm4, mm7                ; transpose coefficients(phase 1)
+    punpcklwd   mm7, mm0                ; mm7=(20 30 21 31)
+    punpckhwd   mm4, mm0                ; mm4=(22 32 23 33)
+    movq        mm2, mm1                ; transpose coefficients(phase 1)
+    punpcklwd   mm1, mm5                ; mm1=(40 50 41 51)
+    punpckhwd   mm2, mm5                ; mm2=(42 52 43 53)
+
+    movq        mm0, mm6                ; transpose coefficients(phase 2)
+    punpckldq   mm6, mm7                ; mm6=(00 10 20 30)
+    punpckhdq   mm0, mm7                ; mm0=(01 11 21 31)
+    movq        mm5, mm3                ; transpose coefficients(phase 2)
+    punpckldq   mm3, mm4                ; mm3=(02 12 22 32)
+    punpckhdq   mm5, mm4                ; mm5=(03 13 23 33)
+
+    movq        mm7, MMWORD [wk(0)]     ; mm7=(60 70 61 71)
+    movq        mm4, MMWORD [wk(1)]     ; mm4=(62 72 63 73)
+
+    movq        MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm6
+    movq        MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm0
+    movq        MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm3
+    movq        MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm5
+
+    movq        mm6, mm1                ; transpose coefficients(phase 2)
+    punpckldq   mm1, mm7                ; mm1=(40 50 60 70)
+    punpckhdq   mm6, mm7                ; mm6=(41 51 61 71)
+    movq        mm0, mm2                ; transpose coefficients(phase 2)
+    punpckldq   mm2, mm4                ; mm2=(42 52 62 72)
+    punpckhdq   mm0, mm4                ; mm0=(43 53 63 73)
+
+    movq        MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm1
+    movq        MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm6
+    movq        MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm2
+    movq        MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm0
+
+.nextcolumn:
+    add         esi, byte 4*SIZEOF_JCOEF            ; coef_block
+    add         edx, byte 4*SIZEOF_IFAST_MULT_TYPE  ; quantptr
+    add         edi, byte 4*DCTSIZE*SIZEOF_JCOEF    ; wsptr
+    dec         ecx                                 ; ctr
+    jnz         near .columnloop
+
+    ; ---- Pass 2: process rows from work array, store into output array.
+
+    mov         eax, [original_ebp]
+    lea         esi, [workspace]                   ; JCOEF *wsptr
+    mov         edi, JSAMPARRAY [output_buf(eax)]  ; (JSAMPROW *)
+    mov         eax, JDIMENSION [output_col(eax)]
+    mov         ecx, DCTSIZE/4                     ; ctr
+    alignx      16, 7
+.rowloop:
+
+    ; -- Even part
+
+    movq        mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+    movq        mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    movq        mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+    movq        mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+
+    movq        mm4, mm0
+    movq        mm5, mm1
+    psubw       mm0, mm2                ; mm0=tmp11
+    psubw       mm1, mm3
+    paddw       mm4, mm2                ; mm4=tmp10
+    paddw       mm5, mm3                ; mm5=tmp13
+
+    psllw       mm1, PRE_MULTIPLY_SCALE_BITS
+    pmulhw      mm1, [GOTOFF(ebx,PW_F1414)]
+    psubw       mm1, mm5                ; mm1=tmp12
+
+    movq        mm6, mm4
+    movq        mm7, mm0
+    psubw       mm4, mm5                ; mm4=tmp3
+    psubw       mm0, mm1                ; mm0=tmp2
+    paddw       mm6, mm5                ; mm6=tmp0
+    paddw       mm7, mm1                ; mm7=tmp1
+
+    movq        MMWORD [wk(1)], mm4     ; wk(1)=tmp3
+    movq        MMWORD [wk(0)], mm0     ; wk(0)=tmp2
+
+    ; -- Odd part
+
+    movq        mm2, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    movq        mm3, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+    movq        mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+    movq        mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+
+    movq        mm4, mm2
+    movq        mm0, mm5
+    psubw       mm2, mm1                ; mm2=z12
+    psubw       mm5, mm3                ; mm5=z10
+    paddw       mm4, mm1                ; mm4=z11
+    paddw       mm0, mm3                ; mm0=z13
+
+    movq        mm1, mm5                ; mm1=z10(unscaled)
+    psllw       mm2, PRE_MULTIPLY_SCALE_BITS
+    psllw       mm5, PRE_MULTIPLY_SCALE_BITS
+
+    movq        mm3, mm4
+    psubw       mm4, mm0
+    paddw       mm3, mm0                ; mm3=tmp7
+
+    psllw       mm4, PRE_MULTIPLY_SCALE_BITS
+    pmulhw      mm4, [GOTOFF(ebx,PW_F1414)]  ; mm4=tmp11
+
+    ; To avoid overflow...
+    ;
+    ; (Original)
+    ; tmp12 = -2.613125930 * z10 + z5;
+    ;
+    ; (This implementation)
+    ; tmp12 = (-1.613125930 - 1) * z10 + z5;
+    ;       = -1.613125930 * z10 - z10 + z5;
+
+    movq        mm0, mm5
+    paddw       mm5, mm2
+    pmulhw      mm5, [GOTOFF(ebx,PW_F1847)]   ; mm5=z5
+    pmulhw      mm0, [GOTOFF(ebx,PW_MF1613)]
+    pmulhw      mm2, [GOTOFF(ebx,PW_F1082)]
+    psubw       mm0, mm1
+    psubw       mm2, mm5                ; mm2=tmp10
+    paddw       mm0, mm5                ; mm0=tmp12
+
+    ; -- Final output stage
+
+    psubw       mm0, mm3                ; mm0=tmp6
+    movq        mm1, mm6
+    movq        mm5, mm7
+    paddw       mm6, mm3                ; mm6=data0=(00 10 20 30)
+    paddw       mm7, mm0                ; mm7=data1=(01 11 21 31)
+    psraw       mm6, (PASS1_BITS+3)     ; descale
+    psraw       mm7, (PASS1_BITS+3)     ; descale
+    psubw       mm1, mm3                ; mm1=data7=(07 17 27 37)
+    psubw       mm5, mm0                ; mm5=data6=(06 16 26 36)
+    psraw       mm1, (PASS1_BITS+3)     ; descale
+    psraw       mm5, (PASS1_BITS+3)     ; descale
+    psubw       mm4, mm0                ; mm4=tmp5
+
+    packsswb    mm6, mm5                ; mm6=(00 10 20 30 06 16 26 36)
+    packsswb    mm7, mm1                ; mm7=(01 11 21 31 07 17 27 37)
+
+    movq        mm3, MMWORD [wk(0)]     ; mm3=tmp2
+    movq        mm0, MMWORD [wk(1)]     ; mm0=tmp3
+
+    paddw       mm2, mm4                ; mm2=tmp4
+    movq        mm5, mm3
+    movq        mm1, mm0
+    paddw       mm3, mm4                ; mm3=data2=(02 12 22 32)
+    paddw       mm0, mm2                ; mm0=data4=(04 14 24 34)
+    psraw       mm3, (PASS1_BITS+3)     ; descale
+    psraw       mm0, (PASS1_BITS+3)     ; descale
+    psubw       mm5, mm4                ; mm5=data5=(05 15 25 35)
+    psubw       mm1, mm2                ; mm1=data3=(03 13 23 33)
+    psraw       mm5, (PASS1_BITS+3)     ; descale
+    psraw       mm1, (PASS1_BITS+3)     ; descale
+
+    movq        mm4, [GOTOFF(ebx,PB_CENTERJSAMP)]  ; mm4=[PB_CENTERJSAMP]
+
+    packsswb    mm3, mm0                ; mm3=(02 12 22 32 04 14 24 34)
+    packsswb    mm1, mm5                ; mm1=(03 13 23 33 05 15 25 35)
+
+    paddb       mm6, mm4
+    paddb       mm7, mm4
+    paddb       mm3, mm4
+    paddb       mm1, mm4
+
+    movq        mm2, mm6                ; transpose coefficients(phase 1)
+    punpcklbw   mm6, mm7                ; mm6=(00 01 10 11 20 21 30 31)
+    punpckhbw   mm2, mm7                ; mm2=(06 07 16 17 26 27 36 37)
+    movq        mm0, mm3                ; transpose coefficients(phase 1)
+    punpcklbw   mm3, mm1                ; mm3=(02 03 12 13 22 23 32 33)
+    punpckhbw   mm0, mm1                ; mm0=(04 05 14 15 24 25 34 35)
+
+    movq        mm5, mm6                ; transpose coefficients(phase 2)
+    punpcklwd   mm6, mm3                ; mm6=(00 01 02 03 10 11 12 13)
+    punpckhwd   mm5, mm3                ; mm5=(20 21 22 23 30 31 32 33)
+    movq        mm4, mm0                ; transpose coefficients(phase 2)
+    punpcklwd   mm0, mm2                ; mm0=(04 05 06 07 14 15 16 17)
+    punpckhwd   mm4, mm2                ; mm4=(24 25 26 27 34 35 36 37)
+
+    movq        mm7, mm6                ; transpose coefficients(phase 3)
+    punpckldq   mm6, mm0                ; mm6=(00 01 02 03 04 05 06 07)
+    punpckhdq   mm7, mm0                ; mm7=(10 11 12 13 14 15 16 17)
+    movq        mm1, mm5                ; transpose coefficients(phase 3)
+    punpckldq   mm5, mm4                ; mm5=(20 21 22 23 24 25 26 27)
+    punpckhdq   mm1, mm4                ; mm1=(30 31 32 33 34 35 36 37)
+
+    pushpic     ebx                     ; save GOT address
+
+    mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+    mov         ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+    movq        MMWORD [edx+eax*SIZEOF_JSAMPLE], mm6
+    movq        MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm7
+    mov         edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
+    mov         ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
+    movq        MMWORD [edx+eax*SIZEOF_JSAMPLE], mm5
+    movq        MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm1
+
+    poppic      ebx                     ; restore GOT address
+
+    add         esi, byte 4*SIZEOF_JCOEF     ; wsptr
+    add         edi, byte 4*SIZEOF_JSAMPROW
+    dec         ecx                          ; ctr
+    jnz         near .rowloop
+
+    emms                                ; empty MMX state
+
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/i386/jidctfst-sse2.asm b/media/libjpeg/simd/i386/jidctfst-sse2.asm
new file mode 100644
index 0000000000..19704ffa48
--- /dev/null
+++ b/media/libjpeg/simd/i386/jidctfst-sse2.asm
@@ -0,0 +1,501 @@
+;
+; jidctfst.asm - fast integer IDCT (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a fast, not so accurate integer implementation of
+; the inverse DCT (Discrete Cosine Transform). The following code is
+; based directly on the IJG's original jidctfst.c; see the jidctfst.c
+; for more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS  8  ; 14 is also OK.
+%define PASS1_BITS  2
+
+%if IFAST_SCALE_BITS != PASS1_BITS
+%error "'IFAST_SCALE_BITS' must be equal to 'PASS1_BITS'."
+%endif
+
+%if CONST_BITS == 8
+F_1_082 equ 277              ; FIX(1.082392200)
+F_1_414 equ 362              ; FIX(1.414213562)
+F_1_847 equ 473              ; FIX(1.847759065)
+F_2_613 equ 669              ; FIX(2.613125930)
+F_1_613 equ (F_2_613 - 256)  ; FIX(2.613125930) - FIX(1)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x, n)  (((x) + (1 << ((n) - 1))) >> (n))
+F_1_082 equ DESCALE(1162209775, 30 - CONST_BITS)  ; FIX(1.082392200)
+F_1_414 equ DESCALE(1518500249, 30 - CONST_BITS)  ; FIX(1.414213562)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS)  ; FIX(1.847759065)
+F_2_613 equ DESCALE(2805822602, 30 - CONST_BITS)  ; FIX(2.613125930)
+F_1_613 equ (F_2_613 - (1 << CONST_BITS))       ; FIX(2.613125930) - FIX(1)
+%endif
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
+; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
+
+%define PRE_MULTIPLY_SCALE_BITS  2
+%define CONST_SHIFT              (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
+
+    alignz      32
+    GLOBAL_DATA(jconst_idct_ifast_sse2)
+
+EXTN(jconst_idct_ifast_sse2):
+
+PW_F1414       times 8  dw  F_1_414 << CONST_SHIFT
+PW_F1847       times 8  dw  F_1_847 << CONST_SHIFT
+PW_MF1613      times 8  dw -F_1_613 << CONST_SHIFT
+PW_F1082       times 8  dw  F_1_082 << CONST_SHIFT
+PB_CENTERJSAMP times 16 db  CENTERJSAMPLE
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_ifast_sse2(void *dct_table, JCOEFPTR coef_block,
+;                       JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define dct_table(b)   (b) + 8          ; jpeg_component_info *compptr
+%define coef_block(b)  (b) + 12         ; JCOEFPTR coef_block
+%define output_buf(b)  (b) + 16         ; JSAMPARRAY output_buf
+%define output_col(b)  (b) + 20         ; JDIMENSION output_col
+
+%define original_ebp   ebp + 0
+%define wk(i)          ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+                                        ; xmmword wk[WK_NUM]
+%define WK_NUM         2
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_idct_ifast_sse2)
+
+EXTN(jsimd_idct_ifast_sse2):
+    push        ebp
+    mov         eax, esp                     ; eax = original ebp
+    sub         esp, byte 4
+    and         esp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [esp], eax
+    mov         ebp, esp                     ; ebp = aligned ebp
+    lea         esp, [wk(0)]
+    pushpic     ebx
+;   push        ecx                     ; unused
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+
+    ; ---- Pass 1: process columns from input.
+
+;   mov         eax, [original_ebp]
+    mov         edx, POINTER [dct_table(eax)]    ; quantptr
+    mov         esi, JCOEFPTR [coef_block(eax)]  ; inptr
+
+%ifndef NO_ZERO_COLUMN_TEST_IFAST_SSE2
+    mov         eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    or          eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    jnz         near .columnDCT
+
+    movdqa      xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    por         xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+    por         xmm1, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+    por         xmm0, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+    por         xmm1, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+    por         xmm0, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+    por         xmm1, xmm0
+    packsswb    xmm1, xmm1
+    packsswb    xmm1, xmm1
+    movd        eax, xmm1
+    test        eax, eax
+    jnz         short .columnDCT
+
+    ; -- AC terms all zero
+
+    movdqa      xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+    pmullw      xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    movdqa      xmm7, xmm0              ; xmm0=in0=(00 01 02 03 04 05 06 07)
+    punpcklwd   xmm0, xmm0              ; xmm0=(00 00 01 01 02 02 03 03)
+    punpckhwd   xmm7, xmm7              ; xmm7=(04 04 05 05 06 06 07 07)
+
+    pshufd      xmm6, xmm0, 0x00        ; xmm6=col0=(00 00 00 00 00 00 00 00)
+    pshufd      xmm2, xmm0, 0x55        ; xmm2=col1=(01 01 01 01 01 01 01 01)
+    pshufd      xmm5, xmm0, 0xAA        ; xmm5=col2=(02 02 02 02 02 02 02 02)
+    pshufd      xmm0, xmm0, 0xFF        ; xmm0=col3=(03 03 03 03 03 03 03 03)
+    pshufd      xmm1, xmm7, 0x00        ; xmm1=col4=(04 04 04 04 04 04 04 04)
+    pshufd      xmm4, xmm7, 0x55        ; xmm4=col5=(05 05 05 05 05 05 05 05)
+    pshufd      xmm3, xmm7, 0xAA        ; xmm3=col6=(06 06 06 06 06 06 06 06)
+    pshufd      xmm7, xmm7, 0xFF        ; xmm7=col7=(07 07 07 07 07 07 07 07)
+
+    movdqa      XMMWORD [wk(0)], xmm2   ; wk(0)=col1
+    movdqa      XMMWORD [wk(1)], xmm0   ; wk(1)=col3
+    jmp         near .column_end
+    alignx      16, 7
+%endif
+.columnDCT:
+
+    ; -- Even part
+
+    movdqa      xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    pmullw      xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+    pmullw      xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+    movdqa      xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+    movdqa      xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+    pmullw      xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+    pmullw      xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+
+    movdqa      xmm4, xmm0
+    movdqa      xmm5, xmm1
+    psubw       xmm0, xmm2              ; xmm0=tmp11
+    psubw       xmm1, xmm3
+    paddw       xmm4, xmm2              ; xmm4=tmp10
+    paddw       xmm5, xmm3              ; xmm5=tmp13
+
+    psllw       xmm1, PRE_MULTIPLY_SCALE_BITS
+    pmulhw      xmm1, [GOTOFF(ebx,PW_F1414)]
+    psubw       xmm1, xmm5              ; xmm1=tmp12
+
+    movdqa      xmm6, xmm4
+    movdqa      xmm7, xmm0
+    psubw       xmm4, xmm5              ; xmm4=tmp3
+    psubw       xmm0, xmm1              ; xmm0=tmp2
+    paddw       xmm6, xmm5              ; xmm6=tmp0
+    paddw       xmm7, xmm1              ; xmm7=tmp1
+
+    movdqa      XMMWORD [wk(1)], xmm4   ; wk(1)=tmp3
+    movdqa      XMMWORD [wk(0)], xmm0   ; wk(0)=tmp2
+
+    ; -- Odd part
+
+    movdqa      xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    movdqa      xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+    pmullw      xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+    pmullw      xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+    movdqa      xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+    pmullw      xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+    pmullw      xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+
+    movdqa      xmm4, xmm2
+    movdqa      xmm0, xmm5
+    psubw       xmm2, xmm1              ; xmm2=z12
+    psubw       xmm5, xmm3              ; xmm5=z10
+    paddw       xmm4, xmm1              ; xmm4=z11
+    paddw       xmm0, xmm3              ; xmm0=z13
+
+    movdqa      xmm1, xmm5              ; xmm1=z10(unscaled)
+    psllw       xmm2, PRE_MULTIPLY_SCALE_BITS
+    psllw       xmm5, PRE_MULTIPLY_SCALE_BITS
+
+    movdqa      xmm3, xmm4
+    psubw       xmm4, xmm0
+    paddw       xmm3, xmm0              ; xmm3=tmp7
+
+    psllw       xmm4, PRE_MULTIPLY_SCALE_BITS
+    pmulhw      xmm4, [GOTOFF(ebx,PW_F1414)]  ; xmm4=tmp11
+
+    ; To avoid overflow...
+    ;
+    ; (Original)
+    ; tmp12 = -2.613125930 * z10 + z5;
+    ;
+    ; (This implementation)
+    ; tmp12 = (-1.613125930 - 1) * z10 + z5;
+    ;       = -1.613125930 * z10 - z10 + z5;
+
+    movdqa      xmm0, xmm5
+    paddw       xmm5, xmm2
+    pmulhw      xmm5, [GOTOFF(ebx,PW_F1847)]   ; xmm5=z5
+    pmulhw      xmm0, [GOTOFF(ebx,PW_MF1613)]
+    pmulhw      xmm2, [GOTOFF(ebx,PW_F1082)]
+    psubw       xmm0, xmm1
+    psubw       xmm2, xmm5              ; xmm2=tmp10
+    paddw       xmm0, xmm5              ; xmm0=tmp12
+
+    ; -- Final output stage
+
+    psubw       xmm0, xmm3              ; xmm0=tmp6
+    movdqa      xmm1, xmm6
+    movdqa      xmm5, xmm7
+    paddw       xmm6, xmm3              ; xmm6=data0=(00 01 02 03 04 05 06 07)
+    paddw       xmm7, xmm0              ; xmm7=data1=(10 11 12 13 14 15 16 17)
+    psubw       xmm1, xmm3              ; xmm1=data7=(70 71 72 73 74 75 76 77)
+    psubw       xmm5, xmm0              ; xmm5=data6=(60 61 62 63 64 65 66 67)
+    psubw       xmm4, xmm0              ; xmm4=tmp5
+
+    movdqa      xmm3, xmm6              ; transpose coefficients(phase 1)
+    punpcklwd   xmm6, xmm7              ; xmm6=(00 10 01 11 02 12 03 13)
+    punpckhwd   xmm3, xmm7              ; xmm3=(04 14 05 15 06 16 07 17)
+    movdqa      xmm0, xmm5              ; transpose coefficients(phase 1)
+    punpcklwd   xmm5, xmm1              ; xmm5=(60 70 61 71 62 72 63 73)
+    punpckhwd   xmm0, xmm1              ; xmm0=(64 74 65 75 66 76 67 77)
+
+    movdqa      xmm7, XMMWORD [wk(0)]   ; xmm7=tmp2
+    movdqa      xmm1, XMMWORD [wk(1)]   ; xmm1=tmp3
+
+    movdqa      XMMWORD [wk(0)], xmm5   ; wk(0)=(60 70 61 71 62 72 63 73)
+    movdqa      XMMWORD [wk(1)], xmm0   ; wk(1)=(64 74 65 75 66 76 67 77)
+
+    paddw       xmm2, xmm4              ; xmm2=tmp4
+    movdqa      xmm5, xmm7
+    movdqa      xmm0, xmm1
+    paddw       xmm7, xmm4              ; xmm7=data2=(20 21 22 23 24 25 26 27)
+    paddw       xmm1, xmm2              ; xmm1=data4=(40 41 42 43 44 45 46 47)
+    psubw       xmm5, xmm4              ; xmm5=data5=(50 51 52 53 54 55 56 57)
+    psubw       xmm0, xmm2              ; xmm0=data3=(30 31 32 33 34 35 36 37)
+
+    movdqa      xmm4, xmm7              ; transpose coefficients(phase 1)
+    punpcklwd   xmm7, xmm0              ; xmm7=(20 30 21 31 22 32 23 33)
+    punpckhwd   xmm4, xmm0              ; xmm4=(24 34 25 35 26 36 27 37)
+    movdqa      xmm2, xmm1              ; transpose coefficients(phase 1)
+    punpcklwd   xmm1, xmm5              ; xmm1=(40 50 41 51 42 52 43 53)
+    punpckhwd   xmm2, xmm5              ; xmm2=(44 54 45 55 46 56 47 57)
+
+    movdqa      xmm0, xmm3              ; transpose coefficients(phase 2)
+    punpckldq   xmm3, xmm4              ; xmm3=(04 14 24 34 05 15 25 35)
+    punpckhdq   xmm0, xmm4              ; xmm0=(06 16 26 36 07 17 27 37)
+    movdqa      xmm5, xmm6              ; transpose coefficients(phase 2)
+    punpckldq   xmm6, xmm7              ; xmm6=(00 10 20 30 01 11 21 31)
+    punpckhdq   xmm5, xmm7              ; xmm5=(02 12 22 32 03 13 23 33)
+
+    movdqa      xmm4, XMMWORD [wk(0)]   ; xmm4=(60 70 61 71 62 72 63 73)
+    movdqa      xmm7, XMMWORD [wk(1)]   ; xmm7=(64 74 65 75 66 76 67 77)
+
+    movdqa      XMMWORD [wk(0)], xmm3   ; wk(0)=(04 14 24 34 05 15 25 35)
+    movdqa      XMMWORD [wk(1)], xmm0   ; wk(1)=(06 16 26 36 07 17 27 37)
+
+    movdqa      xmm3, xmm1              ; transpose coefficients(phase 2)
+    punpckldq   xmm1, xmm4              ; xmm1=(40 50 60 70 41 51 61 71)
+    punpckhdq   xmm3, xmm4              ; xmm3=(42 52 62 72 43 53 63 73)
+    movdqa      xmm0, xmm2              ; transpose coefficients(phase 2)
+    punpckldq   xmm2, xmm7              ; xmm2=(44 54 64 74 45 55 65 75)
+    punpckhdq   xmm0, xmm7              ; xmm0=(46 56 66 76 47 57 67 77)
+
+    movdqa      xmm4, xmm6              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm6, xmm1              ; xmm6=col0=(00 10 20 30 40 50 60 70)
+    punpckhqdq  xmm4, xmm1              ; xmm4=col1=(01 11 21 31 41 51 61 71)
+    movdqa      xmm7, xmm5              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm5, xmm3              ; xmm5=col2=(02 12 22 32 42 52 62 72)
+    punpckhqdq  xmm7, xmm3              ; xmm7=col3=(03 13 23 33 43 53 63 73)
+
+    movdqa      xmm1, XMMWORD [wk(0)]   ; xmm1=(04 14 24 34 05 15 25 35)
+    movdqa      xmm3, XMMWORD [wk(1)]   ; xmm3=(06 16 26 36 07 17 27 37)
+
+    movdqa      XMMWORD [wk(0)], xmm4   ; wk(0)=col1
+    movdqa      XMMWORD [wk(1)], xmm7   ; wk(1)=col3
+
+    movdqa      xmm4, xmm1              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm1, xmm2              ; xmm1=col4=(04 14 24 34 44 54 64 74)
+    punpckhqdq  xmm4, xmm2              ; xmm4=col5=(05 15 25 35 45 55 65 75)
+    movdqa      xmm7, xmm3              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm3, xmm0              ; xmm3=col6=(06 16 26 36 46 56 66 76)
+    punpckhqdq  xmm7, xmm0              ; xmm7=col7=(07 17 27 37 47 57 67 77)
+.column_end:
+
+    ; -- Prefetch the next coefficient block
+
+    prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+    prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+    prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+    prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+    ; ---- Pass 2: process rows from work array, store into output array.
+
+    mov         eax, [original_ebp]
+    mov         edi, JSAMPARRAY [output_buf(eax)]  ; (JSAMPROW *)
+    mov         eax, JDIMENSION [output_col(eax)]
+
+    ; -- Even part
+
+    ; xmm6=col0, xmm5=col2, xmm1=col4, xmm3=col6
+
+    movdqa      xmm2, xmm6
+    movdqa      xmm0, xmm5
+    psubw       xmm6, xmm1              ; xmm6=tmp11
+    psubw       xmm5, xmm3
+    paddw       xmm2, xmm1              ; xmm2=tmp10
+    paddw       xmm0, xmm3              ; xmm0=tmp13
+
+    psllw       xmm5, PRE_MULTIPLY_SCALE_BITS
+    pmulhw      xmm5, [GOTOFF(ebx,PW_F1414)]
+    psubw       xmm5, xmm0              ; xmm5=tmp12
+
+    movdqa      xmm1, xmm2
+    movdqa      xmm3, xmm6
+    psubw       xmm2, xmm0              ; xmm2=tmp3
+    psubw       xmm6, xmm5              ; xmm6=tmp2
+    paddw       xmm1, xmm0              ; xmm1=tmp0
+    paddw       xmm3, xmm5              ; xmm3=tmp1
+
+    movdqa      xmm0, XMMWORD [wk(0)]   ; xmm0=col1
+    movdqa      xmm5, XMMWORD [wk(1)]   ; xmm5=col3
+
+    movdqa      XMMWORD [wk(0)], xmm2   ; wk(0)=tmp3
+    movdqa      XMMWORD [wk(1)], xmm6   ; wk(1)=tmp2
+
+    ; -- Odd part
+
+    ; xmm0=col1, xmm5=col3, xmm4=col5, xmm7=col7
+
+    movdqa      xmm2, xmm0
+    movdqa      xmm6, xmm4
+    psubw       xmm0, xmm7              ; xmm0=z12
+    psubw       xmm4, xmm5              ; xmm4=z10
+    paddw       xmm2, xmm7              ; xmm2=z11
+    paddw       xmm6, xmm5              ; xmm6=z13
+
+    movdqa      xmm7, xmm4              ; xmm7=z10(unscaled)
+    psllw       xmm0, PRE_MULTIPLY_SCALE_BITS
+    psllw       xmm4, PRE_MULTIPLY_SCALE_BITS
+
+    movdqa      xmm5, xmm2
+    psubw       xmm2, xmm6
+    paddw       xmm5, xmm6              ; xmm5=tmp7
+
+    psllw       xmm2, PRE_MULTIPLY_SCALE_BITS
+    pmulhw      xmm2, [GOTOFF(ebx,PW_F1414)]  ; xmm2=tmp11
+
+    ; To avoid overflow...
+    ;
+    ; (Original)
+    ; tmp12 = -2.613125930 * z10 + z5;
+    ;
+    ; (This implementation)
+    ; tmp12 = (-1.613125930 - 1) * z10 + z5;
+    ;       = -1.613125930 * z10 - z10 + z5;
+
+    movdqa      xmm6, xmm4
+    paddw       xmm4, xmm0
+    pmulhw      xmm4, [GOTOFF(ebx,PW_F1847)]   ; xmm4=z5
+    pmulhw      xmm6, [GOTOFF(ebx,PW_MF1613)]
+    pmulhw      xmm0, [GOTOFF(ebx,PW_F1082)]
+    psubw       xmm6, xmm7
+    psubw       xmm0, xmm4              ; xmm0=tmp10
+    paddw       xmm6, xmm4              ; xmm6=tmp12
+
+    ; -- Final output stage
+
+    psubw       xmm6, xmm5              ; xmm6=tmp6
+    movdqa      xmm7, xmm1
+    movdqa      xmm4, xmm3
+    paddw       xmm1, xmm5              ; xmm1=data0=(00 10 20 30 40 50 60 70)
+    paddw       xmm3, xmm6              ; xmm3=data1=(01 11 21 31 41 51 61 71)
+    psraw       xmm1, (PASS1_BITS+3)    ; descale
+    psraw       xmm3, (PASS1_BITS+3)    ; descale
+    psubw       xmm7, xmm5              ; xmm7=data7=(07 17 27 37 47 57 67 77)
+    psubw       xmm4, xmm6              ; xmm4=data6=(06 16 26 36 46 56 66 76)
+    psraw       xmm7, (PASS1_BITS+3)    ; descale
+    psraw       xmm4, (PASS1_BITS+3)    ; descale
+    psubw       xmm2, xmm6              ; xmm2=tmp5
+
+    packsswb    xmm1, xmm4        ; xmm1=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
+    packsswb    xmm3, xmm7        ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
+
+    movdqa      xmm5, XMMWORD [wk(1)]   ; xmm5=tmp2
+    movdqa      xmm6, XMMWORD [wk(0)]   ; xmm6=tmp3
+
+    paddw       xmm0, xmm2              ; xmm0=tmp4
+    movdqa      xmm4, xmm5
+    movdqa      xmm7, xmm6
+    paddw       xmm5, xmm2              ; xmm5=data2=(02 12 22 32 42 52 62 72)
+    paddw       xmm6, xmm0              ; xmm6=data4=(04 14 24 34 44 54 64 74)
+    psraw       xmm5, (PASS1_BITS+3)    ; descale
+    psraw       xmm6, (PASS1_BITS+3)    ; descale
+    psubw       xmm4, xmm2              ; xmm4=data5=(05 15 25 35 45 55 65 75)
+    psubw       xmm7, xmm0              ; xmm7=data3=(03 13 23 33 43 53 63 73)
+    psraw       xmm4, (PASS1_BITS+3)    ; descale
+    psraw       xmm7, (PASS1_BITS+3)    ; descale
+
+    movdqa      xmm2, [GOTOFF(ebx,PB_CENTERJSAMP)]  ; xmm2=[PB_CENTERJSAMP]
+
+    packsswb    xmm5, xmm6        ; xmm5=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
+    packsswb    xmm7, xmm4        ; xmm7=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)
+
+    paddb       xmm1, xmm2
+    paddb       xmm3, xmm2
+    paddb       xmm5, xmm2
+    paddb       xmm7, xmm2
+
+    movdqa      xmm0, xmm1        ; transpose coefficients(phase 1)
+    punpcklbw   xmm1, xmm3        ; xmm1=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
+    punpckhbw   xmm0, xmm3        ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
+    movdqa      xmm6, xmm5        ; transpose coefficients(phase 1)
+    punpcklbw   xmm5, xmm7        ; xmm5=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
+    punpckhbw   xmm6, xmm7        ; xmm6=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)
+
+    movdqa      xmm4, xmm1        ; transpose coefficients(phase 2)
+    punpcklwd   xmm1, xmm5        ; xmm1=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
+    punpckhwd   xmm4, xmm5        ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
+    movdqa      xmm2, xmm6        ; transpose coefficients(phase 2)
+    punpcklwd   xmm6, xmm0        ; xmm6=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
+    punpckhwd   xmm2, xmm0        ; xmm2=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)
+
+    movdqa      xmm3, xmm1        ; transpose coefficients(phase 3)
+    punpckldq   xmm1, xmm6        ; xmm1=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
+    punpckhdq   xmm3, xmm6        ; xmm3=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
+    movdqa      xmm7, xmm4        ; transpose coefficients(phase 3)
+    punpckldq   xmm4, xmm2        ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
+    punpckhdq   xmm7, xmm2        ; xmm7=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
+
+    pshufd      xmm5, xmm1, 0x4E  ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
+    pshufd      xmm0, xmm3, 0x4E  ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
+    pshufd      xmm6, xmm4, 0x4E  ; xmm6=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
+    pshufd      xmm2, xmm7, 0x4E  ; xmm2=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
+
+    mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+    mov         esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
+    movq        XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm1
+    movq        XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm3
+    mov         edx, JSAMPROW [edi+4*SIZEOF_JSAMPROW]
+    mov         esi, JSAMPROW [edi+6*SIZEOF_JSAMPROW]
+    movq        XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm4
+    movq        XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm7
+
+    mov         edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+    mov         esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
+    movq        XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm5
+    movq        XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm0
+    mov         edx, JSAMPROW [edi+5*SIZEOF_JSAMPROW]
+    mov         esi, JSAMPROW [edi+7*SIZEOF_JSAMPROW]
+    movq        XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6
+    movq        XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm2
+
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; unused
+    poppic      ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/i386/jidctint-avx2.asm b/media/libjpeg/simd/i386/jidctint-avx2.asm
new file mode 100644
index 0000000000..199c7df3b6
--- /dev/null
+++ b/media/libjpeg/simd/i386/jidctint-avx2.asm
@@ -0,0 +1,453 @@
+;
+; jidctint.asm - accurate integer IDCT (AVX2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, 2018, 2020, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a slower but more accurate integer implementation of the
+; inverse DCT (Discrete Cosine Transform). The following code is based
+; directly on the IJG's original jidctint.c; see the jidctint.c for
+; more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS  13
+%define PASS1_BITS  2
+
+%define DESCALE_P1  (CONST_BITS - PASS1_BITS)
+%define DESCALE_P2  (CONST_BITS + PASS1_BITS + 3)
+
+%if CONST_BITS == 13
+F_0_298 equ  2446  ; FIX(0.298631336)
+F_0_390 equ  3196  ; FIX(0.390180644)
+F_0_541 equ  4433  ; FIX(0.541196100)
+F_0_765 equ  6270  ; FIX(0.765366865)
+F_0_899 equ  7373  ; FIX(0.899976223)
+F_1_175 equ  9633  ; FIX(1.175875602)
+F_1_501 equ 12299  ; FIX(1.501321110)
+F_1_847 equ 15137  ; FIX(1.847759065)
+F_1_961 equ 16069  ; FIX(1.961570560)
+F_2_053 equ 16819  ; FIX(2.053119869)
+F_2_562 equ 20995  ; FIX(2.562915447)
+F_3_072 equ 25172  ; FIX(3.072711026)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x, n)  (((x) + (1 << ((n) - 1))) >> (n))
+F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS)  ; FIX(0.298631336)
+F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS)  ; FIX(0.390180644)
+F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS)  ; FIX(0.541196100)
+F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS)  ; FIX(0.765366865)
+F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS)  ; FIX(0.899976223)
+F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS)  ; FIX(1.175875602)
+F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS)  ; FIX(1.501321110)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS)  ; FIX(1.847759065)
+F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS)  ; FIX(1.961570560)
+F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS)  ; FIX(2.053119869)
+F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS)  ; FIX(2.562915447)
+F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS)  ; FIX(3.072711026)
+%endif
+
+; --------------------------------------------------------------------------
+; In-place 8x8x16-bit inverse matrix transpose using AVX2 instructions
+; %1-%4: Input/output registers
+; %5-%8: Temp registers
+
+%macro dotranspose 8
+    ; %5=(00 10 20 30 40 50 60 70  01 11 21 31 41 51 61 71)
+    ; %6=(03 13 23 33 43 53 63 73  02 12 22 32 42 52 62 72)
+    ; %7=(04 14 24 34 44 54 64 74  05 15 25 35 45 55 65 75)
+    ; %8=(07 17 27 37 47 57 67 77  06 16 26 36 46 56 66 76)
+
+    vpermq      %5, %1, 0xD8
+    vpermq      %6, %2, 0x72
+    vpermq      %7, %3, 0xD8
+    vpermq      %8, %4, 0x72
+    ; transpose coefficients(phase 1)
+    ; %5=(00 10 20 30 01 11 21 31  40 50 60 70 41 51 61 71)
+    ; %6=(02 12 22 32 03 13 23 33  42 52 62 72 43 53 63 73)
+    ; %7=(04 14 24 34 05 15 25 35  44 54 64 74 45 55 65 75)
+    ; %8=(06 16 26 36 07 17 27 37  46 56 66 76 47 57 67 77)
+
+    vpunpcklwd  %1, %5, %6
+    vpunpckhwd  %2, %5, %6
+    vpunpcklwd  %3, %7, %8
+    vpunpckhwd  %4, %7, %8
+    ; transpose coefficients(phase 2)
+    ; %1=(00 02 10 12 20 22 30 32  40 42 50 52 60 62 70 72)
+    ; %2=(01 03 11 13 21 23 31 33  41 43 51 53 61 63 71 73)
+    ; %3=(04 06 14 16 24 26 34 36  44 46 54 56 64 66 74 76)
+    ; %4=(05 07 15 17 25 27 35 37  45 47 55 57 65 67 75 77)
+
+    vpunpcklwd  %5, %1, %2
+    vpunpcklwd  %6, %3, %4
+    vpunpckhwd  %7, %1, %2
+    vpunpckhwd  %8, %3, %4
+    ; transpose coefficients(phase 3)
+    ; %5=(00 01 02 03 10 11 12 13  40 41 42 43 50 51 52 53)
+    ; %6=(04 05 06 07 14 15 16 17  44 45 46 47 54 55 56 57)
+    ; %7=(20 21 22 23 30 31 32 33  60 61 62 63 70 71 72 73)
+    ; %8=(24 25 26 27 34 35 36 37  64 65 66 67 74 75 76 77)
+
+    vpunpcklqdq %1, %5, %6
+    vpunpckhqdq %2, %5, %6
+    vpunpcklqdq %3, %7, %8
+    vpunpckhqdq %4, %7, %8
+    ; transpose coefficients(phase 4)
+    ; %1=(00 01 02 03 04 05 06 07  40 41 42 43 44 45 46 47)
+    ; %2=(10 11 12 13 14 15 16 17  50 51 52 53 54 55 56 57)
+    ; %3=(20 21 22 23 24 25 26 27  60 61 62 63 64 65 66 67)
+    ; %4=(30 31 32 33 34 35 36 37  70 71 72 73 74 75 76 77)
+%endmacro
+
+; --------------------------------------------------------------------------
+; In-place 8x8x16-bit accurate integer inverse DCT using AVX2 instructions
+; %1-%4:  Input/output registers
+; %5-%12: Temp registers
+; %9:     Pass (1 or 2)
+
+%macro dodct 13
+    ; -- Even part
+
+    ; (Original)
+    ; z1 = (z2 + z3) * 0.541196100;
+    ; tmp2 = z1 + z3 * -1.847759065;
+    ; tmp3 = z1 + z2 * 0.765366865;
+    ;
+    ; (This implementation)
+    ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
+    ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
+
+    vperm2i128  %6, %3, %3, 0x01        ; %6=in6_2
+    vpunpcklwd  %5, %3, %6              ; %5=in26_62L
+    vpunpckhwd  %6, %3, %6              ; %6=in26_62H
+    vpmaddwd    %5, %5, [GOTOFF(ebx,PW_F130_F054_MF130_F054)]  ; %5=tmp3_2L
+    vpmaddwd    %6, %6, [GOTOFF(ebx,PW_F130_F054_MF130_F054)]  ; %6=tmp3_2H
+
+    vperm2i128  %7, %1, %1, 0x01        ; %7=in4_0
+    vpsignw     %1, %1, [GOTOFF(ebx,PW_1_NEG1)]
+    vpaddw      %7, %7, %1              ; %7=(in0+in4)_(in0-in4)
+
+    vpxor       %1, %1, %1
+    vpunpcklwd  %8, %1, %7              ; %8=tmp0_1L
+    vpunpckhwd  %1, %1, %7              ; %1=tmp0_1H
+    vpsrad      %8, %8, (16-CONST_BITS)  ; vpsrad %8,16 & vpslld %8,CONST_BITS
+    vpsrad      %1, %1, (16-CONST_BITS)  ; vpsrad %1,16 & vpslld %1,CONST_BITS
+
+    vpsubd      %3, %8, %5
+    vmovdqu     %11, %3                 ; %11=tmp0_1L-tmp3_2L=tmp13_12L
+    vpaddd      %3, %8, %5
+    vmovdqu     %9, %3                  ; %9=tmp0_1L+tmp3_2L=tmp10_11L
+    vpsubd      %3, %1, %6
+    vmovdqu     %12, %3                 ; %12=tmp0_1H-tmp3_2H=tmp13_12H
+    vpaddd      %3, %1, %6
+    vmovdqu     %10, %3                 ; %10=tmp0_1H+tmp3_2H=tmp10_11H
+
+    ; -- Odd part
+
+    vpaddw      %1, %4, %2              ; %1=in7_5+in3_1=z3_4
+
+    ; (Original)
+    ; z5 = (z3 + z4) * 1.175875602;
+    ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+    ; z3 += z5;  z4 += z5;
+    ;
+    ; (This implementation)
+    ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+    ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+    vperm2i128  %8, %1, %1, 0x01        ; %8=z4_3
+    vpunpcklwd  %7, %1, %8              ; %7=z34_43L
+    vpunpckhwd  %8, %1, %8              ; %8=z34_43H
+    vpmaddwd    %7, %7, [GOTOFF(ebx,PW_MF078_F117_F078_F117)]  ; %7=z3_4L
+    vpmaddwd    %8, %8, [GOTOFF(ebx,PW_MF078_F117_F078_F117)]  ; %8=z3_4H
+
+    ; (Original)
+    ; z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;
+    ; tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;
+    ; tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;
+    ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+    ; tmp0 += z1 + z3;  tmp1 += z2 + z4;
+    ; tmp2 += z2 + z3;  tmp3 += z1 + z4;
+    ;
+    ; (This implementation)
+    ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+    ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+    ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+    ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+    ; tmp0 += z3;  tmp1 += z4;
+    ; tmp2 += z3;  tmp3 += z4;
+
+    vperm2i128  %2, %2, %2, 0x01        ; %2=in1_3
+    vpunpcklwd  %3, %4, %2              ; %3=in71_53L
+    vpunpckhwd  %4, %4, %2              ; %4=in71_53H
+
+    vpmaddwd    %5, %3, [GOTOFF(ebx,PW_MF060_MF089_MF050_MF256)]  ; %5=tmp0_1L
+    vpmaddwd    %6, %4, [GOTOFF(ebx,PW_MF060_MF089_MF050_MF256)]  ; %6=tmp0_1H
+    vpaddd      %5, %5, %7              ; %5=tmp0_1L+z3_4L=tmp0_1L
+    vpaddd      %6, %6, %8              ; %6=tmp0_1H+z3_4H=tmp0_1H
+
+    vpmaddwd    %3, %3, [GOTOFF(ebx,PW_MF089_F060_MF256_F050)]  ; %3=tmp3_2L
+    vpmaddwd    %4, %4, [GOTOFF(ebx,PW_MF089_F060_MF256_F050)]  ; %4=tmp3_2H
+    vperm2i128  %7, %7, %7, 0x01        ; %7=z4_3L
+    vperm2i128  %8, %8, %8, 0x01        ; %8=z4_3H
+    vpaddd      %7, %3, %7              ; %7=tmp3_2L+z4_3L=tmp3_2L
+    vpaddd      %8, %4, %8              ; %8=tmp3_2H+z4_3H=tmp3_2H
+
+    ; -- Final output stage
+
+    vmovdqu     %3, %9
+    vmovdqu     %4, %10
+
+    vpaddd      %1, %3, %7              ; %1=tmp10_11L+tmp3_2L=data0_1L
+    vpaddd      %2, %4, %8              ; %2=tmp10_11H+tmp3_2H=data0_1H
+    vpaddd      %1, %1, [GOTOFF(ebx,PD_DESCALE_P %+ %13)]
+    vpaddd      %2, %2, [GOTOFF(ebx,PD_DESCALE_P %+ %13)]
+    vpsrad      %1, %1, DESCALE_P %+ %13
+    vpsrad      %2, %2, DESCALE_P %+ %13
+    vpackssdw   %1, %1, %2              ; %1=data0_1
+
+    vpsubd      %3, %3, %7              ; %3=tmp10_11L-tmp3_2L=data7_6L
+    vpsubd      %4, %4, %8              ; %4=tmp10_11H-tmp3_2H=data7_6H
+    vpaddd      %3, %3, [GOTOFF(ebx,PD_DESCALE_P %+ %13)]
+    vpaddd      %4, %4, [GOTOFF(ebx,PD_DESCALE_P %+ %13)]
+    vpsrad      %3, %3, DESCALE_P %+ %13
+    vpsrad      %4, %4, DESCALE_P %+ %13
+    vpackssdw   %4, %3, %4              ; %4=data7_6
+
+    vmovdqu     %7, %11
+    vmovdqu     %8, %12
+
+    vpaddd      %2, %7, %5              ; %7=tmp13_12L+tmp0_1L=data3_2L
+    vpaddd      %3, %8, %6              ; %8=tmp13_12H+tmp0_1H=data3_2H
+    vpaddd      %2, %2, [GOTOFF(ebx,PD_DESCALE_P %+ %13)]
+    vpaddd      %3, %3, [GOTOFF(ebx,PD_DESCALE_P %+ %13)]
+    vpsrad      %2, %2, DESCALE_P %+ %13
+    vpsrad      %3, %3, DESCALE_P %+ %13
+    vpackssdw   %2, %2, %3              ; %2=data3_2
+
+    vpsubd      %3, %7, %5              ; %7=tmp13_12L-tmp0_1L=data4_5L
+    vpsubd      %6, %8, %6              ; %8=tmp13_12H-tmp0_1H=data4_5H
+    vpaddd      %3, %3, [GOTOFF(ebx,PD_DESCALE_P %+ %13)]
+    vpaddd      %6, %6, [GOTOFF(ebx,PD_DESCALE_P %+ %13)]
+    vpsrad      %3, %3, DESCALE_P %+ %13
+    vpsrad      %6, %6, DESCALE_P %+ %13
+    vpackssdw   %3, %3, %6              ; %3=data4_5
+%endmacro
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_idct_islow_avx2)
+
+EXTN(jconst_idct_islow_avx2):
+
+PW_F130_F054_MF130_F054    times 4  dw  (F_0_541 + F_0_765),  F_0_541
+                           times 4  dw  (F_0_541 - F_1_847),  F_0_541
+PW_MF078_F117_F078_F117    times 4  dw  (F_1_175 - F_1_961),  F_1_175
+                           times 4  dw  (F_1_175 - F_0_390),  F_1_175
+PW_MF060_MF089_MF050_MF256 times 4  dw  (F_0_298 - F_0_899), -F_0_899
+                           times 4  dw  (F_2_053 - F_2_562), -F_2_562
+PW_MF089_F060_MF256_F050   times 4  dw -F_0_899, (F_1_501 - F_0_899)
+                           times 4  dw -F_2_562, (F_3_072 - F_2_562)
+PD_DESCALE_P1              times 8  dd  1 << (DESCALE_P1 - 1)
+PD_DESCALE_P2              times 8  dd  1 << (DESCALE_P2 - 1)
+PB_CENTERJSAMP             times 32 db  CENTERJSAMPLE
+PW_1_NEG1                  times 8  dw  1
+                           times 8  dw -1
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_islow_avx2(void *dct_table, JCOEFPTR coef_block,
+;                       JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define dct_table(b)   (b) + 8          ; jpeg_component_info *compptr
+%define coef_block(b)  (b) + 12         ; JCOEFPTR coef_block
+%define output_buf(b)  (b) + 16         ; JSAMPARRAY output_buf
+%define output_col(b)  (b) + 20         ; JDIMENSION output_col
+
+%define original_ebp   ebp + 0
+%define wk(i)          ebp - (WK_NUM - (i)) * SIZEOF_YMMWORD
+                                        ; ymmword wk[WK_NUM]
+%define WK_NUM         4
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_idct_islow_avx2)
+
+EXTN(jsimd_idct_islow_avx2):
+    push        ebp
+    mov         eax, esp                     ; eax = original ebp
+    sub         esp, byte 4
+    and         esp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [esp], eax
+    mov         ebp, esp                     ; ebp = aligned ebp
+    lea         esp, [wk(0)]
+    pushpic     ebx
+;   push        ecx                     ; unused
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+
+    ; ---- Pass 1: process columns.
+
+;   mov         eax, [original_ebp]
+    mov         edx, POINTER [dct_table(eax)]    ; quantptr
+    mov         esi, JCOEFPTR [coef_block(eax)]  ; inptr
+
+%ifndef NO_ZERO_COLUMN_TEST_ISLOW_AVX2
+    mov         eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    or          eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    jnz         near .columnDCT
+
+    movdqa      xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    vpor        xmm0, xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+    vpor        xmm1, xmm1, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+    vpor        xmm0, xmm0, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+    vpor        xmm1, xmm1, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+    vpor        xmm0, xmm0, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+    vpor        xmm1, xmm1, xmm0
+    vpacksswb   xmm1, xmm1, xmm1
+    vpacksswb   xmm1, xmm1, xmm1
+    movd        eax, xmm1
+    test        eax, eax
+    jnz         short .columnDCT
+
+    ; -- AC terms all zero
+
+    movdqa      xmm5, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+    vpmullw     xmm5, xmm5, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    vpsllw      xmm5, xmm5, PASS1_BITS
+
+    vpunpcklwd  xmm4, xmm5, xmm5        ; xmm4=(00 00 01 01 02 02 03 03)
+    vpunpckhwd  xmm5, xmm5, xmm5        ; xmm5=(04 04 05 05 06 06 07 07)
+    vinserti128 ymm4, ymm4, xmm5, 1
+
+    vpshufd     ymm0, ymm4, 0x00        ; ymm0=col0_4=(00 00 00 00 00 00 00 00  04 04 04 04 04 04 04 04)
+    vpshufd     ymm1, ymm4, 0x55        ; ymm1=col1_5=(01 01 01 01 01 01 01 01  05 05 05 05 05 05 05 05)
+    vpshufd     ymm2, ymm4, 0xAA        ; ymm2=col2_6=(02 02 02 02 02 02 02 02  06 06 06 06 06 06 06 06)
+    vpshufd     ymm3, ymm4, 0xFF        ; ymm3=col3_7=(03 03 03 03 03 03 03 03  07 07 07 07 07 07 07 07)
+
+    jmp         near .column_end
+    alignx      16, 7
+%endif
+.columnDCT:
+
+    vmovdqu     ymm4, YMMWORD [YMMBLOCK(0,0,esi,SIZEOF_JCOEF)]  ; ymm4=in0_1
+    vmovdqu     ymm5, YMMWORD [YMMBLOCK(2,0,esi,SIZEOF_JCOEF)]  ; ymm5=in2_3
+    vmovdqu     ymm6, YMMWORD [YMMBLOCK(4,0,esi,SIZEOF_JCOEF)]  ; ymm6=in4_5
+    vmovdqu     ymm7, YMMWORD [YMMBLOCK(6,0,esi,SIZEOF_JCOEF)]  ; ymm7=in6_7
+    vpmullw     ymm4, ymm4, YMMWORD [YMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    vpmullw     ymm5, ymm5, YMMWORD [YMMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    vpmullw     ymm6, ymm6, YMMWORD [YMMBLOCK(4,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    vpmullw     ymm7, ymm7, YMMWORD [YMMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    vperm2i128  ymm0, ymm4, ymm6, 0x20  ; ymm0=in0_4
+    vperm2i128  ymm1, ymm5, ymm4, 0x31  ; ymm1=in3_1
+    vperm2i128  ymm2, ymm5, ymm7, 0x20  ; ymm2=in2_6
+    vperm2i128  ymm3, ymm7, ymm6, 0x31  ; ymm3=in7_5
+
+    dodct ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, XMMWORD [wk(0)], XMMWORD [wk(1)], XMMWORD [wk(2)], XMMWORD [wk(3)], 1
+    ; ymm0=data0_1, ymm1=data3_2, ymm2=data4_5, ymm3=data7_6
+
+    dotranspose ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7
+    ; ymm0=data0_4, ymm1=data1_5, ymm2=data2_6, ymm3=data3_7
+
+.column_end:
+
+    ; -- Prefetch the next coefficient block
+
+    prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+    prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+    prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+    prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+    ; ---- Pass 2: process rows.
+
+    mov         eax, [original_ebp]
+    mov         edi, JSAMPARRAY [output_buf(eax)]  ; (JSAMPROW *)
+    mov         eax, JDIMENSION [output_col(eax)]
+
+    vperm2i128  ymm4, ymm3, ymm1, 0x31  ; ymm3=in7_5
+    vperm2i128  ymm1, ymm3, ymm1, 0x20  ; ymm1=in3_1
+
+    dodct ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7, XMMWORD [wk(0)], XMMWORD [wk(1)], XMMWORD [wk(2)], XMMWORD [wk(3)], 2
+    ; ymm0=data0_1, ymm1=data3_2, ymm2=data4_5, ymm4=data7_6
+
+    dotranspose ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7
+    ; ymm0=data0_4, ymm1=data1_5, ymm2=data2_6, ymm4=data3_7
+
+    vpacksswb   ymm0, ymm0, ymm1        ; ymm0=data01_45
+    vpacksswb   ymm1, ymm2, ymm4        ; ymm1=data23_67
+    vpaddb      ymm0, ymm0, [GOTOFF(ebx,PB_CENTERJSAMP)]
+    vpaddb      ymm1, ymm1, [GOTOFF(ebx,PB_CENTERJSAMP)]
+
+    vextracti128 xmm6, ymm1, 1          ; xmm3=data67
+    vextracti128 xmm4, ymm0, 1          ; xmm2=data45
+    vextracti128 xmm2, ymm1, 0          ; xmm1=data23
+    vextracti128 xmm0, ymm0, 0          ; xmm0=data01
+
+    vpshufd     xmm1, xmm0, 0x4E  ; xmm1=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
+    vpshufd     xmm3, xmm2, 0x4E  ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
+    vpshufd     xmm5, xmm4, 0x4E  ; xmm5=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
+    vpshufd     xmm7, xmm6, 0x4E  ; xmm7=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
+
+    vzeroupper
+
+    mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    movq        XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm0
+    movq        XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm1
+
+    mov         edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    movq        XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm2
+    movq        XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm3
+
+    mov         edx, JSAMPROW [edi+4*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         esi, JSAMPROW [edi+5*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    movq        XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm4
+    movq        XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm5
+
+    mov         edx, JSAMPROW [edi+6*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         esi, JSAMPROW [edi+7*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    movq        XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6
+    movq        XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm7
+
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; unused
+    poppic      ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/i386/jidctint-mmx.asm b/media/libjpeg/simd/i386/jidctint-mmx.asm
new file mode 100644
index 0000000000..f15c8d34bc
--- /dev/null
+++ b/media/libjpeg/simd/i386/jidctint-mmx.asm
@@ -0,0 +1,851 @@
+;
+; jidctint.asm - accurate integer IDCT (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, 2020, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a slower but more accurate integer implementation of the
+; inverse DCT (Discrete Cosine Transform). The following code is based
+; directly on the IJG's original jidctint.c; see the jidctint.c for
+; more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS  13
+%define PASS1_BITS  2
+
+%define DESCALE_P1  (CONST_BITS - PASS1_BITS)
+%define DESCALE_P2  (CONST_BITS + PASS1_BITS + 3)
+
+%if CONST_BITS == 13
+F_0_298 equ  2446  ; FIX(0.298631336)
+F_0_390 equ  3196  ; FIX(0.390180644)
+F_0_541 equ  4433  ; FIX(0.541196100)
+F_0_765 equ  6270  ; FIX(0.765366865)
+F_0_899 equ  7373  ; FIX(0.899976223)
+F_1_175 equ  9633  ; FIX(1.175875602)
+F_1_501 equ 12299  ; FIX(1.501321110)
+F_1_847 equ 15137  ; FIX(1.847759065)
+F_1_961 equ 16069  ; FIX(1.961570560)
+F_2_053 equ 16819  ; FIX(2.053119869)
+F_2_562 equ 20995  ; FIX(2.562915447)
+F_3_072 equ 25172  ; FIX(3.072711026)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x, n)  (((x) + (1 << ((n) - 1))) >> (n))
+F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS)  ; FIX(0.298631336)
+F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS)  ; FIX(0.390180644)
+F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS)  ; FIX(0.541196100)
+F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS)  ; FIX(0.765366865)
+F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS)  ; FIX(0.899976223)
+F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS)  ; FIX(1.175875602)
+F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS)  ; FIX(1.501321110)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS)  ; FIX(1.847759065)
+F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS)  ; FIX(1.961570560)
+F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS)  ; FIX(2.053119869)
+F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS)  ; FIX(2.562915447)
+F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS)  ; FIX(3.072711026)
+%endif
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_idct_islow_mmx)
+
+EXTN(jconst_idct_islow_mmx):
+
+PW_F130_F054   times 2 dw  (F_0_541 + F_0_765),  F_0_541
+PW_F054_MF130  times 2 dw  F_0_541, (F_0_541 - F_1_847)
+PW_MF078_F117  times 2 dw  (F_1_175 - F_1_961),  F_1_175
+PW_F117_F078   times 2 dw  F_1_175, (F_1_175 - F_0_390)
+PW_MF060_MF089 times 2 dw  (F_0_298 - F_0_899), -F_0_899
+PW_MF089_F060  times 2 dw -F_0_899, (F_1_501 - F_0_899)
+PW_MF050_MF256 times 2 dw  (F_2_053 - F_2_562), -F_2_562
+PW_MF256_F050  times 2 dw -F_2_562, (F_3_072 - F_2_562)
+PD_DESCALE_P1  times 2 dd  1 << (DESCALE_P1 - 1)
+PD_DESCALE_P2  times 2 dd  1 << (DESCALE_P2 - 1)
+PB_CENTERJSAMP times 8 db  CENTERJSAMPLE
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_islow_mmx(void *dct_table, JCOEFPTR coef_block,
+;                      JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define dct_table(b)   (b) + 8          ; jpeg_component_info *compptr
+%define coef_block(b)  (b) + 12         ; JCOEFPTR coef_block
+%define output_buf(b)  (b) + 16         ; JSAMPARRAY output_buf
+%define output_col(b)  (b) + 20         ; JDIMENSION output_col
+
+%define original_ebp   ebp + 0
+%define wk(i)          ebp - (WK_NUM - (i)) * SIZEOF_MMWORD
+                                        ; mmword wk[WK_NUM]
+%define WK_NUM         12
+%define workspace      wk(0) - DCTSIZE2 * SIZEOF_JCOEF
+                                        ; JCOEF workspace[DCTSIZE2]
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_idct_islow_mmx)
+
+EXTN(jsimd_idct_islow_mmx):
+    push        ebp
+    mov         eax, esp                    ; eax = original ebp
+    sub         esp, byte 4
+    and         esp, byte (-SIZEOF_MMWORD)  ; align to 64 bits
+    mov         [esp], eax
+    mov         ebp, esp                    ; ebp = aligned ebp
+    lea         esp, [workspace]
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+
+    ; ---- Pass 1: process columns from input, store into work array.
+
+;   mov         eax, [original_ebp]
+    mov         edx, POINTER [dct_table(eax)]    ; quantptr
+    mov         esi, JCOEFPTR [coef_block(eax)]  ; inptr
+    lea         edi, [workspace]                 ; JCOEF *wsptr
+    mov         ecx, DCTSIZE/4                   ; ctr
+    alignx      16, 7
+.columnloop:
+%ifndef NO_ZERO_COLUMN_TEST_ISLOW_MMX
+    mov         eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    or          eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    jnz         short .columnDCT
+
+    movq        mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    movq        mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    por         mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+    por         mm1, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+    por         mm0, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+    por         mm1, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+    por         mm0, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+    por         mm1, mm0
+    packsswb    mm1, mm1
+    movd        eax, mm1
+    test        eax, eax
+    jnz         short .columnDCT
+
+    ; -- AC terms all zero
+
+    movq        mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+    pmullw      mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    psllw       mm0, PASS1_BITS
+
+    movq        mm2, mm0                ; mm0=in0=(00 01 02 03)
+    punpcklwd   mm0, mm0                ; mm0=(00 00 01 01)
+    punpckhwd   mm2, mm2                ; mm2=(02 02 03 03)
+
+    movq        mm1, mm0
+    punpckldq   mm0, mm0                ; mm0=(00 00 00 00)
+    punpckhdq   mm1, mm1                ; mm1=(01 01 01 01)
+    movq        mm3, mm2
+    punpckldq   mm2, mm2                ; mm2=(02 02 02 02)
+    punpckhdq   mm3, mm3                ; mm3=(03 03 03 03)
+
+    movq        MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
+    movq        MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm0
+    movq        MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1
+    movq        MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm1
+    movq        MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2
+    movq        MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm2
+    movq        MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
+    movq        MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm3
+    jmp         near .nextcolumn
+    alignx      16, 7
+%endif
+.columnDCT:
+
+    ; -- Even part
+
+    movq        mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+    movq        mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    pmullw      mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    movq        mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+    movq        mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+    pmullw      mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      mm3, MMWORD [MMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    ; (Original)
+    ; z1 = (z2 + z3) * 0.541196100;
+    ; tmp2 = z1 + z3 * -1.847759065;
+    ; tmp3 = z1 + z2 * 0.765366865;
+    ;
+    ; (This implementation)
+    ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
+    ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
+
+    movq        mm4, mm1                ; mm1=in2=z2
+    movq        mm5, mm1
+    punpcklwd   mm4, mm3                ; mm3=in6=z3
+    punpckhwd   mm5, mm3
+    movq        mm1, mm4
+    movq        mm3, mm5
+    pmaddwd     mm4, [GOTOFF(ebx,PW_F130_F054)]   ; mm4=tmp3L
+    pmaddwd     mm5, [GOTOFF(ebx,PW_F130_F054)]   ; mm5=tmp3H
+    pmaddwd     mm1, [GOTOFF(ebx,PW_F054_MF130)]  ; mm1=tmp2L
+    pmaddwd     mm3, [GOTOFF(ebx,PW_F054_MF130)]  ; mm3=tmp2H
+
+    movq        mm6, mm0
+    paddw       mm0, mm2                ; mm0=in0+in4
+    psubw       mm6, mm2                ; mm6=in0-in4
+
+    pxor        mm7, mm7
+    pxor        mm2, mm2
+    punpcklwd   mm7, mm0                ; mm7=tmp0L
+    punpckhwd   mm2, mm0                ; mm2=tmp0H
+    psrad       mm7, (16-CONST_BITS)    ; psrad mm7,16 & pslld mm7,CONST_BITS
+    psrad       mm2, (16-CONST_BITS)    ; psrad mm2,16 & pslld mm2,CONST_BITS
+
+    movq        mm0, mm7
+    paddd       mm7, mm4                ; mm7=tmp10L
+    psubd       mm0, mm4                ; mm0=tmp13L
+    movq        mm4, mm2
+    paddd       mm2, mm5                ; mm2=tmp10H
+    psubd       mm4, mm5                ; mm4=tmp13H
+
+    movq        MMWORD [wk(0)], mm7     ; wk(0)=tmp10L
+    movq        MMWORD [wk(1)], mm2     ; wk(1)=tmp10H
+    movq        MMWORD [wk(2)], mm0     ; wk(2)=tmp13L
+    movq        MMWORD [wk(3)], mm4     ; wk(3)=tmp13H
+
+    pxor        mm5, mm5
+    pxor        mm7, mm7
+    punpcklwd   mm5, mm6                ; mm5=tmp1L
+    punpckhwd   mm7, mm6                ; mm7=tmp1H
+    psrad       mm5, (16-CONST_BITS)    ; psrad mm5,16 & pslld mm5,CONST_BITS
+    psrad       mm7, (16-CONST_BITS)    ; psrad mm7,16 & pslld mm7,CONST_BITS
+
+    movq        mm2, mm5
+    paddd       mm5, mm1                ; mm5=tmp11L
+    psubd       mm2, mm1                ; mm2=tmp12L
+    movq        mm0, mm7
+    paddd       mm7, mm3                ; mm7=tmp11H
+    psubd       mm0, mm3                ; mm0=tmp12H
+
+    movq        MMWORD [wk(4)], mm5     ; wk(4)=tmp11L
+    movq        MMWORD [wk(5)], mm7     ; wk(5)=tmp11H
+    movq        MMWORD [wk(6)], mm2     ; wk(6)=tmp12L
+    movq        MMWORD [wk(7)], mm0     ; wk(7)=tmp12H
+
+    ; -- Odd part
+
+    movq        mm4, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    movq        mm6, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+    pmullw      mm4, MMWORD [MMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      mm6, MMWORD [MMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    movq        mm1, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+    movq        mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+    pmullw      mm1, MMWORD [MMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    movq        mm5, mm6
+    movq        mm7, mm4
+    paddw       mm5, mm3                ; mm5=z3
+    paddw       mm7, mm1                ; mm7=z4
+
+    ; (Original)
+    ; z5 = (z3 + z4) * 1.175875602;
+    ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+    ; z3 += z5;  z4 += z5;
+    ;
+    ; (This implementation)
+    ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+    ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+    movq        mm2, mm5
+    movq        mm0, mm5
+    punpcklwd   mm2, mm7
+    punpckhwd   mm0, mm7
+    movq        mm5, mm2
+    movq        mm7, mm0
+    pmaddwd     mm2, [GOTOFF(ebx,PW_MF078_F117)]  ; mm2=z3L
+    pmaddwd     mm0, [GOTOFF(ebx,PW_MF078_F117)]  ; mm0=z3H
+    pmaddwd     mm5, [GOTOFF(ebx,PW_F117_F078)]   ; mm5=z4L
+    pmaddwd     mm7, [GOTOFF(ebx,PW_F117_F078)]   ; mm7=z4H
+
+    movq        MMWORD [wk(10)], mm2    ; wk(10)=z3L
+    movq        MMWORD [wk(11)], mm0    ; wk(11)=z3H
+
+    ; (Original)
+    ; z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;
+    ; tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;
+    ; tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;
+    ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+    ; tmp0 += z1 + z3;  tmp1 += z2 + z4;
+    ; tmp2 += z2 + z3;  tmp3 += z1 + z4;
+    ;
+    ; (This implementation)
+    ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+    ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+    ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+    ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+    ; tmp0 += z3;  tmp1 += z4;
+    ; tmp2 += z3;  tmp3 += z4;
+
+    movq        mm2, mm3
+    movq        mm0, mm3
+    punpcklwd   mm2, mm4
+    punpckhwd   mm0, mm4
+    movq        mm3, mm2
+    movq        mm4, mm0
+    pmaddwd     mm2, [GOTOFF(ebx,PW_MF060_MF089)]  ; mm2=tmp0L
+    pmaddwd     mm0, [GOTOFF(ebx,PW_MF060_MF089)]  ; mm0=tmp0H
+    pmaddwd     mm3, [GOTOFF(ebx,PW_MF089_F060)]   ; mm3=tmp3L
+    pmaddwd     mm4, [GOTOFF(ebx,PW_MF089_F060)]   ; mm4=tmp3H
+
+    paddd       mm2, MMWORD [wk(10)]    ; mm2=tmp0L
+    paddd       mm0, MMWORD [wk(11)]    ; mm0=tmp0H
+    paddd       mm3, mm5                ; mm3=tmp3L
+    paddd       mm4, mm7                ; mm4=tmp3H
+
+    movq        MMWORD [wk(8)], mm2     ; wk(8)=tmp0L
+    movq        MMWORD [wk(9)], mm0     ; wk(9)=tmp0H
+
+    movq        mm2, mm1
+    movq        mm0, mm1
+    punpcklwd   mm2, mm6
+    punpckhwd   mm0, mm6
+    movq        mm1, mm2
+    movq        mm6, mm0
+    pmaddwd     mm2, [GOTOFF(ebx,PW_MF050_MF256)]  ; mm2=tmp1L
+    pmaddwd     mm0, [GOTOFF(ebx,PW_MF050_MF256)]  ; mm0=tmp1H
+    pmaddwd     mm1, [GOTOFF(ebx,PW_MF256_F050)]   ; mm1=tmp2L
+    pmaddwd     mm6, [GOTOFF(ebx,PW_MF256_F050)]   ; mm6=tmp2H
+
+    paddd       mm2, mm5                ; mm2=tmp1L
+    paddd       mm0, mm7                ; mm0=tmp1H
+    paddd       mm1, MMWORD [wk(10)]    ; mm1=tmp2L
+    paddd       mm6, MMWORD [wk(11)]    ; mm6=tmp2H
+
+    movq        MMWORD [wk(10)], mm2    ; wk(10)=tmp1L
+    movq        MMWORD [wk(11)], mm0    ; wk(11)=tmp1H
+
+    ; -- Final output stage
+
+    movq        mm5, MMWORD [wk(0)]     ; mm5=tmp10L
+    movq        mm7, MMWORD [wk(1)]     ; mm7=tmp10H
+
+    movq        mm2, mm5
+    movq        mm0, mm7
+    paddd       mm5, mm3                ; mm5=data0L
+    paddd       mm7, mm4                ; mm7=data0H
+    psubd       mm2, mm3                ; mm2=data7L
+    psubd       mm0, mm4                ; mm0=data7H
+
+    movq        mm3, [GOTOFF(ebx,PD_DESCALE_P1)]  ; mm3=[PD_DESCALE_P1]
+
+    paddd       mm5, mm3
+    paddd       mm7, mm3
+    psrad       mm5, DESCALE_P1
+    psrad       mm7, DESCALE_P1
+    paddd       mm2, mm3
+    paddd       mm0, mm3
+    psrad       mm2, DESCALE_P1
+    psrad       mm0, DESCALE_P1
+
+    packssdw    mm5, mm7                ; mm5=data0=(00 01 02 03)
+    packssdw    mm2, mm0                ; mm2=data7=(70 71 72 73)
+
+    movq        mm4, MMWORD [wk(4)]     ; mm4=tmp11L
+    movq        mm3, MMWORD [wk(5)]     ; mm3=tmp11H
+
+    movq        mm7, mm4
+    movq        mm0, mm3
+    paddd       mm4, mm1                ; mm4=data1L
+    paddd       mm3, mm6                ; mm3=data1H
+    psubd       mm7, mm1                ; mm7=data6L
+    psubd       mm0, mm6                ; mm0=data6H
+
+    movq        mm1, [GOTOFF(ebx,PD_DESCALE_P1)]  ; mm1=[PD_DESCALE_P1]
+
+    paddd       mm4, mm1
+    paddd       mm3, mm1
+    psrad       mm4, DESCALE_P1
+    psrad       mm3, DESCALE_P1
+    paddd       mm7, mm1
+    paddd       mm0, mm1
+    psrad       mm7, DESCALE_P1
+    psrad       mm0, DESCALE_P1
+
+    packssdw    mm4, mm3                ; mm4=data1=(10 11 12 13)
+    packssdw    mm7, mm0                ; mm7=data6=(60 61 62 63)
+
+    movq        mm6, mm5                ; transpose coefficients(phase 1)
+    punpcklwd   mm5, mm4                ; mm5=(00 10 01 11)
+    punpckhwd   mm6, mm4                ; mm6=(02 12 03 13)
+    movq        mm1, mm7                ; transpose coefficients(phase 1)
+    punpcklwd   mm7, mm2                ; mm7=(60 70 61 71)
+    punpckhwd   mm1, mm2                ; mm1=(62 72 63 73)
+
+    movq        mm3, MMWORD [wk(6)]     ; mm3=tmp12L
+    movq        mm0, MMWORD [wk(7)]     ; mm0=tmp12H
+    movq        mm4, MMWORD [wk(10)]    ; mm4=tmp1L
+    movq        mm2, MMWORD [wk(11)]    ; mm2=tmp1H
+
+    movq        MMWORD [wk(0)], mm5     ; wk(0)=(00 10 01 11)
+    movq        MMWORD [wk(1)], mm6     ; wk(1)=(02 12 03 13)
+    movq        MMWORD [wk(4)], mm7     ; wk(4)=(60 70 61 71)
+    movq        MMWORD [wk(5)], mm1     ; wk(5)=(62 72 63 73)
+
+    movq        mm5, mm3
+    movq        mm6, mm0
+    paddd       mm3, mm4                ; mm3=data2L
+    paddd       mm0, mm2                ; mm0=data2H
+    psubd       mm5, mm4                ; mm5=data5L
+    psubd       mm6, mm2                ; mm6=data5H
+
+    movq        mm7, [GOTOFF(ebx,PD_DESCALE_P1)]  ; mm7=[PD_DESCALE_P1]
+
+    paddd       mm3, mm7
+    paddd       mm0, mm7
+    psrad       mm3, DESCALE_P1
+    psrad       mm0, DESCALE_P1
+    paddd       mm5, mm7
+    paddd       mm6, mm7
+    psrad       mm5, DESCALE_P1
+    psrad       mm6, DESCALE_P1
+
+    packssdw    mm3, mm0                ; mm3=data2=(20 21 22 23)
+    packssdw    mm5, mm6                ; mm5=data5=(50 51 52 53)
+
+    movq        mm1, MMWORD [wk(2)]     ; mm1=tmp13L
+    movq        mm4, MMWORD [wk(3)]     ; mm4=tmp13H
+    movq        mm2, MMWORD [wk(8)]     ; mm2=tmp0L
+    movq        mm7, MMWORD [wk(9)]     ; mm7=tmp0H
+
+    movq        mm0, mm1
+    movq        mm6, mm4
+    paddd       mm1, mm2                ; mm1=data3L
+    paddd       mm4, mm7                ; mm4=data3H
+    psubd       mm0, mm2                ; mm0=data4L
+    psubd       mm6, mm7                ; mm6=data4H
+
+    movq        mm2, [GOTOFF(ebx,PD_DESCALE_P1)]  ; mm2=[PD_DESCALE_P1]
+
+    paddd       mm1, mm2
+    paddd       mm4, mm2
+    psrad       mm1, DESCALE_P1
+    psrad       mm4, DESCALE_P1
+    paddd       mm0, mm2
+    paddd       mm6, mm2
+    psrad       mm0, DESCALE_P1
+    psrad       mm6, DESCALE_P1
+
+    packssdw    mm1, mm4                ; mm1=data3=(30 31 32 33)
+    packssdw    mm0, mm6                ; mm0=data4=(40 41 42 43)
+
+    movq        mm7, MMWORD [wk(0)]     ; mm7=(00 10 01 11)
+    movq        mm2, MMWORD [wk(1)]     ; mm2=(02 12 03 13)
+
+    movq        mm4, mm3                ; transpose coefficients(phase 1)
+    punpcklwd   mm3, mm1                ; mm3=(20 30 21 31)
+    punpckhwd   mm4, mm1                ; mm4=(22 32 23 33)
+    movq        mm6, mm0                ; transpose coefficients(phase 1)
+    punpcklwd   mm0, mm5                ; mm0=(40 50 41 51)
+    punpckhwd   mm6, mm5                ; mm6=(42 52 43 53)
+
+    movq        mm1, mm7                ; transpose coefficients(phase 2)
+    punpckldq   mm7, mm3                ; mm7=(00 10 20 30)
+    punpckhdq   mm1, mm3                ; mm1=(01 11 21 31)
+    movq        mm5, mm2                ; transpose coefficients(phase 2)
+    punpckldq   mm2, mm4                ; mm2=(02 12 22 32)
+    punpckhdq   mm5, mm4                ; mm5=(03 13 23 33)
+
+    movq        mm3, MMWORD [wk(4)]     ; mm3=(60 70 61 71)
+    movq        mm4, MMWORD [wk(5)]     ; mm4=(62 72 63 73)
+
+    movq        MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm7
+    movq        MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1
+    movq        MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2
+    movq        MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm5
+
+    movq        mm7, mm0                ; transpose coefficients(phase 2)
+    punpckldq   mm0, mm3                ; mm0=(40 50 60 70)
+    punpckhdq   mm7, mm3                ; mm7=(41 51 61 71)
+    movq        mm1, mm6                ; transpose coefficients(phase 2)
+    punpckldq   mm6, mm4                ; mm6=(42 52 62 72)
+    punpckhdq   mm1, mm4                ; mm1=(43 53 63 73)
+
+    movq        MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm0
+    movq        MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm7
+    movq        MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm6
+    movq        MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm1
+
+.nextcolumn:
+    add         esi, byte 4*SIZEOF_JCOEF            ; coef_block
+    add         edx, byte 4*SIZEOF_ISLOW_MULT_TYPE  ; quantptr
+    add         edi, byte 4*DCTSIZE*SIZEOF_JCOEF    ; wsptr
+    dec         ecx                                 ; ctr
+    jnz         near .columnloop
+
+    ; ---- Pass 2: process rows from work array, store into output array.
+
+    mov         eax, [original_ebp]
+    lea         esi, [workspace]                   ; JCOEF *wsptr
+    mov         edi, JSAMPARRAY [output_buf(eax)]  ; (JSAMPROW *)
+    mov         eax, JDIMENSION [output_col(eax)]
+    mov         ecx, DCTSIZE/4                     ; ctr
+    alignx      16, 7
+.rowloop:
+
+    ; -- Even part
+
+    movq        mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+    movq        mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    movq        mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+    movq        mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+
+    ; (Original)
+    ; z1 = (z2 + z3) * 0.541196100;
+    ; tmp2 = z1 + z3 * -1.847759065;
+    ; tmp3 = z1 + z2 * 0.765366865;
+    ;
+    ; (This implementation)
+    ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
+    ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
+
+    movq        mm4, mm1                ; mm1=in2=z2
+    movq        mm5, mm1
+    punpcklwd   mm4, mm3                ; mm3=in6=z3
+    punpckhwd   mm5, mm3
+    movq        mm1, mm4
+    movq        mm3, mm5
+    pmaddwd     mm4, [GOTOFF(ebx,PW_F130_F054)]   ; mm4=tmp3L
+    pmaddwd     mm5, [GOTOFF(ebx,PW_F130_F054)]   ; mm5=tmp3H
+    pmaddwd     mm1, [GOTOFF(ebx,PW_F054_MF130)]  ; mm1=tmp2L
+    pmaddwd     mm3, [GOTOFF(ebx,PW_F054_MF130)]  ; mm3=tmp2H
+
+    movq        mm6, mm0
+    paddw       mm0, mm2                ; mm0=in0+in4
+    psubw       mm6, mm2                ; mm6=in0-in4
+
+    pxor        mm7, mm7
+    pxor        mm2, mm2
+    punpcklwd   mm7, mm0                ; mm7=tmp0L
+    punpckhwd   mm2, mm0                ; mm2=tmp0H
+    psrad       mm7, (16-CONST_BITS)    ; psrad mm7,16 & pslld mm7,CONST_BITS
+    psrad       mm2, (16-CONST_BITS)    ; psrad mm2,16 & pslld mm2,CONST_BITS
+
+    movq        mm0, mm7
+    paddd       mm7, mm4                ; mm7=tmp10L
+    psubd       mm0, mm4                ; mm0=tmp13L
+    movq        mm4, mm2
+    paddd       mm2, mm5                ; mm2=tmp10H
+    psubd       mm4, mm5                ; mm4=tmp13H
+
+    movq        MMWORD [wk(0)], mm7     ; wk(0)=tmp10L
+    movq        MMWORD [wk(1)], mm2     ; wk(1)=tmp10H
+    movq        MMWORD [wk(2)], mm0     ; wk(2)=tmp13L
+    movq        MMWORD [wk(3)], mm4     ; wk(3)=tmp13H
+
+    pxor        mm5, mm5
+    pxor        mm7, mm7
+    punpcklwd   mm5, mm6                ; mm5=tmp1L
+    punpckhwd   mm7, mm6                ; mm7=tmp1H
+    psrad       mm5, (16-CONST_BITS)    ; psrad mm5,16 & pslld mm5,CONST_BITS
+    psrad       mm7, (16-CONST_BITS)    ; psrad mm7,16 & pslld mm7,CONST_BITS
+
+    movq        mm2, mm5
+    paddd       mm5, mm1                ; mm5=tmp11L
+    psubd       mm2, mm1                ; mm2=tmp12L
+    movq        mm0, mm7
+    paddd       mm7, mm3                ; mm7=tmp11H
+    psubd       mm0, mm3                ; mm0=tmp12H
+
+    movq        MMWORD [wk(4)], mm5     ; wk(4)=tmp11L
+    movq        MMWORD [wk(5)], mm7     ; wk(5)=tmp11H
+    movq        MMWORD [wk(6)], mm2     ; wk(6)=tmp12L
+    movq        MMWORD [wk(7)], mm0     ; wk(7)=tmp12H
+
+    ; -- Odd part
+
+    movq        mm4, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    movq        mm6, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+    movq        mm1, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+    movq        mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+
+    movq        mm5, mm6
+    movq        mm7, mm4
+    paddw       mm5, mm3                ; mm5=z3
+    paddw       mm7, mm1                ; mm7=z4
+
+    ; (Original)
+    ; z5 = (z3 + z4) * 1.175875602;
+    ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+    ; z3 += z5;  z4 += z5;
+    ;
+    ; (This implementation)
+    ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+    ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+    movq        mm2, mm5
+    movq        mm0, mm5
+    punpcklwd   mm2, mm7
+    punpckhwd   mm0, mm7
+    movq        mm5, mm2
+    movq        mm7, mm0
+    pmaddwd     mm2, [GOTOFF(ebx,PW_MF078_F117)]  ; mm2=z3L
+    pmaddwd     mm0, [GOTOFF(ebx,PW_MF078_F117)]  ; mm0=z3H
+    pmaddwd     mm5, [GOTOFF(ebx,PW_F117_F078)]   ; mm5=z4L
+    pmaddwd     mm7, [GOTOFF(ebx,PW_F117_F078)]   ; mm7=z4H
+
+    movq        MMWORD [wk(10)], mm2    ; wk(10)=z3L
+    movq        MMWORD [wk(11)], mm0    ; wk(11)=z3H
+
+    ; (Original)
+    ; z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;
+    ; tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;
+    ; tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;
+    ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+    ; tmp0 += z1 + z3;  tmp1 += z2 + z4;
+    ; tmp2 += z2 + z3;  tmp3 += z1 + z4;
+    ;
+    ; (This implementation)
+    ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+    ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+    ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+    ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+    ; tmp0 += z3;  tmp1 += z4;
+    ; tmp2 += z3;  tmp3 += z4;
+
+    movq        mm2, mm3
+    movq        mm0, mm3
+    punpcklwd   mm2, mm4
+    punpckhwd   mm0, mm4
+    movq        mm3, mm2
+    movq        mm4, mm0
+    pmaddwd     mm2, [GOTOFF(ebx,PW_MF060_MF089)]  ; mm2=tmp0L
+    pmaddwd     mm0, [GOTOFF(ebx,PW_MF060_MF089)]  ; mm0=tmp0H
+    pmaddwd     mm3, [GOTOFF(ebx,PW_MF089_F060)]   ; mm3=tmp3L
+    pmaddwd     mm4, [GOTOFF(ebx,PW_MF089_F060)]   ; mm4=tmp3H
+
+    paddd       mm2, MMWORD [wk(10)]    ; mm2=tmp0L
+    paddd       mm0, MMWORD [wk(11)]    ; mm0=tmp0H
+    paddd       mm3, mm5                ; mm3=tmp3L
+    paddd       mm4, mm7                ; mm4=tmp3H
+
+    movq        MMWORD [wk(8)], mm2     ; wk(8)=tmp0L
+    movq        MMWORD [wk(9)], mm0     ; wk(9)=tmp0H
+
+    movq        mm2, mm1
+    movq        mm0, mm1
+    punpcklwd   mm2, mm6
+    punpckhwd   mm0, mm6
+    movq        mm1, mm2
+    movq        mm6, mm0
+    pmaddwd     mm2, [GOTOFF(ebx,PW_MF050_MF256)]  ; mm2=tmp1L
+    pmaddwd     mm0, [GOTOFF(ebx,PW_MF050_MF256)]  ; mm0=tmp1H
+    pmaddwd     mm1, [GOTOFF(ebx,PW_MF256_F050)]   ; mm1=tmp2L
+    pmaddwd     mm6, [GOTOFF(ebx,PW_MF256_F050)]   ; mm6=tmp2H
+
+    paddd       mm2, mm5                ; mm2=tmp1L
+    paddd       mm0, mm7                ; mm0=tmp1H
+    paddd       mm1, MMWORD [wk(10)]    ; mm1=tmp2L
+    paddd       mm6, MMWORD [wk(11)]    ; mm6=tmp2H
+
+    movq        MMWORD [wk(10)], mm2    ; wk(10)=tmp1L
+    movq        MMWORD [wk(11)], mm0    ; wk(11)=tmp1H
+
+    ; -- Final output stage
+
+    movq        mm5, MMWORD [wk(0)]     ; mm5=tmp10L
+    movq        mm7, MMWORD [wk(1)]     ; mm7=tmp10H
+
+    movq        mm2, mm5
+    movq        mm0, mm7
+    paddd       mm5, mm3                ; mm5=data0L
+    paddd       mm7, mm4                ; mm7=data0H
+    psubd       mm2, mm3                ; mm2=data7L
+    psubd       mm0, mm4                ; mm0=data7H
+
+    movq        mm3, [GOTOFF(ebx,PD_DESCALE_P2)]  ; mm3=[PD_DESCALE_P2]
+
+    paddd       mm5, mm3
+    paddd       mm7, mm3
+    psrad       mm5, DESCALE_P2
+    psrad       mm7, DESCALE_P2
+    paddd       mm2, mm3
+    paddd       mm0, mm3
+    psrad       mm2, DESCALE_P2
+    psrad       mm0, DESCALE_P2
+
+    packssdw    mm5, mm7                ; mm5=data0=(00 10 20 30)
+    packssdw    mm2, mm0                ; mm2=data7=(07 17 27 37)
+
+    movq        mm4, MMWORD [wk(4)]     ; mm4=tmp11L
+    movq        mm3, MMWORD [wk(5)]     ; mm3=tmp11H
+
+    movq        mm7, mm4
+    movq        mm0, mm3
+    paddd       mm4, mm1                ; mm4=data1L
+    paddd       mm3, mm6                ; mm3=data1H
+    psubd       mm7, mm1                ; mm7=data6L
+    psubd       mm0, mm6                ; mm0=data6H
+
+    movq        mm1, [GOTOFF(ebx,PD_DESCALE_P2)]  ; mm1=[PD_DESCALE_P2]
+
+    paddd       mm4, mm1
+    paddd       mm3, mm1
+    psrad       mm4, DESCALE_P2
+    psrad       mm3, DESCALE_P2
+    paddd       mm7, mm1
+    paddd       mm0, mm1
+    psrad       mm7, DESCALE_P2
+    psrad       mm0, DESCALE_P2
+
+    packssdw    mm4, mm3                ; mm4=data1=(01 11 21 31)
+    packssdw    mm7, mm0                ; mm7=data6=(06 16 26 36)
+
+    packsswb    mm5, mm7                ; mm5=(00 10 20 30 06 16 26 36)
+    packsswb    mm4, mm2                ; mm4=(01 11 21 31 07 17 27 37)
+
+    movq        mm6, MMWORD [wk(6)]     ; mm6=tmp12L
+    movq        mm1, MMWORD [wk(7)]     ; mm1=tmp12H
+    movq        mm3, MMWORD [wk(10)]    ; mm3=tmp1L
+    movq        mm0, MMWORD [wk(11)]    ; mm0=tmp1H
+
+    movq        MMWORD [wk(0)], mm5     ; wk(0)=(00 10 20 30 06 16 26 36)
+    movq        MMWORD [wk(1)], mm4     ; wk(1)=(01 11 21 31 07 17 27 37)
+
+    movq        mm7, mm6
+    movq        mm2, mm1
+    paddd       mm6, mm3                ; mm6=data2L
+    paddd       mm1, mm0                ; mm1=data2H
+    psubd       mm7, mm3                ; mm7=data5L
+    psubd       mm2, mm0                ; mm2=data5H
+
+    movq        mm5, [GOTOFF(ebx,PD_DESCALE_P2)]  ; mm5=[PD_DESCALE_P2]
+
+    paddd       mm6, mm5
+    paddd       mm1, mm5
+    psrad       mm6, DESCALE_P2
+    psrad       mm1, DESCALE_P2
+    paddd       mm7, mm5
+    paddd       mm2, mm5
+    psrad       mm7, DESCALE_P2
+    psrad       mm2, DESCALE_P2
+
+    packssdw    mm6, mm1                ; mm6=data2=(02 12 22 32)
+    packssdw    mm7, mm2                ; mm7=data5=(05 15 25 35)
+
+    movq        mm4, MMWORD [wk(2)]     ; mm4=tmp13L
+    movq        mm3, MMWORD [wk(3)]     ; mm3=tmp13H
+    movq        mm0, MMWORD [wk(8)]     ; mm0=tmp0L
+    movq        mm5, MMWORD [wk(9)]     ; mm5=tmp0H
+
+    movq        mm1, mm4
+    movq        mm2, mm3
+    paddd       mm4, mm0                ; mm4=data3L
+    paddd       mm3, mm5                ; mm3=data3H
+    psubd       mm1, mm0                ; mm1=data4L
+    psubd       mm2, mm5                ; mm2=data4H
+
+    movq        mm0, [GOTOFF(ebx,PD_DESCALE_P2)]  ; mm0=[PD_DESCALE_P2]
+
+    paddd       mm4, mm0
+    paddd       mm3, mm0
+    psrad       mm4, DESCALE_P2
+    psrad       mm3, DESCALE_P2
+    paddd       mm1, mm0
+    paddd       mm2, mm0
+    psrad       mm1, DESCALE_P2
+    psrad       mm2, DESCALE_P2
+
+    movq        mm5, [GOTOFF(ebx,PB_CENTERJSAMP)]  ; mm5=[PB_CENTERJSAMP]
+
+    packssdw    mm4, mm3                ; mm4=data3=(03 13 23 33)
+    packssdw    mm1, mm2                ; mm1=data4=(04 14 24 34)
+
+    movq        mm0, MMWORD [wk(0)]     ; mm0=(00 10 20 30 06 16 26 36)
+    movq        mm3, MMWORD [wk(1)]     ; mm3=(01 11 21 31 07 17 27 37)
+
+    packsswb    mm6, mm1                ; mm6=(02 12 22 32 04 14 24 34)
+    packsswb    mm4, mm7                ; mm4=(03 13 23 33 05 15 25 35)
+
+    paddb       mm0, mm5
+    paddb       mm3, mm5
+    paddb       mm6, mm5
+    paddb       mm4, mm5
+
+    movq        mm2, mm0                ; transpose coefficients(phase 1)
+    punpcklbw   mm0, mm3                ; mm0=(00 01 10 11 20 21 30 31)
+    punpckhbw   mm2, mm3                ; mm2=(06 07 16 17 26 27 36 37)
+    movq        mm1, mm6                ; transpose coefficients(phase 1)
+    punpcklbw   mm6, mm4                ; mm6=(02 03 12 13 22 23 32 33)
+    punpckhbw   mm1, mm4                ; mm1=(04 05 14 15 24 25 34 35)
+
+    movq        mm7, mm0                ; transpose coefficients(phase 2)
+    punpcklwd   mm0, mm6                ; mm0=(00 01 02 03 10 11 12 13)
+    punpckhwd   mm7, mm6                ; mm7=(20 21 22 23 30 31 32 33)
+    movq        mm5, mm1                ; transpose coefficients(phase 2)
+    punpcklwd   mm1, mm2                ; mm1=(04 05 06 07 14 15 16 17)
+    punpckhwd   mm5, mm2                ; mm5=(24 25 26 27 34 35 36 37)
+
+    movq        mm3, mm0                ; transpose coefficients(phase 3)
+    punpckldq   mm0, mm1                ; mm0=(00 01 02 03 04 05 06 07)
+    punpckhdq   mm3, mm1                ; mm3=(10 11 12 13 14 15 16 17)
+    movq        mm4, mm7                ; transpose coefficients(phase 3)
+    punpckldq   mm7, mm5                ; mm7=(20 21 22 23 24 25 26 27)
+    punpckhdq   mm4, mm5                ; mm4=(30 31 32 33 34 35 36 37)
+
+    pushpic     ebx                     ; save GOT address
+
+    mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+    mov         ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+    movq        MMWORD [edx+eax*SIZEOF_JSAMPLE], mm0
+    movq        MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm3
+    mov         edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
+    mov         ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
+    movq        MMWORD [edx+eax*SIZEOF_JSAMPLE], mm7
+    movq        MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm4
+
+    poppic      ebx                     ; restore GOT address
+
+    add         esi, byte 4*SIZEOF_JCOEF     ; wsptr
+    add         edi, byte 4*SIZEOF_JSAMPROW
+    dec         ecx                          ; ctr
+    jnz         near .rowloop
+
+    emms                                ; empty MMX state
+
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/i386/jidctint-sse2.asm b/media/libjpeg/simd/i386/jidctint-sse2.asm
new file mode 100644
index 0000000000..43e320189b
--- /dev/null
+++ b/media/libjpeg/simd/i386/jidctint-sse2.asm
@@ -0,0 +1,858 @@
+;
+; jidctint.asm - accurate integer IDCT (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, 2020, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a slower but more accurate integer implementation of the
+; inverse DCT (Discrete Cosine Transform). The following code is based
+; directly on the IJG's original jidctint.c; see the jidctint.c for
+; more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS  13
+%define PASS1_BITS  2
+
+%define DESCALE_P1  (CONST_BITS - PASS1_BITS)
+%define DESCALE_P2  (CONST_BITS + PASS1_BITS + 3)
+
+%if CONST_BITS == 13
+F_0_298 equ  2446  ; FIX(0.298631336)
+F_0_390 equ  3196  ; FIX(0.390180644)
+F_0_541 equ  4433  ; FIX(0.541196100)
+F_0_765 equ  6270  ; FIX(0.765366865)
+F_0_899 equ  7373  ; FIX(0.899976223)
+F_1_175 equ  9633  ; FIX(1.175875602)
+F_1_501 equ 12299  ; FIX(1.501321110)
+F_1_847 equ 15137  ; FIX(1.847759065)
+F_1_961 equ 16069  ; FIX(1.961570560)
+F_2_053 equ 16819  ; FIX(2.053119869)
+F_2_562 equ 20995  ; FIX(2.562915447)
+F_3_072 equ 25172  ; FIX(3.072711026)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x, n)  (((x) + (1 << ((n) - 1))) >> (n))
+F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS)  ; FIX(0.298631336)
+F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS)  ; FIX(0.390180644)
+F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS)  ; FIX(0.541196100)
+F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS)  ; FIX(0.765366865)
+F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS)  ; FIX(0.899976223)
+F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS)  ; FIX(1.175875602)
+F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS)  ; FIX(1.501321110)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS)  ; FIX(1.847759065)
+F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS)  ; FIX(1.961570560)
+F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS)  ; FIX(2.053119869)
+F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS)  ; FIX(2.562915447)
+F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS)  ; FIX(3.072711026)
+%endif
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_idct_islow_sse2)
+
+EXTN(jconst_idct_islow_sse2):
+
+PW_F130_F054   times 4  dw  (F_0_541 + F_0_765),  F_0_541
+PW_F054_MF130  times 4  dw  F_0_541, (F_0_541 - F_1_847)
+PW_MF078_F117  times 4  dw  (F_1_175 - F_1_961),  F_1_175
+PW_F117_F078   times 4  dw  F_1_175, (F_1_175 - F_0_390)
+PW_MF060_MF089 times 4  dw  (F_0_298 - F_0_899), -F_0_899
+PW_MF089_F060  times 4  dw -F_0_899, (F_1_501 - F_0_899)
+PW_MF050_MF256 times 4  dw  (F_2_053 - F_2_562), -F_2_562
+PW_MF256_F050  times 4  dw -F_2_562, (F_3_072 - F_2_562)
+PD_DESCALE_P1  times 4  dd  1 << (DESCALE_P1 - 1)
+PD_DESCALE_P2  times 4  dd  1 << (DESCALE_P2 - 1)
+PB_CENTERJSAMP times 16 db  CENTERJSAMPLE
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_islow_sse2(void *dct_table, JCOEFPTR coef_block,
+;                       JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define dct_table(b)   (b) + 8          ; jpeg_component_info *compptr
+%define coef_block(b)  (b) + 12         ; JCOEFPTR coef_block
+%define output_buf(b)  (b) + 16         ; JSAMPARRAY output_buf
+%define output_col(b)  (b) + 20         ; JDIMENSION output_col
+
+%define original_ebp   ebp + 0
+%define wk(i)          ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+                                        ; xmmword wk[WK_NUM]
+%define WK_NUM         12
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_idct_islow_sse2)
+
+EXTN(jsimd_idct_islow_sse2):
+    push        ebp
+    mov         eax, esp                     ; eax = original ebp
+    sub         esp, byte 4
+    and         esp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [esp], eax
+    mov         ebp, esp                     ; ebp = aligned ebp
+    lea         esp, [wk(0)]
+    pushpic     ebx
+;   push        ecx                     ; unused
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+
+    ; ---- Pass 1: process columns from input.
+
+;   mov         eax, [original_ebp]
+    mov         edx, POINTER [dct_table(eax)]    ; quantptr
+    mov         esi, JCOEFPTR [coef_block(eax)]  ; inptr
+
+%ifndef NO_ZERO_COLUMN_TEST_ISLOW_SSE2
+    mov         eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    or          eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    jnz         near .columnDCT
+
+    movdqa      xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    por         xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+    por         xmm1, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+    por         xmm0, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+    por         xmm1, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+    por         xmm0, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+    por         xmm1, xmm0
+    packsswb    xmm1, xmm1
+    packsswb    xmm1, xmm1
+    movd        eax, xmm1
+    test        eax, eax
+    jnz         short .columnDCT
+
+    ; -- AC terms all zero
+
+    movdqa      xmm5, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+    pmullw      xmm5, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    psllw       xmm5, PASS1_BITS
+
+    movdqa      xmm4, xmm5              ; xmm5=in0=(00 01 02 03 04 05 06 07)
+    punpcklwd   xmm5, xmm5              ; xmm5=(00 00 01 01 02 02 03 03)
+    punpckhwd   xmm4, xmm4              ; xmm4=(04 04 05 05 06 06 07 07)
+
+    pshufd      xmm7, xmm5, 0x00        ; xmm7=col0=(00 00 00 00 00 00 00 00)
+    pshufd      xmm6, xmm5, 0x55        ; xmm6=col1=(01 01 01 01 01 01 01 01)
+    pshufd      xmm1, xmm5, 0xAA        ; xmm1=col2=(02 02 02 02 02 02 02 02)
+    pshufd      xmm5, xmm5, 0xFF        ; xmm5=col3=(03 03 03 03 03 03 03 03)
+    pshufd      xmm0, xmm4, 0x00        ; xmm0=col4=(04 04 04 04 04 04 04 04)
+    pshufd      xmm3, xmm4, 0x55        ; xmm3=col5=(05 05 05 05 05 05 05 05)
+    pshufd      xmm2, xmm4, 0xAA        ; xmm2=col6=(06 06 06 06 06 06 06 06)
+    pshufd      xmm4, xmm4, 0xFF        ; xmm4=col7=(07 07 07 07 07 07 07 07)
+
+    movdqa      XMMWORD [wk(8)], xmm6   ; wk(8)=col1
+    movdqa      XMMWORD [wk(9)], xmm5   ; wk(9)=col3
+    movdqa      XMMWORD [wk(10)], xmm3  ; wk(10)=col5
+    movdqa      XMMWORD [wk(11)], xmm4  ; wk(11)=col7
+    jmp         near .column_end
+    alignx      16, 7
+%endif
+.columnDCT:
+
+    ; -- Even part
+
+    movdqa      xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    pmullw      xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    movdqa      xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+    movdqa      xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+    pmullw      xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    ; (Original)
+    ; z1 = (z2 + z3) * 0.541196100;
+    ; tmp2 = z1 + z3 * -1.847759065;
+    ; tmp3 = z1 + z2 * 0.765366865;
+    ;
+    ; (This implementation)
+    ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
+    ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
+
+    movdqa      xmm4, xmm1              ; xmm1=in2=z2
+    movdqa      xmm5, xmm1
+    punpcklwd   xmm4, xmm3              ; xmm3=in6=z3
+    punpckhwd   xmm5, xmm3
+    movdqa      xmm1, xmm4
+    movdqa      xmm3, xmm5
+    pmaddwd     xmm4, [GOTOFF(ebx,PW_F130_F054)]   ; xmm4=tmp3L
+    pmaddwd     xmm5, [GOTOFF(ebx,PW_F130_F054)]   ; xmm5=tmp3H
+    pmaddwd     xmm1, [GOTOFF(ebx,PW_F054_MF130)]  ; xmm1=tmp2L
+    pmaddwd     xmm3, [GOTOFF(ebx,PW_F054_MF130)]  ; xmm3=tmp2H
+
+    movdqa      xmm6, xmm0
+    paddw       xmm0, xmm2              ; xmm0=in0+in4
+    psubw       xmm6, xmm2              ; xmm6=in0-in4
+
+    pxor        xmm7, xmm7
+    pxor        xmm2, xmm2
+    punpcklwd   xmm7, xmm0              ; xmm7=tmp0L
+    punpckhwd   xmm2, xmm0              ; xmm2=tmp0H
+    psrad       xmm7, (16-CONST_BITS)   ; psrad xmm7,16 & pslld xmm7,CONST_BITS
+    psrad       xmm2, (16-CONST_BITS)   ; psrad xmm2,16 & pslld xmm2,CONST_BITS
+
+    movdqa      xmm0, xmm7
+    paddd       xmm7, xmm4              ; xmm7=tmp10L
+    psubd       xmm0, xmm4              ; xmm0=tmp13L
+    movdqa      xmm4, xmm2
+    paddd       xmm2, xmm5              ; xmm2=tmp10H
+    psubd       xmm4, xmm5              ; xmm4=tmp13H
+
+    movdqa      XMMWORD [wk(0)], xmm7   ; wk(0)=tmp10L
+    movdqa      XMMWORD [wk(1)], xmm2   ; wk(1)=tmp10H
+    movdqa      XMMWORD [wk(2)], xmm0   ; wk(2)=tmp13L
+    movdqa      XMMWORD [wk(3)], xmm4   ; wk(3)=tmp13H
+
+    pxor        xmm5, xmm5
+    pxor        xmm7, xmm7
+    punpcklwd   xmm5, xmm6              ; xmm5=tmp1L
+    punpckhwd   xmm7, xmm6              ; xmm7=tmp1H
+    psrad       xmm5, (16-CONST_BITS)   ; psrad xmm5,16 & pslld xmm5,CONST_BITS
+    psrad       xmm7, (16-CONST_BITS)   ; psrad xmm7,16 & pslld xmm7,CONST_BITS
+
+    movdqa      xmm2, xmm5
+    paddd       xmm5, xmm1              ; xmm5=tmp11L
+    psubd       xmm2, xmm1              ; xmm2=tmp12L
+    movdqa      xmm0, xmm7
+    paddd       xmm7, xmm3              ; xmm7=tmp11H
+    psubd       xmm0, xmm3              ; xmm0=tmp12H
+
+    movdqa      XMMWORD [wk(4)], xmm5   ; wk(4)=tmp11L
+    movdqa      XMMWORD [wk(5)], xmm7   ; wk(5)=tmp11H
+    movdqa      XMMWORD [wk(6)], xmm2   ; wk(6)=tmp12L
+    movdqa      XMMWORD [wk(7)], xmm0   ; wk(7)=tmp12H
+
+    ; -- Odd part
+
+    movdqa      xmm4, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    movdqa      xmm6, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+    pmullw      xmm4, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      xmm6, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+    movdqa      xmm3, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+    pmullw      xmm1, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    movdqa      xmm5, xmm6
+    movdqa      xmm7, xmm4
+    paddw       xmm5, xmm3              ; xmm5=z3
+    paddw       xmm7, xmm1              ; xmm7=z4
+
+    ; (Original)
+    ; z5 = (z3 + z4) * 1.175875602;
+    ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+    ; z3 += z5;  z4 += z5;
+    ;
+    ; (This implementation)
+    ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+    ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+    movdqa      xmm2, xmm5
+    movdqa      xmm0, xmm5
+    punpcklwd   xmm2, xmm7
+    punpckhwd   xmm0, xmm7
+    movdqa      xmm5, xmm2
+    movdqa      xmm7, xmm0
+    pmaddwd     xmm2, [GOTOFF(ebx,PW_MF078_F117)]  ; xmm2=z3L
+    pmaddwd     xmm0, [GOTOFF(ebx,PW_MF078_F117)]  ; xmm0=z3H
+    pmaddwd     xmm5, [GOTOFF(ebx,PW_F117_F078)]   ; xmm5=z4L
+    pmaddwd     xmm7, [GOTOFF(ebx,PW_F117_F078)]   ; xmm7=z4H
+
+    movdqa      XMMWORD [wk(10)], xmm2  ; wk(10)=z3L
+    movdqa      XMMWORD [wk(11)], xmm0  ; wk(11)=z3H
+
+    ; (Original)
+    ; z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;
+    ; tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;
+    ; tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;
+    ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+    ; tmp0 += z1 + z3;  tmp1 += z2 + z4;
+    ; tmp2 += z2 + z3;  tmp3 += z1 + z4;
+    ;
+    ; (This implementation)
+    ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+    ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+    ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+    ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+    ; tmp0 += z3;  tmp1 += z4;
+    ; tmp2 += z3;  tmp3 += z4;
+
+    movdqa      xmm2, xmm3
+    movdqa      xmm0, xmm3
+    punpcklwd   xmm2, xmm4
+    punpckhwd   xmm0, xmm4
+    movdqa      xmm3, xmm2
+    movdqa      xmm4, xmm0
+    pmaddwd     xmm2, [GOTOFF(ebx,PW_MF060_MF089)]  ; xmm2=tmp0L
+    pmaddwd     xmm0, [GOTOFF(ebx,PW_MF060_MF089)]  ; xmm0=tmp0H
+    pmaddwd     xmm3, [GOTOFF(ebx,PW_MF089_F060)]   ; xmm3=tmp3L
+    pmaddwd     xmm4, [GOTOFF(ebx,PW_MF089_F060)]   ; xmm4=tmp3H
+
+    paddd       xmm2, XMMWORD [wk(10)]  ; xmm2=tmp0L
+    paddd       xmm0, XMMWORD [wk(11)]  ; xmm0=tmp0H
+    paddd       xmm3, xmm5              ; xmm3=tmp3L
+    paddd       xmm4, xmm7              ; xmm4=tmp3H
+
+    movdqa      XMMWORD [wk(8)], xmm2   ; wk(8)=tmp0L
+    movdqa      XMMWORD [wk(9)], xmm0   ; wk(9)=tmp0H
+
+    movdqa      xmm2, xmm1
+    movdqa      xmm0, xmm1
+    punpcklwd   xmm2, xmm6
+    punpckhwd   xmm0, xmm6
+    movdqa      xmm1, xmm2
+    movdqa      xmm6, xmm0
+    pmaddwd     xmm2, [GOTOFF(ebx,PW_MF050_MF256)]  ; xmm2=tmp1L
+    pmaddwd     xmm0, [GOTOFF(ebx,PW_MF050_MF256)]  ; xmm0=tmp1H
+    pmaddwd     xmm1, [GOTOFF(ebx,PW_MF256_F050)]   ; xmm1=tmp2L
+    pmaddwd     xmm6, [GOTOFF(ebx,PW_MF256_F050)]   ; xmm6=tmp2H
+
+    paddd       xmm2, xmm5              ; xmm2=tmp1L
+    paddd       xmm0, xmm7              ; xmm0=tmp1H
+    paddd       xmm1, XMMWORD [wk(10)]  ; xmm1=tmp2L
+    paddd       xmm6, XMMWORD [wk(11)]  ; xmm6=tmp2H
+
+    movdqa      XMMWORD [wk(10)], xmm2  ; wk(10)=tmp1L
+    movdqa      XMMWORD [wk(11)], xmm0  ; wk(11)=tmp1H
+
+    ; -- Final output stage
+
+    movdqa      xmm5, XMMWORD [wk(0)]   ; xmm5=tmp10L
+    movdqa      xmm7, XMMWORD [wk(1)]   ; xmm7=tmp10H
+
+    movdqa      xmm2, xmm5
+    movdqa      xmm0, xmm7
+    paddd       xmm5, xmm3              ; xmm5=data0L
+    paddd       xmm7, xmm4              ; xmm7=data0H
+    psubd       xmm2, xmm3              ; xmm2=data7L
+    psubd       xmm0, xmm4              ; xmm0=data7H
+
+    movdqa      xmm3, [GOTOFF(ebx,PD_DESCALE_P1)]  ; xmm3=[PD_DESCALE_P1]
+
+    paddd       xmm5, xmm3
+    paddd       xmm7, xmm3
+    psrad       xmm5, DESCALE_P1
+    psrad       xmm7, DESCALE_P1
+    paddd       xmm2, xmm3
+    paddd       xmm0, xmm3
+    psrad       xmm2, DESCALE_P1
+    psrad       xmm0, DESCALE_P1
+
+    packssdw    xmm5, xmm7              ; xmm5=data0=(00 01 02 03 04 05 06 07)
+    packssdw    xmm2, xmm0              ; xmm2=data7=(70 71 72 73 74 75 76 77)
+
+    movdqa      xmm4, XMMWORD [wk(4)]   ; xmm4=tmp11L
+    movdqa      xmm3, XMMWORD [wk(5)]   ; xmm3=tmp11H
+
+    movdqa      xmm7, xmm4
+    movdqa      xmm0, xmm3
+    paddd       xmm4, xmm1              ; xmm4=data1L
+    paddd       xmm3, xmm6              ; xmm3=data1H
+    psubd       xmm7, xmm1              ; xmm7=data6L
+    psubd       xmm0, xmm6              ; xmm0=data6H
+
+    movdqa      xmm1, [GOTOFF(ebx,PD_DESCALE_P1)]  ; xmm1=[PD_DESCALE_P1]
+
+    paddd       xmm4, xmm1
+    paddd       xmm3, xmm1
+    psrad       xmm4, DESCALE_P1
+    psrad       xmm3, DESCALE_P1
+    paddd       xmm7, xmm1
+    paddd       xmm0, xmm1
+    psrad       xmm7, DESCALE_P1
+    psrad       xmm0, DESCALE_P1
+
+    packssdw    xmm4, xmm3              ; xmm4=data1=(10 11 12 13 14 15 16 17)
+    packssdw    xmm7, xmm0              ; xmm7=data6=(60 61 62 63 64 65 66 67)
+
+    movdqa      xmm6, xmm5              ; transpose coefficients(phase 1)
+    punpcklwd   xmm5, xmm4              ; xmm5=(00 10 01 11 02 12 03 13)
+    punpckhwd   xmm6, xmm4              ; xmm6=(04 14 05 15 06 16 07 17)
+    movdqa      xmm1, xmm7              ; transpose coefficients(phase 1)
+    punpcklwd   xmm7, xmm2              ; xmm7=(60 70 61 71 62 72 63 73)
+    punpckhwd   xmm1, xmm2              ; xmm1=(64 74 65 75 66 76 67 77)
+
+    movdqa      xmm3, XMMWORD [wk(6)]   ; xmm3=tmp12L
+    movdqa      xmm0, XMMWORD [wk(7)]   ; xmm0=tmp12H
+    movdqa      xmm4, XMMWORD [wk(10)]  ; xmm4=tmp1L
+    movdqa      xmm2, XMMWORD [wk(11)]  ; xmm2=tmp1H
+
+    movdqa      XMMWORD [wk(0)], xmm5   ; wk(0)=(00 10 01 11 02 12 03 13)
+    movdqa      XMMWORD [wk(1)], xmm6   ; wk(1)=(04 14 05 15 06 16 07 17)
+    movdqa      XMMWORD [wk(4)], xmm7   ; wk(4)=(60 70 61 71 62 72 63 73)
+    movdqa      XMMWORD [wk(5)], xmm1   ; wk(5)=(64 74 65 75 66 76 67 77)
+
+    movdqa      xmm5, xmm3
+    movdqa      xmm6, xmm0
+    paddd       xmm3, xmm4              ; xmm3=data2L
+    paddd       xmm0, xmm2              ; xmm0=data2H
+    psubd       xmm5, xmm4              ; xmm5=data5L
+    psubd       xmm6, xmm2              ; xmm6=data5H
+
+    movdqa      xmm7, [GOTOFF(ebx,PD_DESCALE_P1)]  ; xmm7=[PD_DESCALE_P1]
+
+    paddd       xmm3, xmm7
+    paddd       xmm0, xmm7
+    psrad       xmm3, DESCALE_P1
+    psrad       xmm0, DESCALE_P1
+    paddd       xmm5, xmm7
+    paddd       xmm6, xmm7
+    psrad       xmm5, DESCALE_P1
+    psrad       xmm6, DESCALE_P1
+
+    packssdw    xmm3, xmm0              ; xmm3=data2=(20 21 22 23 24 25 26 27)
+    packssdw    xmm5, xmm6              ; xmm5=data5=(50 51 52 53 54 55 56 57)
+
+    movdqa      xmm1, XMMWORD [wk(2)]   ; xmm1=tmp13L
+    movdqa      xmm4, XMMWORD [wk(3)]   ; xmm4=tmp13H
+    movdqa      xmm2, XMMWORD [wk(8)]   ; xmm2=tmp0L
+    movdqa      xmm7, XMMWORD [wk(9)]   ; xmm7=tmp0H
+
+    movdqa      xmm0, xmm1
+    movdqa      xmm6, xmm4
+    paddd       xmm1, xmm2              ; xmm1=data3L
+    paddd       xmm4, xmm7              ; xmm4=data3H
+    psubd       xmm0, xmm2              ; xmm0=data4L
+    psubd       xmm6, xmm7              ; xmm6=data4H
+
+    movdqa      xmm2, [GOTOFF(ebx,PD_DESCALE_P1)]  ; xmm2=[PD_DESCALE_P1]
+
+    paddd       xmm1, xmm2
+    paddd       xmm4, xmm2
+    psrad       xmm1, DESCALE_P1
+    psrad       xmm4, DESCALE_P1
+    paddd       xmm0, xmm2
+    paddd       xmm6, xmm2
+    psrad       xmm0, DESCALE_P1
+    psrad       xmm6, DESCALE_P1
+
+    packssdw    xmm1, xmm4              ; xmm1=data3=(30 31 32 33 34 35 36 37)
+    packssdw    xmm0, xmm6              ; xmm0=data4=(40 41 42 43 44 45 46 47)
+
+    movdqa      xmm7, XMMWORD [wk(0)]   ; xmm7=(00 10 01 11 02 12 03 13)
+    movdqa      xmm2, XMMWORD [wk(1)]   ; xmm2=(04 14 05 15 06 16 07 17)
+
+    movdqa      xmm4, xmm3              ; transpose coefficients(phase 1)
+    punpcklwd   xmm3, xmm1              ; xmm3=(20 30 21 31 22 32 23 33)
+    punpckhwd   xmm4, xmm1              ; xmm4=(24 34 25 35 26 36 27 37)
+    movdqa      xmm6, xmm0              ; transpose coefficients(phase 1)
+    punpcklwd   xmm0, xmm5              ; xmm0=(40 50 41 51 42 52 43 53)
+    punpckhwd   xmm6, xmm5              ; xmm6=(44 54 45 55 46 56 47 57)
+
+    movdqa      xmm1, xmm7              ; transpose coefficients(phase 2)
+    punpckldq   xmm7, xmm3              ; xmm7=(00 10 20 30 01 11 21 31)
+    punpckhdq   xmm1, xmm3              ; xmm1=(02 12 22 32 03 13 23 33)
+    movdqa      xmm5, xmm2              ; transpose coefficients(phase 2)
+    punpckldq   xmm2, xmm4              ; xmm2=(04 14 24 34 05 15 25 35)
+    punpckhdq   xmm5, xmm4              ; xmm5=(06 16 26 36 07 17 27 37)
+
+    movdqa      xmm3, XMMWORD [wk(4)]   ; xmm3=(60 70 61 71 62 72 63 73)
+    movdqa      xmm4, XMMWORD [wk(5)]   ; xmm4=(64 74 65 75 66 76 67 77)
+
+    movdqa      XMMWORD [wk(6)], xmm2   ; wk(6)=(04 14 24 34 05 15 25 35)
+    movdqa      XMMWORD [wk(7)], xmm5   ; wk(7)=(06 16 26 36 07 17 27 37)
+
+    movdqa      xmm2, xmm0              ; transpose coefficients(phase 2)
+    punpckldq   xmm0, xmm3              ; xmm0=(40 50 60 70 41 51 61 71)
+    punpckhdq   xmm2, xmm3              ; xmm2=(42 52 62 72 43 53 63 73)
+    movdqa      xmm5, xmm6              ; transpose coefficients(phase 2)
+    punpckldq   xmm6, xmm4              ; xmm6=(44 54 64 74 45 55 65 75)
+    punpckhdq   xmm5, xmm4              ; xmm5=(46 56 66 76 47 57 67 77)
+
+    movdqa      xmm3, xmm7              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm7, xmm0              ; xmm7=col0=(00 10 20 30 40 50 60 70)
+    punpckhqdq  xmm3, xmm0              ; xmm3=col1=(01 11 21 31 41 51 61 71)
+    movdqa      xmm4, xmm1              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm1, xmm2              ; xmm1=col2=(02 12 22 32 42 52 62 72)
+    punpckhqdq  xmm4, xmm2              ; xmm4=col3=(03 13 23 33 43 53 63 73)
+
+    movdqa      xmm0, XMMWORD [wk(6)]   ; xmm0=(04 14 24 34 05 15 25 35)
+    movdqa      xmm2, XMMWORD [wk(7)]   ; xmm2=(06 16 26 36 07 17 27 37)
+
+    movdqa      XMMWORD [wk(8)], xmm3   ; wk(8)=col1
+    movdqa      XMMWORD [wk(9)], xmm4   ; wk(9)=col3
+
+    movdqa      xmm3, xmm0              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm0, xmm6              ; xmm0=col4=(04 14 24 34 44 54 64 74)
+    punpckhqdq  xmm3, xmm6              ; xmm3=col5=(05 15 25 35 45 55 65 75)
+    movdqa      xmm4, xmm2              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm2, xmm5              ; xmm2=col6=(06 16 26 36 46 56 66 76)
+    punpckhqdq  xmm4, xmm5              ; xmm4=col7=(07 17 27 37 47 57 67 77)
+
+    movdqa      XMMWORD [wk(10)], xmm3  ; wk(10)=col5
+    movdqa      XMMWORD [wk(11)], xmm4  ; wk(11)=col7
+.column_end:
+
+    ; -- Prefetch the next coefficient block
+
+    prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+    prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+    prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+    prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+    ; ---- Pass 2: process rows from work array, store into output array.
+
+    mov         eax, [original_ebp]
+    mov         edi, JSAMPARRAY [output_buf(eax)]  ; (JSAMPROW *)
+    mov         eax, JDIMENSION [output_col(eax)]
+
+    ; -- Even part
+
+    ; xmm7=col0, xmm1=col2, xmm0=col4, xmm2=col6
+
+    ; (Original)
+    ; z1 = (z2 + z3) * 0.541196100;
+    ; tmp2 = z1 + z3 * -1.847759065;
+    ; tmp3 = z1 + z2 * 0.765366865;
+    ;
+    ; (This implementation)
+    ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
+    ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
+
+    movdqa      xmm6, xmm1              ; xmm1=in2=z2
+    movdqa      xmm5, xmm1
+    punpcklwd   xmm6, xmm2              ; xmm2=in6=z3
+    punpckhwd   xmm5, xmm2
+    movdqa      xmm1, xmm6
+    movdqa      xmm2, xmm5
+    pmaddwd     xmm6, [GOTOFF(ebx,PW_F130_F054)]   ; xmm6=tmp3L
+    pmaddwd     xmm5, [GOTOFF(ebx,PW_F130_F054)]   ; xmm5=tmp3H
+    pmaddwd     xmm1, [GOTOFF(ebx,PW_F054_MF130)]  ; xmm1=tmp2L
+    pmaddwd     xmm2, [GOTOFF(ebx,PW_F054_MF130)]  ; xmm2=tmp2H
+
+    movdqa      xmm3, xmm7
+    paddw       xmm7, xmm0              ; xmm7=in0+in4
+    psubw       xmm3, xmm0              ; xmm3=in0-in4
+
+    pxor        xmm4, xmm4
+    pxor        xmm0, xmm0
+    punpcklwd   xmm4, xmm7              ; xmm4=tmp0L
+    punpckhwd   xmm0, xmm7              ; xmm0=tmp0H
+    psrad       xmm4, (16-CONST_BITS)   ; psrad xmm4,16 & pslld xmm4,CONST_BITS
+    psrad       xmm0, (16-CONST_BITS)   ; psrad xmm0,16 & pslld xmm0,CONST_BITS
+
+    movdqa      xmm7, xmm4
+    paddd       xmm4, xmm6              ; xmm4=tmp10L
+    psubd       xmm7, xmm6              ; xmm7=tmp13L
+    movdqa      xmm6, xmm0
+    paddd       xmm0, xmm5              ; xmm0=tmp10H
+    psubd       xmm6, xmm5              ; xmm6=tmp13H
+
+    movdqa      XMMWORD [wk(0)], xmm4   ; wk(0)=tmp10L
+    movdqa      XMMWORD [wk(1)], xmm0   ; wk(1)=tmp10H
+    movdqa      XMMWORD [wk(2)], xmm7   ; wk(2)=tmp13L
+    movdqa      XMMWORD [wk(3)], xmm6   ; wk(3)=tmp13H
+
+    pxor        xmm5, xmm5
+    pxor        xmm4, xmm4
+    punpcklwd   xmm5, xmm3              ; xmm5=tmp1L
+    punpckhwd   xmm4, xmm3              ; xmm4=tmp1H
+    psrad       xmm5, (16-CONST_BITS)   ; psrad xmm5,16 & pslld xmm5,CONST_BITS
+    psrad       xmm4, (16-CONST_BITS)   ; psrad xmm4,16 & pslld xmm4,CONST_BITS
+
+    movdqa      xmm0, xmm5
+    paddd       xmm5, xmm1              ; xmm5=tmp11L
+    psubd       xmm0, xmm1              ; xmm0=tmp12L
+    movdqa      xmm7, xmm4
+    paddd       xmm4, xmm2              ; xmm4=tmp11H
+    psubd       xmm7, xmm2              ; xmm7=tmp12H
+
+    movdqa      XMMWORD [wk(4)], xmm5   ; wk(4)=tmp11L
+    movdqa      XMMWORD [wk(5)], xmm4   ; wk(5)=tmp11H
+    movdqa      XMMWORD [wk(6)], xmm0   ; wk(6)=tmp12L
+    movdqa      XMMWORD [wk(7)], xmm7   ; wk(7)=tmp12H
+
+    ; -- Odd part
+
+    movdqa      xmm6, XMMWORD [wk(9)]   ; xmm6=col3
+    movdqa      xmm3, XMMWORD [wk(8)]   ; xmm3=col1
+    movdqa      xmm1, XMMWORD [wk(11)]  ; xmm1=col7
+    movdqa      xmm2, XMMWORD [wk(10)]  ; xmm2=col5
+
+    movdqa      xmm5, xmm6
+    movdqa      xmm4, xmm3
+    paddw       xmm5, xmm1              ; xmm5=z3
+    paddw       xmm4, xmm2              ; xmm4=z4
+
+    ; (Original)
+    ; z5 = (z3 + z4) * 1.175875602;
+    ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+    ; z3 += z5;  z4 += z5;
+    ;
+    ; (This implementation)
+    ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+    ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+    movdqa      xmm0, xmm5
+    movdqa      xmm7, xmm5
+    punpcklwd   xmm0, xmm4
+    punpckhwd   xmm7, xmm4
+    movdqa      xmm5, xmm0
+    movdqa      xmm4, xmm7
+    pmaddwd     xmm0, [GOTOFF(ebx,PW_MF078_F117)]  ; xmm0=z3L
+    pmaddwd     xmm7, [GOTOFF(ebx,PW_MF078_F117)]  ; xmm7=z3H
+    pmaddwd     xmm5, [GOTOFF(ebx,PW_F117_F078)]   ; xmm5=z4L
+    pmaddwd     xmm4, [GOTOFF(ebx,PW_F117_F078)]   ; xmm4=z4H
+
+    movdqa      XMMWORD [wk(10)], xmm0  ; wk(10)=z3L
+    movdqa      XMMWORD [wk(11)], xmm7  ; wk(11)=z3H
+
+    ; (Original)
+    ; z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;
+    ; tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;
+    ; tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;
+    ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+    ; tmp0 += z1 + z3;  tmp1 += z2 + z4;
+    ; tmp2 += z2 + z3;  tmp3 += z1 + z4;
+    ;
+    ; (This implementation)
+    ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+    ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+    ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+    ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+    ; tmp0 += z3;  tmp1 += z4;
+    ; tmp2 += z3;  tmp3 += z4;
+
+    movdqa      xmm0, xmm1
+    movdqa      xmm7, xmm1
+    punpcklwd   xmm0, xmm3
+    punpckhwd   xmm7, xmm3
+    movdqa      xmm1, xmm0
+    movdqa      xmm3, xmm7
+    pmaddwd     xmm0, [GOTOFF(ebx,PW_MF060_MF089)]  ; xmm0=tmp0L
+    pmaddwd     xmm7, [GOTOFF(ebx,PW_MF060_MF089)]  ; xmm7=tmp0H
+    pmaddwd     xmm1, [GOTOFF(ebx,PW_MF089_F060)]   ; xmm1=tmp3L
+    pmaddwd     xmm3, [GOTOFF(ebx,PW_MF089_F060)]   ; xmm3=tmp3H
+
+    paddd       xmm0, XMMWORD [wk(10)]  ; xmm0=tmp0L
+    paddd       xmm7, XMMWORD [wk(11)]  ; xmm7=tmp0H
+    paddd       xmm1, xmm5              ; xmm1=tmp3L
+    paddd       xmm3, xmm4              ; xmm3=tmp3H
+
+    movdqa      XMMWORD [wk(8)], xmm0   ; wk(8)=tmp0L
+    movdqa      XMMWORD [wk(9)], xmm7   ; wk(9)=tmp0H
+
+    movdqa      xmm0, xmm2
+    movdqa      xmm7, xmm2
+    punpcklwd   xmm0, xmm6
+    punpckhwd   xmm7, xmm6
+    movdqa      xmm2, xmm0
+    movdqa      xmm6, xmm7
+    pmaddwd     xmm0, [GOTOFF(ebx,PW_MF050_MF256)]  ; xmm0=tmp1L
+    pmaddwd     xmm7, [GOTOFF(ebx,PW_MF050_MF256)]  ; xmm7=tmp1H
+    pmaddwd     xmm2, [GOTOFF(ebx,PW_MF256_F050)]   ; xmm2=tmp2L
+    pmaddwd     xmm6, [GOTOFF(ebx,PW_MF256_F050)]   ; xmm6=tmp2H
+
+    paddd       xmm0, xmm5              ; xmm0=tmp1L
+    paddd       xmm7, xmm4              ; xmm7=tmp1H
+    paddd       xmm2, XMMWORD [wk(10)]  ; xmm2=tmp2L
+    paddd       xmm6, XMMWORD [wk(11)]  ; xmm6=tmp2H
+
+    movdqa      XMMWORD [wk(10)], xmm0  ; wk(10)=tmp1L
+    movdqa      XMMWORD [wk(11)], xmm7  ; wk(11)=tmp1H
+
+    ; -- Final output stage
+
+    movdqa      xmm5, XMMWORD [wk(0)]   ; xmm5=tmp10L
+    movdqa      xmm4, XMMWORD [wk(1)]   ; xmm4=tmp10H
+
+    movdqa      xmm0, xmm5
+    movdqa      xmm7, xmm4
+    paddd       xmm5, xmm1              ; xmm5=data0L
+    paddd       xmm4, xmm3              ; xmm4=data0H
+    psubd       xmm0, xmm1              ; xmm0=data7L
+    psubd       xmm7, xmm3              ; xmm7=data7H
+
+    movdqa      xmm1, [GOTOFF(ebx,PD_DESCALE_P2)]  ; xmm1=[PD_DESCALE_P2]
+
+    paddd       xmm5, xmm1
+    paddd       xmm4, xmm1
+    psrad       xmm5, DESCALE_P2
+    psrad       xmm4, DESCALE_P2
+    paddd       xmm0, xmm1
+    paddd       xmm7, xmm1
+    psrad       xmm0, DESCALE_P2
+    psrad       xmm7, DESCALE_P2
+
+    packssdw    xmm5, xmm4              ; xmm5=data0=(00 10 20 30 40 50 60 70)
+    packssdw    xmm0, xmm7              ; xmm0=data7=(07 17 27 37 47 57 67 77)
+
+    movdqa      xmm3, XMMWORD [wk(4)]   ; xmm3=tmp11L
+    movdqa      xmm1, XMMWORD [wk(5)]   ; xmm1=tmp11H
+
+    movdqa      xmm4, xmm3
+    movdqa      xmm7, xmm1
+    paddd       xmm3, xmm2              ; xmm3=data1L
+    paddd       xmm1, xmm6              ; xmm1=data1H
+    psubd       xmm4, xmm2              ; xmm4=data6L
+    psubd       xmm7, xmm6              ; xmm7=data6H
+
+    movdqa      xmm2, [GOTOFF(ebx,PD_DESCALE_P2)]  ; xmm2=[PD_DESCALE_P2]
+
+    paddd       xmm3, xmm2
+    paddd       xmm1, xmm2
+    psrad       xmm3, DESCALE_P2
+    psrad       xmm1, DESCALE_P2
+    paddd       xmm4, xmm2
+    paddd       xmm7, xmm2
+    psrad       xmm4, DESCALE_P2
+    psrad       xmm7, DESCALE_P2
+
+    packssdw    xmm3, xmm1              ; xmm3=data1=(01 11 21 31 41 51 61 71)
+    packssdw    xmm4, xmm7              ; xmm4=data6=(06 16 26 36 46 56 66 76)
+
+    packsswb    xmm5, xmm4              ; xmm5=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
+    packsswb    xmm3, xmm0              ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
+
+    movdqa      xmm6, XMMWORD [wk(6)]   ; xmm6=tmp12L
+    movdqa      xmm2, XMMWORD [wk(7)]   ; xmm2=tmp12H
+    movdqa      xmm1, XMMWORD [wk(10)]  ; xmm1=tmp1L
+    movdqa      xmm7, XMMWORD [wk(11)]  ; xmm7=tmp1H
+
+    movdqa      XMMWORD [wk(0)], xmm5   ; wk(0)=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
+    movdqa      XMMWORD [wk(1)], xmm3   ; wk(1)=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
+
+    movdqa      xmm4, xmm6
+    movdqa      xmm0, xmm2
+    paddd       xmm6, xmm1              ; xmm6=data2L
+    paddd       xmm2, xmm7              ; xmm2=data2H
+    psubd       xmm4, xmm1              ; xmm4=data5L
+    psubd       xmm0, xmm7              ; xmm0=data5H
+
+    movdqa      xmm5, [GOTOFF(ebx,PD_DESCALE_P2)]  ; xmm5=[PD_DESCALE_P2]
+
+    paddd       xmm6, xmm5
+    paddd       xmm2, xmm5
+    psrad       xmm6, DESCALE_P2
+    psrad       xmm2, DESCALE_P2
+    paddd       xmm4, xmm5
+    paddd       xmm0, xmm5
+    psrad       xmm4, DESCALE_P2
+    psrad       xmm0, DESCALE_P2
+
+    packssdw    xmm6, xmm2              ; xmm6=data2=(02 12 22 32 42 52 62 72)
+    packssdw    xmm4, xmm0              ; xmm4=data5=(05 15 25 35 45 55 65 75)
+
+    movdqa      xmm3, XMMWORD [wk(2)]   ; xmm3=tmp13L
+    movdqa      xmm1, XMMWORD [wk(3)]   ; xmm1=tmp13H
+    movdqa      xmm7, XMMWORD [wk(8)]   ; xmm7=tmp0L
+    movdqa      xmm5, XMMWORD [wk(9)]   ; xmm5=tmp0H
+
+    movdqa      xmm2, xmm3
+    movdqa      xmm0, xmm1
+    paddd       xmm3, xmm7              ; xmm3=data3L
+    paddd       xmm1, xmm5              ; xmm1=data3H
+    psubd       xmm2, xmm7              ; xmm2=data4L
+    psubd       xmm0, xmm5              ; xmm0=data4H
+
+    movdqa      xmm7, [GOTOFF(ebx,PD_DESCALE_P2)]  ; xmm7=[PD_DESCALE_P2]
+
+    paddd       xmm3, xmm7
+    paddd       xmm1, xmm7
+    psrad       xmm3, DESCALE_P2
+    psrad       xmm1, DESCALE_P2
+    paddd       xmm2, xmm7
+    paddd       xmm0, xmm7
+    psrad       xmm2, DESCALE_P2
+    psrad       xmm0, DESCALE_P2
+
+    movdqa      xmm5, [GOTOFF(ebx,PB_CENTERJSAMP)]  ; xmm5=[PB_CENTERJSAMP]
+
+    packssdw    xmm3, xmm1             ; xmm3=data3=(03 13 23 33 43 53 63 73)
+    packssdw    xmm2, xmm0             ; xmm2=data4=(04 14 24 34 44 54 64 74)
+
+    movdqa      xmm7, XMMWORD [wk(0)]  ; xmm7=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
+    movdqa      xmm1, XMMWORD [wk(1)]  ; xmm1=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
+
+    packsswb    xmm6, xmm2             ; xmm6=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
+    packsswb    xmm3, xmm4             ; xmm3=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)
+
+    paddb       xmm7, xmm5
+    paddb       xmm1, xmm5
+    paddb       xmm6, xmm5
+    paddb       xmm3, xmm5
+
+    movdqa      xmm0, xmm7        ; transpose coefficients(phase 1)
+    punpcklbw   xmm7, xmm1        ; xmm7=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
+    punpckhbw   xmm0, xmm1        ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
+    movdqa      xmm2, xmm6        ; transpose coefficients(phase 1)
+    punpcklbw   xmm6, xmm3        ; xmm6=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
+    punpckhbw   xmm2, xmm3        ; xmm2=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)
+
+    movdqa      xmm4, xmm7        ; transpose coefficients(phase 2)
+    punpcklwd   xmm7, xmm6        ; xmm7=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
+    punpckhwd   xmm4, xmm6        ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
+    movdqa      xmm5, xmm2        ; transpose coefficients(phase 2)
+    punpcklwd   xmm2, xmm0        ; xmm2=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
+    punpckhwd   xmm5, xmm0        ; xmm5=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)
+
+    movdqa      xmm1, xmm7        ; transpose coefficients(phase 3)
+    punpckldq   xmm7, xmm2        ; xmm7=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
+    punpckhdq   xmm1, xmm2        ; xmm1=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
+    movdqa      xmm3, xmm4        ; transpose coefficients(phase 3)
+    punpckldq   xmm4, xmm5        ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
+    punpckhdq   xmm3, xmm5        ; xmm3=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
+
+    pshufd      xmm6, xmm7, 0x4E  ; xmm6=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
+    pshufd      xmm0, xmm1, 0x4E  ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
+    pshufd      xmm2, xmm4, 0x4E  ; xmm2=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
+    pshufd      xmm5, xmm3, 0x4E  ; xmm5=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
+
+    mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+    mov         esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
+    movq        XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm7
+    movq        XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm1
+    mov         edx, JSAMPROW [edi+4*SIZEOF_JSAMPROW]
+    mov         esi, JSAMPROW [edi+6*SIZEOF_JSAMPROW]
+    movq        XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm4
+    movq        XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm3
+
+    mov         edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+    mov         esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
+    movq        XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6
+    movq        XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm0
+    mov         edx, JSAMPROW [edi+5*SIZEOF_JSAMPROW]
+    mov         esi, JSAMPROW [edi+7*SIZEOF_JSAMPROW]
+    movq        XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm2
+    movq        XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm5
+
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; unused
+    poppic      ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/i386/jidctred-mmx.asm b/media/libjpeg/simd/i386/jidctred-mmx.asm
new file mode 100644
index 0000000000..e2307e1cb6
--- /dev/null
+++ b/media/libjpeg/simd/i386/jidctred-mmx.asm
@@ -0,0 +1,704 @@
+;
+; jidctred.asm - reduced-size IDCT (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains inverse-DCT routines that produce reduced-size
+; output: either 4x4 or 2x2 pixels from an 8x8 DCT block.
+; The following code is based directly on the IJG's original jidctred.c;
+; see the jidctred.c for more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS    13
+%define PASS1_BITS    2
+
+%define DESCALE_P1_4  (CONST_BITS - PASS1_BITS + 1)
+%define DESCALE_P2_4  (CONST_BITS + PASS1_BITS + 3 + 1)
+%define DESCALE_P1_2  (CONST_BITS - PASS1_BITS + 2)
+%define DESCALE_P2_2  (CONST_BITS + PASS1_BITS + 3 + 2)
+
+%if CONST_BITS == 13
+F_0_211 equ  1730  ; FIX(0.211164243)
+F_0_509 equ  4176  ; FIX(0.509795579)
+F_0_601 equ  4926  ; FIX(0.601344887)
+F_0_720 equ  5906  ; FIX(0.720959822)
+F_0_765 equ  6270  ; FIX(0.765366865)
+F_0_850 equ  6967  ; FIX(0.850430095)
+F_0_899 equ  7373  ; FIX(0.899976223)
+F_1_061 equ  8697  ; FIX(1.061594337)
+F_1_272 equ 10426  ; FIX(1.272758580)
+F_1_451 equ 11893  ; FIX(1.451774981)
+F_1_847 equ 15137  ; FIX(1.847759065)
+F_2_172 equ 17799  ; FIX(2.172734803)
+F_2_562 equ 20995  ; FIX(2.562915447)
+F_3_624 equ 29692  ; FIX(3.624509785)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x, n)  (((x) + (1 << ((n) - 1))) >> (n))
+F_0_211 equ DESCALE( 226735879, 30 - CONST_BITS)  ; FIX(0.211164243)
+F_0_509 equ DESCALE( 547388834, 30 - CONST_BITS)  ; FIX(0.509795579)
+F_0_601 equ DESCALE( 645689155, 30 - CONST_BITS)  ; FIX(0.601344887)
+F_0_720 equ DESCALE( 774124714, 30 - CONST_BITS)  ; FIX(0.720959822)
+F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS)  ; FIX(0.765366865)
+F_0_850 equ DESCALE( 913142361, 30 - CONST_BITS)  ; FIX(0.850430095)
+F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS)  ; FIX(0.899976223)
+F_1_061 equ DESCALE(1139878239, 30 - CONST_BITS)  ; FIX(1.061594337)
+F_1_272 equ DESCALE(1366614119, 30 - CONST_BITS)  ; FIX(1.272758580)
+F_1_451 equ DESCALE(1558831516, 30 - CONST_BITS)  ; FIX(1.451774981)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS)  ; FIX(1.847759065)
+F_2_172 equ DESCALE(2332956230, 30 - CONST_BITS)  ; FIX(2.172734803)
+F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS)  ; FIX(2.562915447)
+F_3_624 equ DESCALE(3891787747, 30 - CONST_BITS)  ; FIX(3.624509785)
+%endif
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_idct_red_mmx)
+
+EXTN(jconst_idct_red_mmx):
+
+PW_F184_MF076   times 2 dw  F_1_847, -F_0_765
+PW_F256_F089    times 2 dw  F_2_562,  F_0_899
+PW_F106_MF217   times 2 dw  F_1_061, -F_2_172
+PW_MF060_MF050  times 2 dw -F_0_601, -F_0_509
+PW_F145_MF021   times 2 dw  F_1_451, -F_0_211
+PW_F362_MF127   times 2 dw  F_3_624, -F_1_272
+PW_F085_MF072   times 2 dw  F_0_850, -F_0_720
+PD_DESCALE_P1_4 times 2 dd  1 << (DESCALE_P1_4 - 1)
+PD_DESCALE_P2_4 times 2 dd  1 << (DESCALE_P2_4 - 1)
+PD_DESCALE_P1_2 times 2 dd  1 << (DESCALE_P1_2 - 1)
+PD_DESCALE_P2_2 times 2 dd  1 << (DESCALE_P2_2 - 1)
+PB_CENTERJSAMP  times 8 db  CENTERJSAMPLE
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Perform dequantization and inverse DCT on one block of coefficients,
+; producing a reduced-size 4x4 output block.
+;
+; GLOBAL(void)
+; jsimd_idct_4x4_mmx(void *dct_table, JCOEFPTR coef_block,
+;                    JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define dct_table(b)   (b) + 8          ; void *dct_table
+%define coef_block(b)  (b) + 12         ; JCOEFPTR coef_block
+%define output_buf(b)  (b) + 16         ; JSAMPARRAY output_buf
+%define output_col(b)  (b) + 20         ; JDIMENSION output_col
+
+%define original_ebp   ebp + 0
+%define wk(i)          ebp - (WK_NUM - (i)) * SIZEOF_MMWORD
+                                        ; mmword wk[WK_NUM]
+%define WK_NUM         2
+%define workspace      wk(0) - DCTSIZE2 * SIZEOF_JCOEF
+                                        ; JCOEF workspace[DCTSIZE2]
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_idct_4x4_mmx)
+
+EXTN(jsimd_idct_4x4_mmx):
+    push        ebp
+    mov         eax, esp                    ; eax = original ebp
+    sub         esp, byte 4
+    and         esp, byte (-SIZEOF_MMWORD)  ; align to 64 bits
+    mov         [esp], eax
+    mov         ebp, esp                    ; ebp = aligned ebp
+    lea         esp, [workspace]
+    pushpic     ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+
+    ; ---- Pass 1: process columns from input, store into work array.
+
+;   mov         eax, [original_ebp]
+    mov         edx, POINTER [dct_table(eax)]    ; quantptr
+    mov         esi, JCOEFPTR [coef_block(eax)]  ; inptr
+    lea         edi, [workspace]                 ; JCOEF *wsptr
+    mov         ecx, DCTSIZE/4                   ; ctr
+    alignx      16, 7
+.columnloop:
+%ifndef NO_ZERO_COLUMN_TEST_4X4_MMX
+    mov         eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    or          eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    jnz         short .columnDCT
+
+    movq        mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    movq        mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    por         mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+    por         mm1, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+    por         mm0, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+    por         mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+    por         mm0, mm1
+    packsswb    mm0, mm0
+    movd        eax, mm0
+    test        eax, eax
+    jnz         short .columnDCT
+
+    ; -- AC terms all zero
+
+    movq        mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+    pmullw      mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    psllw       mm0, PASS1_BITS
+
+    movq        mm2, mm0                ; mm0=in0=(00 01 02 03)
+    punpcklwd   mm0, mm0                ; mm0=(00 00 01 01)
+    punpckhwd   mm2, mm2                ; mm2=(02 02 03 03)
+
+    movq        mm1, mm0
+    punpckldq   mm0, mm0                ; mm0=(00 00 00 00)
+    punpckhdq   mm1, mm1                ; mm1=(01 01 01 01)
+    movq        mm3, mm2
+    punpckldq   mm2, mm2                ; mm2=(02 02 02 02)
+    punpckhdq   mm3, mm3                ; mm3=(03 03 03 03)
+
+    movq        MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
+    movq        MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1
+    movq        MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2
+    movq        MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
+    jmp         near .nextcolumn
+    alignx      16, 7
+%endif
+.columnDCT:
+
+    ; -- Odd part
+
+    movq        mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    movq        mm1, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+    pmullw      mm0, MMWORD [MMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    movq        mm2, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+    movq        mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+    pmullw      mm2, MMWORD [MMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    movq        mm4, mm0
+    movq        mm5, mm0
+    punpcklwd   mm4, mm1
+    punpckhwd   mm5, mm1
+    movq        mm0, mm4
+    movq        mm1, mm5
+    pmaddwd     mm4, [GOTOFF(ebx,PW_F256_F089)]   ; mm4=(tmp2L)
+    pmaddwd     mm5, [GOTOFF(ebx,PW_F256_F089)]   ; mm5=(tmp2H)
+    pmaddwd     mm0, [GOTOFF(ebx,PW_F106_MF217)]  ; mm0=(tmp0L)
+    pmaddwd     mm1, [GOTOFF(ebx,PW_F106_MF217)]  ; mm1=(tmp0H)
+
+    movq        mm6, mm2
+    movq        mm7, mm2
+    punpcklwd   mm6, mm3
+    punpckhwd   mm7, mm3
+    movq        mm2, mm6
+    movq        mm3, mm7
+    pmaddwd     mm6, [GOTOFF(ebx,PW_MF060_MF050)]  ; mm6=(tmp2L)
+    pmaddwd     mm7, [GOTOFF(ebx,PW_MF060_MF050)]  ; mm7=(tmp2H)
+    pmaddwd     mm2, [GOTOFF(ebx,PW_F145_MF021)]   ; mm2=(tmp0L)
+    pmaddwd     mm3, [GOTOFF(ebx,PW_F145_MF021)]   ; mm3=(tmp0H)
+
+    paddd       mm6, mm4                ; mm6=tmp2L
+    paddd       mm7, mm5                ; mm7=tmp2H
+    paddd       mm2, mm0                ; mm2=tmp0L
+    paddd       mm3, mm1                ; mm3=tmp0H
+
+    movq        MMWORD [wk(0)], mm2     ; wk(0)=tmp0L
+    movq        MMWORD [wk(1)], mm3     ; wk(1)=tmp0H
+
+    ; -- Even part
+
+    movq        mm4, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+    movq        mm5, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    movq        mm0, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+    pmullw      mm4, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      mm5, MMWORD [MMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      mm0, MMWORD [MMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    pxor        mm1, mm1
+    pxor        mm2, mm2
+    punpcklwd   mm1, mm4                ; mm1=tmp0L
+    punpckhwd   mm2, mm4                ; mm2=tmp0H
+    psrad       mm1, (16-CONST_BITS-1)  ; psrad mm1,16 & pslld mm1,CONST_BITS+1
+    psrad       mm2, (16-CONST_BITS-1)  ; psrad mm2,16 & pslld mm2,CONST_BITS+1
+
+    movq        mm3, mm5                ; mm5=in2=z2
+    punpcklwd   mm5, mm0                ; mm0=in6=z3
+    punpckhwd   mm3, mm0
+    pmaddwd     mm5, [GOTOFF(ebx,PW_F184_MF076)]  ; mm5=tmp2L
+    pmaddwd     mm3, [GOTOFF(ebx,PW_F184_MF076)]  ; mm3=tmp2H
+
+    movq        mm4, mm1
+    movq        mm0, mm2
+    paddd       mm1, mm5                ; mm1=tmp10L
+    paddd       mm2, mm3                ; mm2=tmp10H
+    psubd       mm4, mm5                ; mm4=tmp12L
+    psubd       mm0, mm3                ; mm0=tmp12H
+
+    ; -- Final output stage
+
+    movq        mm5, mm1
+    movq        mm3, mm2
+    paddd       mm1, mm6                ; mm1=data0L
+    paddd       mm2, mm7                ; mm2=data0H
+    psubd       mm5, mm6                ; mm5=data3L
+    psubd       mm3, mm7                ; mm3=data3H
+
+    movq        mm6, [GOTOFF(ebx,PD_DESCALE_P1_4)]  ; mm6=[PD_DESCALE_P1_4]
+
+    paddd       mm1, mm6
+    paddd       mm2, mm6
+    psrad       mm1, DESCALE_P1_4
+    psrad       mm2, DESCALE_P1_4
+    paddd       mm5, mm6
+    paddd       mm3, mm6
+    psrad       mm5, DESCALE_P1_4
+    psrad       mm3, DESCALE_P1_4
+
+    packssdw    mm1, mm2                ; mm1=data0=(00 01 02 03)
+    packssdw    mm5, mm3                ; mm5=data3=(30 31 32 33)
+
+    movq        mm7, MMWORD [wk(0)]     ; mm7=tmp0L
+    movq        mm6, MMWORD [wk(1)]     ; mm6=tmp0H
+
+    movq        mm2, mm4
+    movq        mm3, mm0
+    paddd       mm4, mm7                ; mm4=data1L
+    paddd       mm0, mm6                ; mm0=data1H
+    psubd       mm2, mm7                ; mm2=data2L
+    psubd       mm3, mm6                ; mm3=data2H
+
+    movq        mm7, [GOTOFF(ebx,PD_DESCALE_P1_4)]  ; mm7=[PD_DESCALE_P1_4]
+
+    paddd       mm4, mm7
+    paddd       mm0, mm7
+    psrad       mm4, DESCALE_P1_4
+    psrad       mm0, DESCALE_P1_4
+    paddd       mm2, mm7
+    paddd       mm3, mm7
+    psrad       mm2, DESCALE_P1_4
+    psrad       mm3, DESCALE_P1_4
+
+    packssdw    mm4, mm0                ; mm4=data1=(10 11 12 13)
+    packssdw    mm2, mm3                ; mm2=data2=(20 21 22 23)
+
+    movq        mm6, mm1                ; transpose coefficients(phase 1)
+    punpcklwd   mm1, mm4                ; mm1=(00 10 01 11)
+    punpckhwd   mm6, mm4                ; mm6=(02 12 03 13)
+    movq        mm7, mm2                ; transpose coefficients(phase 1)
+    punpcklwd   mm2, mm5                ; mm2=(20 30 21 31)
+    punpckhwd   mm7, mm5                ; mm7=(22 32 23 33)
+
+    movq        mm0, mm1                ; transpose coefficients(phase 2)
+    punpckldq   mm1, mm2                ; mm1=(00 10 20 30)
+    punpckhdq   mm0, mm2                ; mm0=(01 11 21 31)
+    movq        mm3, mm6                ; transpose coefficients(phase 2)
+    punpckldq   mm6, mm7                ; mm6=(02 12 22 32)
+    punpckhdq   mm3, mm7                ; mm3=(03 13 23 33)
+
+    movq        MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm1
+    movq        MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm0
+    movq        MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm6
+    movq        MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
+
+.nextcolumn:
+    add         esi, byte 4*SIZEOF_JCOEF            ; coef_block
+    add         edx, byte 4*SIZEOF_ISLOW_MULT_TYPE  ; quantptr
+    add         edi, byte 4*DCTSIZE*SIZEOF_JCOEF    ; wsptr
+    dec         ecx                                 ; ctr
+    jnz         near .columnloop
+
+    ; ---- Pass 2: process rows from work array, store into output array.
+
+    mov         eax, [original_ebp]
+    lea         esi, [workspace]                   ; JCOEF *wsptr
+    mov         edi, JSAMPARRAY [output_buf(eax)]  ; (JSAMPROW *)
+    mov         eax, JDIMENSION [output_col(eax)]
+
+    ; -- Odd part
+
+    movq        mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    movq        mm1, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+    movq        mm2, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+    movq        mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+
+    movq        mm4, mm0
+    movq        mm5, mm0
+    punpcklwd   mm4, mm1
+    punpckhwd   mm5, mm1
+    movq        mm0, mm4
+    movq        mm1, mm5
+    pmaddwd     mm4, [GOTOFF(ebx,PW_F256_F089)]   ; mm4=(tmp2L)
+    pmaddwd     mm5, [GOTOFF(ebx,PW_F256_F089)]   ; mm5=(tmp2H)
+    pmaddwd     mm0, [GOTOFF(ebx,PW_F106_MF217)]  ; mm0=(tmp0L)
+    pmaddwd     mm1, [GOTOFF(ebx,PW_F106_MF217)]  ; mm1=(tmp0H)
+
+    movq        mm6, mm2
+    movq        mm7, mm2
+    punpcklwd   mm6, mm3
+    punpckhwd   mm7, mm3
+    movq        mm2, mm6
+    movq        mm3, mm7
+    pmaddwd     mm6, [GOTOFF(ebx,PW_MF060_MF050)]  ; mm6=(tmp2L)
+    pmaddwd     mm7, [GOTOFF(ebx,PW_MF060_MF050)]  ; mm7=(tmp2H)
+    pmaddwd     mm2, [GOTOFF(ebx,PW_F145_MF021)]   ; mm2=(tmp0L)
+    pmaddwd     mm3, [GOTOFF(ebx,PW_F145_MF021)]   ; mm3=(tmp0H)
+
+    paddd       mm6, mm4                ; mm6=tmp2L
+    paddd       mm7, mm5                ; mm7=tmp2H
+    paddd       mm2, mm0                ; mm2=tmp0L
+    paddd       mm3, mm1                ; mm3=tmp0H
+
+    movq        MMWORD [wk(0)], mm2     ; wk(0)=tmp0L
+    movq        MMWORD [wk(1)], mm3     ; wk(1)=tmp0H
+
+    ; -- Even part
+
+    movq        mm4, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+    movq        mm5, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    movq        mm0, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+
+    pxor        mm1, mm1
+    pxor        mm2, mm2
+    punpcklwd   mm1, mm4                ; mm1=tmp0L
+    punpckhwd   mm2, mm4                ; mm2=tmp0H
+    psrad       mm1, (16-CONST_BITS-1)  ; psrad mm1,16 & pslld mm1,CONST_BITS+1
+    psrad       mm2, (16-CONST_BITS-1)  ; psrad mm2,16 & pslld mm2,CONST_BITS+1
+
+    movq        mm3, mm5                ; mm5=in2=z2
+    punpcklwd   mm5, mm0                ; mm0=in6=z3
+    punpckhwd   mm3, mm0
+    pmaddwd     mm5, [GOTOFF(ebx,PW_F184_MF076)]  ; mm5=tmp2L
+    pmaddwd     mm3, [GOTOFF(ebx,PW_F184_MF076)]  ; mm3=tmp2H
+
+    movq        mm4, mm1
+    movq        mm0, mm2
+    paddd       mm1, mm5                ; mm1=tmp10L
+    paddd       mm2, mm3                ; mm2=tmp10H
+    psubd       mm4, mm5                ; mm4=tmp12L
+    psubd       mm0, mm3                ; mm0=tmp12H
+
+    ; -- Final output stage
+
+    movq        mm5, mm1
+    movq        mm3, mm2
+    paddd       mm1, mm6                ; mm1=data0L
+    paddd       mm2, mm7                ; mm2=data0H
+    psubd       mm5, mm6                ; mm5=data3L
+    psubd       mm3, mm7                ; mm3=data3H
+
+    movq        mm6, [GOTOFF(ebx,PD_DESCALE_P2_4)]  ; mm6=[PD_DESCALE_P2_4]
+
+    paddd       mm1, mm6
+    paddd       mm2, mm6
+    psrad       mm1, DESCALE_P2_4
+    psrad       mm2, DESCALE_P2_4
+    paddd       mm5, mm6
+    paddd       mm3, mm6
+    psrad       mm5, DESCALE_P2_4
+    psrad       mm3, DESCALE_P2_4
+
+    packssdw    mm1, mm2                ; mm1=data0=(00 10 20 30)
+    packssdw    mm5, mm3                ; mm5=data3=(03 13 23 33)
+
+    movq        mm7, MMWORD [wk(0)]     ; mm7=tmp0L
+    movq        mm6, MMWORD [wk(1)]     ; mm6=tmp0H
+
+    movq        mm2, mm4
+    movq        mm3, mm0
+    paddd       mm4, mm7                ; mm4=data1L
+    paddd       mm0, mm6                ; mm0=data1H
+    psubd       mm2, mm7                ; mm2=data2L
+    psubd       mm3, mm6                ; mm3=data2H
+
+    movq        mm7, [GOTOFF(ebx,PD_DESCALE_P2_4)]  ; mm7=[PD_DESCALE_P2_4]
+
+    paddd       mm4, mm7
+    paddd       mm0, mm7
+    psrad       mm4, DESCALE_P2_4
+    psrad       mm0, DESCALE_P2_4
+    paddd       mm2, mm7
+    paddd       mm3, mm7
+    psrad       mm2, DESCALE_P2_4
+    psrad       mm3, DESCALE_P2_4
+
+    packssdw    mm4, mm0                ; mm4=data1=(01 11 21 31)
+    packssdw    mm2, mm3                ; mm2=data2=(02 12 22 32)
+
+    movq        mm6, [GOTOFF(ebx,PB_CENTERJSAMP)]  ; mm6=[PB_CENTERJSAMP]
+
+    packsswb    mm1, mm2                ; mm1=(00 10 20 30 02 12 22 32)
+    packsswb    mm4, mm5                ; mm4=(01 11 21 31 03 13 23 33)
+    paddb       mm1, mm6
+    paddb       mm4, mm6
+
+    movq        mm7, mm1                ; transpose coefficients(phase 1)
+    punpcklbw   mm1, mm4                ; mm1=(00 01 10 11 20 21 30 31)
+    punpckhbw   mm7, mm4                ; mm7=(02 03 12 13 22 23 32 33)
+
+    movq        mm0, mm1                ; transpose coefficients(phase 2)
+    punpcklwd   mm1, mm7                ; mm1=(00 01 02 03 10 11 12 13)
+    punpckhwd   mm0, mm7                ; mm0=(20 21 22 23 30 31 32 33)
+
+    mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+    mov         esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
+    movd        dword [edx+eax*SIZEOF_JSAMPLE], mm1
+    movd        dword [esi+eax*SIZEOF_JSAMPLE], mm0
+
+    psrlq       mm1, 4*BYTE_BIT
+    psrlq       mm0, 4*BYTE_BIT
+
+    mov         edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+    mov         esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
+    movd        dword [edx+eax*SIZEOF_JSAMPLE], mm1
+    movd        dword [esi+eax*SIZEOF_JSAMPLE], mm0
+
+    emms                                ; empty MMX state
+
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    poppic      ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Perform dequantization and inverse DCT on one block of coefficients,
+; producing a reduced-size 2x2 output block.
+;
+; GLOBAL(void)
+; jsimd_idct_2x2_mmx(void *dct_table, JCOEFPTR coef_block,
+;                    JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define dct_table(b)   (b) + 8          ; void *dct_table
+%define coef_block(b)  (b) + 12         ; JCOEFPTR coef_block
+%define output_buf(b)  (b) + 16         ; JSAMPARRAY output_buf
+%define output_col(b)  (b) + 20         ; JDIMENSION output_col
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_idct_2x2_mmx)
+
+EXTN(jsimd_idct_2x2_mmx):
+    push        ebp
+    mov         ebp, esp
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+
+    ; ---- Pass 1: process columns from input.
+
+    mov         edx, POINTER [dct_table(ebp)]    ; quantptr
+    mov         esi, JCOEFPTR [coef_block(ebp)]  ; inptr
+
+    ; | input:                  | result:        |
+    ; | 00 01 ** 03 ** 05 ** 07 |                |
+    ; | 10 11 ** 13 ** 15 ** 17 |                |
+    ; | ** ** ** ** ** ** ** ** |                |
+    ; | 30 31 ** 33 ** 35 ** 37 | A0 A1 A3 A5 A7 |
+    ; | ** ** ** ** ** ** ** ** | B0 B1 B3 B5 B7 |
+    ; | 50 51 ** 53 ** 55 ** 57 |                |
+    ; | ** ** ** ** ** ** ** ** |                |
+    ; | 70 71 ** 73 ** 75 ** 77 |                |
+
+    ; -- Odd part
+
+    movq        mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    movq        mm1, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+    pmullw      mm0, MMWORD [MMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    movq        mm2, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+    movq        mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+    pmullw      mm2, MMWORD [MMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    ; mm0=(10 11 ** 13), mm1=(30 31 ** 33)
+    ; mm2=(50 51 ** 53), mm3=(70 71 ** 73)
+
+    pcmpeqd     mm7, mm7
+    pslld       mm7, WORD_BIT           ; mm7={0x0000 0xFFFF 0x0000 0xFFFF}
+
+    movq        mm4, mm0                ; mm4=(10 11 ** 13)
+    movq        mm5, mm2                ; mm5=(50 51 ** 53)
+    punpcklwd   mm4, mm1                ; mm4=(10 30 11 31)
+    punpcklwd   mm5, mm3                ; mm5=(50 70 51 71)
+    pmaddwd     mm4, [GOTOFF(ebx,PW_F362_MF127)]
+    pmaddwd     mm5, [GOTOFF(ebx,PW_F085_MF072)]
+
+    psrld       mm0, WORD_BIT           ; mm0=(11 -- 13 --)
+    pand        mm1, mm7                ; mm1=(-- 31 -- 33)
+    psrld       mm2, WORD_BIT           ; mm2=(51 -- 53 --)
+    pand        mm3, mm7                ; mm3=(-- 71 -- 73)
+    por         mm0, mm1                ; mm0=(11 31 13 33)
+    por         mm2, mm3                ; mm2=(51 71 53 73)
+    pmaddwd     mm0, [GOTOFF(ebx,PW_F362_MF127)]
+    pmaddwd     mm2, [GOTOFF(ebx,PW_F085_MF072)]
+
+    paddd       mm4, mm5                ; mm4=tmp0[col0 col1]
+
+    movq        mm6, MMWORD [MMBLOCK(1,1,esi,SIZEOF_JCOEF)]
+    movq        mm1, MMWORD [MMBLOCK(3,1,esi,SIZEOF_JCOEF)]
+    pmullw      mm6, MMWORD [MMBLOCK(1,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      mm1, MMWORD [MMBLOCK(3,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    movq        mm3, MMWORD [MMBLOCK(5,1,esi,SIZEOF_JCOEF)]
+    movq        mm5, MMWORD [MMBLOCK(7,1,esi,SIZEOF_JCOEF)]
+    pmullw      mm3, MMWORD [MMBLOCK(5,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      mm5, MMWORD [MMBLOCK(7,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    ; mm6=(** 15 ** 17), mm1=(** 35 ** 37)
+    ; mm3=(** 55 ** 57), mm5=(** 75 ** 77)
+
+    psrld       mm6, WORD_BIT           ; mm6=(15 -- 17 --)
+    pand        mm1, mm7                ; mm1=(-- 35 -- 37)
+    psrld       mm3, WORD_BIT           ; mm3=(55 -- 57 --)
+    pand        mm5, mm7                ; mm5=(-- 75 -- 77)
+    por         mm6, mm1                ; mm6=(15 35 17 37)
+    por         mm3, mm5                ; mm3=(55 75 57 77)
+    pmaddwd     mm6, [GOTOFF(ebx,PW_F362_MF127)]
+    pmaddwd     mm3, [GOTOFF(ebx,PW_F085_MF072)]
+
+    paddd       mm0, mm2                ; mm0=tmp0[col1 col3]
+    paddd       mm6, mm3                ; mm6=tmp0[col5 col7]
+
+    ; -- Even part
+
+    movq        mm1, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+    movq        mm5, MMWORD [MMBLOCK(0,1,esi,SIZEOF_JCOEF)]
+    pmullw      mm1, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      mm5, MMWORD [MMBLOCK(0,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    ; mm1=(00 01 ** 03), mm5=(** 05 ** 07)
+
+    movq        mm2, mm1                      ; mm2=(00 01 ** 03)
+    pslld       mm1, WORD_BIT                 ; mm1=(-- 00 -- **)
+    psrad       mm1, (WORD_BIT-CONST_BITS-2)  ; mm1=tmp10[col0 ****]
+
+    pand        mm2, mm7                      ; mm2=(-- 01 -- 03)
+    pand        mm5, mm7                      ; mm5=(-- 05 -- 07)
+    psrad       mm2, (WORD_BIT-CONST_BITS-2)  ; mm2=tmp10[col1 col3]
+    psrad       mm5, (WORD_BIT-CONST_BITS-2)  ; mm5=tmp10[col5 col7]
+
+    ; -- Final output stage
+
+    movq        mm3, mm1
+    paddd       mm1, mm4                ; mm1=data0[col0 ****]=(A0 **)
+    psubd       mm3, mm4                ; mm3=data1[col0 ****]=(B0 **)
+    punpckldq   mm1, mm3                ; mm1=(A0 B0)
+
+    movq        mm7, [GOTOFF(ebx,PD_DESCALE_P1_2)]  ; mm7=[PD_DESCALE_P1_2]
+
+    movq        mm4, mm2
+    movq        mm3, mm5
+    paddd       mm2, mm0                ; mm2=data0[col1 col3]=(A1 A3)
+    paddd       mm5, mm6                ; mm5=data0[col5 col7]=(A5 A7)
+    psubd       mm4, mm0                ; mm4=data1[col1 col3]=(B1 B3)
+    psubd       mm3, mm6                ; mm3=data1[col5 col7]=(B5 B7)
+
+    paddd       mm1, mm7
+    psrad       mm1, DESCALE_P1_2
+
+    paddd       mm2, mm7
+    paddd       mm5, mm7
+    psrad       mm2, DESCALE_P1_2
+    psrad       mm5, DESCALE_P1_2
+    paddd       mm4, mm7
+    paddd       mm3, mm7
+    psrad       mm4, DESCALE_P1_2
+    psrad       mm3, DESCALE_P1_2
+
+    ; ---- Pass 2: process rows, store into output array.
+
+    mov         edi, JSAMPARRAY [output_buf(ebp)]  ; (JSAMPROW *)
+    mov         eax, JDIMENSION [output_col(ebp)]
+
+    ; | input:| result:|
+    ; | A0 B0 |        |
+    ; | A1 B1 | C0 C1  |
+    ; | A3 B3 | D0 D1  |
+    ; | A5 B5 |        |
+    ; | A7 B7 |        |
+
+    ; -- Odd part
+
+    packssdw    mm2, mm4                ; mm2=(A1 A3 B1 B3)
+    packssdw    mm5, mm3                ; mm5=(A5 A7 B5 B7)
+    pmaddwd     mm2, [GOTOFF(ebx,PW_F362_MF127)]
+    pmaddwd     mm5, [GOTOFF(ebx,PW_F085_MF072)]
+
+    paddd       mm2, mm5                ; mm2=tmp0[row0 row1]
+
+    ; -- Even part
+
+    pslld       mm1, (CONST_BITS+2)     ; mm1=tmp10[row0 row1]
+
+    ; -- Final output stage
+
+    movq        mm0, [GOTOFF(ebx,PD_DESCALE_P2_2)]  ; mm0=[PD_DESCALE_P2_2]
+
+    movq        mm6, mm1
+    paddd       mm1, mm2                ; mm1=data0[row0 row1]=(C0 C1)
+    psubd       mm6, mm2                ; mm6=data1[row0 row1]=(D0 D1)
+
+    paddd       mm1, mm0
+    paddd       mm6, mm0
+    psrad       mm1, DESCALE_P2_2
+    psrad       mm6, DESCALE_P2_2
+
+    movq        mm7, mm1                ; transpose coefficients
+    punpckldq   mm1, mm6                ; mm1=(C0 D0)
+    punpckhdq   mm7, mm6                ; mm7=(C1 D1)
+
+    packssdw    mm1, mm7                ; mm1=(C0 D0 C1 D1)
+    packsswb    mm1, mm1                ; mm1=(C0 D0 C1 D1 C0 D0 C1 D1)
+    paddb       mm1, [GOTOFF(ebx,PB_CENTERJSAMP)]
+
+    movd        ecx, mm1
+    movd        ebx, mm1                ; ebx=(C0 D0 C1 D1)
+    shr         ecx, 2*BYTE_BIT         ; ecx=(C1 D1 -- --)
+
+    mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+    mov         esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+    mov         word [edx+eax*SIZEOF_JSAMPLE], bx
+    mov         word [esi+eax*SIZEOF_JSAMPLE], cx
+
+    emms                                ; empty MMX state
+
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/i386/jidctred-sse2.asm b/media/libjpeg/simd/i386/jidctred-sse2.asm
new file mode 100644
index 0000000000..6e56494e97
--- /dev/null
+++ b/media/libjpeg/simd/i386/jidctred-sse2.asm
@@ -0,0 +1,592 @@
+;
+; jidctred.asm - reduced-size IDCT (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains inverse-DCT routines that produce reduced-size
+; output: either 4x4 or 2x2 pixels from an 8x8 DCT block.
+; The following code is based directly on the IJG's original jidctred.c;
+; see the jidctred.c for more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS    13
+%define PASS1_BITS    2
+
+%define DESCALE_P1_4  (CONST_BITS - PASS1_BITS + 1)
+%define DESCALE_P2_4  (CONST_BITS + PASS1_BITS + 3 + 1)
+%define DESCALE_P1_2  (CONST_BITS - PASS1_BITS + 2)
+%define DESCALE_P2_2  (CONST_BITS + PASS1_BITS + 3 + 2)
+
+%if CONST_BITS == 13
+F_0_211 equ  1730  ; FIX(0.211164243)
+F_0_509 equ  4176  ; FIX(0.509795579)
+F_0_601 equ  4926  ; FIX(0.601344887)
+F_0_720 equ  5906  ; FIX(0.720959822)
+F_0_765 equ  6270  ; FIX(0.765366865)
+F_0_850 equ  6967  ; FIX(0.850430095)
+F_0_899 equ  7373  ; FIX(0.899976223)
+F_1_061 equ  8697  ; FIX(1.061594337)
+F_1_272 equ 10426  ; FIX(1.272758580)
+F_1_451 equ 11893  ; FIX(1.451774981)
+F_1_847 equ 15137  ; FIX(1.847759065)
+F_2_172 equ 17799  ; FIX(2.172734803)
+F_2_562 equ 20995  ; FIX(2.562915447)
+F_3_624 equ 29692  ; FIX(3.624509785)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x, n)  (((x) + (1 << ((n) - 1))) >> (n))
+F_0_211 equ DESCALE( 226735879, 30 - CONST_BITS)  ; FIX(0.211164243)
+F_0_509 equ DESCALE( 547388834, 30 - CONST_BITS)  ; FIX(0.509795579)
+F_0_601 equ DESCALE( 645689155, 30 - CONST_BITS)  ; FIX(0.601344887)
+F_0_720 equ DESCALE( 774124714, 30 - CONST_BITS)  ; FIX(0.720959822)
+F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS)  ; FIX(0.765366865)
+F_0_850 equ DESCALE( 913142361, 30 - CONST_BITS)  ; FIX(0.850430095)
+F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS)  ; FIX(0.899976223)
+F_1_061 equ DESCALE(1139878239, 30 - CONST_BITS)  ; FIX(1.061594337)
+F_1_272 equ DESCALE(1366614119, 30 - CONST_BITS)  ; FIX(1.272758580)
+F_1_451 equ DESCALE(1558831516, 30 - CONST_BITS)  ; FIX(1.451774981)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS)  ; FIX(1.847759065)
+F_2_172 equ DESCALE(2332956230, 30 - CONST_BITS)  ; FIX(2.172734803)
+F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS)  ; FIX(2.562915447)
+F_3_624 equ DESCALE(3891787747, 30 - CONST_BITS)  ; FIX(3.624509785)
+%endif
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_idct_red_sse2)
+
+EXTN(jconst_idct_red_sse2):
+
+PW_F184_MF076   times 4  dw  F_1_847, -F_0_765
+PW_F256_F089    times 4  dw  F_2_562,  F_0_899
+PW_F106_MF217   times 4  dw  F_1_061, -F_2_172
+PW_MF060_MF050  times 4  dw -F_0_601, -F_0_509
+PW_F145_MF021   times 4  dw  F_1_451, -F_0_211
+PW_F362_MF127   times 4  dw  F_3_624, -F_1_272
+PW_F085_MF072   times 4  dw  F_0_850, -F_0_720
+PD_DESCALE_P1_4 times 4  dd  1 << (DESCALE_P1_4 - 1)
+PD_DESCALE_P2_4 times 4  dd  1 << (DESCALE_P2_4 - 1)
+PD_DESCALE_P1_2 times 4  dd  1 << (DESCALE_P1_2 - 1)
+PD_DESCALE_P2_2 times 4  dd  1 << (DESCALE_P2_2 - 1)
+PB_CENTERJSAMP  times 16 db  CENTERJSAMPLE
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Perform dequantization and inverse DCT on one block of coefficients,
+; producing a reduced-size 4x4 output block.
+;
+; GLOBAL(void)
+; jsimd_idct_4x4_sse2(void *dct_table, JCOEFPTR coef_block,
+;                     JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define dct_table(b)   (b) + 8          ; void *dct_table
+%define coef_block(b)  (b) + 12         ; JCOEFPTR coef_block
+%define output_buf(b)  (b) + 16         ; JSAMPARRAY output_buf
+%define output_col(b)  (b) + 20         ; JDIMENSION output_col
+
+%define original_ebp   ebp + 0
+%define wk(i)          ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+                                        ; xmmword wk[WK_NUM]
+%define WK_NUM         2
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_idct_4x4_sse2)
+
+EXTN(jsimd_idct_4x4_sse2):
+    push        ebp
+    mov         eax, esp                     ; eax = original ebp
+    sub         esp, byte 4
+    and         esp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [esp], eax
+    mov         ebp, esp                     ; ebp = aligned ebp
+    lea         esp, [wk(0)]
+    pushpic     ebx
+;   push        ecx                     ; unused
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+
+    ; ---- Pass 1: process columns from input.
+
+;   mov         eax, [original_ebp]
+    mov         edx, POINTER [dct_table(eax)]    ; quantptr
+    mov         esi, JCOEFPTR [coef_block(eax)]  ; inptr
+
+%ifndef NO_ZERO_COLUMN_TEST_4X4_SSE2
+    mov         eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    or          eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    jnz         short .columnDCT
+
+    movdqa      xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    por         xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+    por         xmm1, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+    por         xmm0, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+    por         xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+    por         xmm0, xmm1
+    packsswb    xmm0, xmm0
+    packsswb    xmm0, xmm0
+    movd        eax, xmm0
+    test        eax, eax
+    jnz         short .columnDCT
+
+    ; -- AC terms all zero
+
+    movdqa      xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+    pmullw      xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    psllw       xmm0, PASS1_BITS
+
+    movdqa      xmm3, xmm0        ; xmm0=in0=(00 01 02 03 04 05 06 07)
+    punpcklwd   xmm0, xmm0        ; xmm0=(00 00 01 01 02 02 03 03)
+    punpckhwd   xmm3, xmm3        ; xmm3=(04 04 05 05 06 06 07 07)
+
+    pshufd      xmm1, xmm0, 0x50  ; xmm1=[col0 col1]=(00 00 00 00 01 01 01 01)
+    pshufd      xmm0, xmm0, 0xFA  ; xmm0=[col2 col3]=(02 02 02 02 03 03 03 03)
+    pshufd      xmm6, xmm3, 0x50  ; xmm6=[col4 col5]=(04 04 04 04 05 05 05 05)
+    pshufd      xmm3, xmm3, 0xFA  ; xmm3=[col6 col7]=(06 06 06 06 07 07 07 07)
+
+    jmp         near .column_end
+    alignx      16, 7
+%endif
+.columnDCT:
+
+    ; -- Odd part
+
+    movdqa      xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+    pmullw      xmm0, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    movdqa      xmm2, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+    movdqa      xmm3, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+    pmullw      xmm2, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    movdqa      xmm4, xmm0
+    movdqa      xmm5, xmm0
+    punpcklwd   xmm4, xmm1
+    punpckhwd   xmm5, xmm1
+    movdqa      xmm0, xmm4
+    movdqa      xmm1, xmm5
+    pmaddwd     xmm4, [GOTOFF(ebx,PW_F256_F089)]   ; xmm4=(tmp2L)
+    pmaddwd     xmm5, [GOTOFF(ebx,PW_F256_F089)]   ; xmm5=(tmp2H)
+    pmaddwd     xmm0, [GOTOFF(ebx,PW_F106_MF217)]  ; xmm0=(tmp0L)
+    pmaddwd     xmm1, [GOTOFF(ebx,PW_F106_MF217)]  ; xmm1=(tmp0H)
+
+    movdqa      xmm6, xmm2
+    movdqa      xmm7, xmm2
+    punpcklwd   xmm6, xmm3
+    punpckhwd   xmm7, xmm3
+    movdqa      xmm2, xmm6
+    movdqa      xmm3, xmm7
+    pmaddwd     xmm6, [GOTOFF(ebx,PW_MF060_MF050)]  ; xmm6=(tmp2L)
+    pmaddwd     xmm7, [GOTOFF(ebx,PW_MF060_MF050)]  ; xmm7=(tmp2H)
+    pmaddwd     xmm2, [GOTOFF(ebx,PW_F145_MF021)]   ; xmm2=(tmp0L)
+    pmaddwd     xmm3, [GOTOFF(ebx,PW_F145_MF021)]   ; xmm3=(tmp0H)
+
+    paddd       xmm6, xmm4              ; xmm6=tmp2L
+    paddd       xmm7, xmm5              ; xmm7=tmp2H
+    paddd       xmm2, xmm0              ; xmm2=tmp0L
+    paddd       xmm3, xmm1              ; xmm3=tmp0H
+
+    movdqa      XMMWORD [wk(0)], xmm2   ; wk(0)=tmp0L
+    movdqa      XMMWORD [wk(1)], xmm3   ; wk(1)=tmp0H
+
+    ; -- Even part
+
+    movdqa      xmm4, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+    movdqa      xmm5, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    movdqa      xmm0, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+    pmullw      xmm4, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      xmm5, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      xmm0, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    pxor        xmm1, xmm1
+    pxor        xmm2, xmm2
+    punpcklwd   xmm1, xmm4               ; xmm1=tmp0L
+    punpckhwd   xmm2, xmm4               ; xmm2=tmp0H
+    psrad       xmm1, (16-CONST_BITS-1)  ; psrad xmm1,16 & pslld xmm1,CONST_BITS+1
+    psrad       xmm2, (16-CONST_BITS-1)  ; psrad xmm2,16 & pslld xmm2,CONST_BITS+1
+
+    movdqa      xmm3, xmm5              ; xmm5=in2=z2
+    punpcklwd   xmm5, xmm0              ; xmm0=in6=z3
+    punpckhwd   xmm3, xmm0
+    pmaddwd     xmm5, [GOTOFF(ebx,PW_F184_MF076)]  ; xmm5=tmp2L
+    pmaddwd     xmm3, [GOTOFF(ebx,PW_F184_MF076)]  ; xmm3=tmp2H
+
+    movdqa      xmm4, xmm1
+    movdqa      xmm0, xmm2
+    paddd       xmm1, xmm5              ; xmm1=tmp10L
+    paddd       xmm2, xmm3              ; xmm2=tmp10H
+    psubd       xmm4, xmm5              ; xmm4=tmp12L
+    psubd       xmm0, xmm3              ; xmm0=tmp12H
+
+    ; -- Final output stage
+
+    movdqa      xmm5, xmm1
+    movdqa      xmm3, xmm2
+    paddd       xmm1, xmm6              ; xmm1=data0L
+    paddd       xmm2, xmm7              ; xmm2=data0H
+    psubd       xmm5, xmm6              ; xmm5=data3L
+    psubd       xmm3, xmm7              ; xmm3=data3H
+
+    movdqa      xmm6, [GOTOFF(ebx,PD_DESCALE_P1_4)]  ; xmm6=[PD_DESCALE_P1_4]
+
+    paddd       xmm1, xmm6
+    paddd       xmm2, xmm6
+    psrad       xmm1, DESCALE_P1_4
+    psrad       xmm2, DESCALE_P1_4
+    paddd       xmm5, xmm6
+    paddd       xmm3, xmm6
+    psrad       xmm5, DESCALE_P1_4
+    psrad       xmm3, DESCALE_P1_4
+
+    packssdw    xmm1, xmm2              ; xmm1=data0=(00 01 02 03 04 05 06 07)
+    packssdw    xmm5, xmm3              ; xmm5=data3=(30 31 32 33 34 35 36 37)
+
+    movdqa      xmm7, XMMWORD [wk(0)]   ; xmm7=tmp0L
+    movdqa      xmm6, XMMWORD [wk(1)]   ; xmm6=tmp0H
+
+    movdqa      xmm2, xmm4
+    movdqa      xmm3, xmm0
+    paddd       xmm4, xmm7              ; xmm4=data1L
+    paddd       xmm0, xmm6              ; xmm0=data1H
+    psubd       xmm2, xmm7              ; xmm2=data2L
+    psubd       xmm3, xmm6              ; xmm3=data2H
+
+    movdqa      xmm7, [GOTOFF(ebx,PD_DESCALE_P1_4)]  ; xmm7=[PD_DESCALE_P1_4]
+
+    paddd       xmm4, xmm7
+    paddd       xmm0, xmm7
+    psrad       xmm4, DESCALE_P1_4
+    psrad       xmm0, DESCALE_P1_4
+    paddd       xmm2, xmm7
+    paddd       xmm3, xmm7
+    psrad       xmm2, DESCALE_P1_4
+    psrad       xmm3, DESCALE_P1_4
+
+    packssdw    xmm4, xmm0        ; xmm4=data1=(10 11 12 13 14 15 16 17)
+    packssdw    xmm2, xmm3        ; xmm2=data2=(20 21 22 23 24 25 26 27)
+
+    movdqa      xmm6, xmm1        ; transpose coefficients(phase 1)
+    punpcklwd   xmm1, xmm4        ; xmm1=(00 10 01 11 02 12 03 13)
+    punpckhwd   xmm6, xmm4        ; xmm6=(04 14 05 15 06 16 07 17)
+    movdqa      xmm7, xmm2        ; transpose coefficients(phase 1)
+    punpcklwd   xmm2, xmm5        ; xmm2=(20 30 21 31 22 32 23 33)
+    punpckhwd   xmm7, xmm5        ; xmm7=(24 34 25 35 26 36 27 37)
+
+    movdqa      xmm0, xmm1        ; transpose coefficients(phase 2)
+    punpckldq   xmm1, xmm2        ; xmm1=[col0 col1]=(00 10 20 30 01 11 21 31)
+    punpckhdq   xmm0, xmm2        ; xmm0=[col2 col3]=(02 12 22 32 03 13 23 33)
+    movdqa      xmm3, xmm6        ; transpose coefficients(phase 2)
+    punpckldq   xmm6, xmm7        ; xmm6=[col4 col5]=(04 14 24 34 05 15 25 35)
+    punpckhdq   xmm3, xmm7        ; xmm3=[col6 col7]=(06 16 26 36 07 17 27 37)
+.column_end:
+
+    ; -- Prefetch the next coefficient block
+
+    prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+    prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+    prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+    prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+    ; ---- Pass 2: process rows, store into output array.
+
+    mov         eax, [original_ebp]
+    mov         edi, JSAMPARRAY [output_buf(eax)]  ; (JSAMPROW *)
+    mov         eax, JDIMENSION [output_col(eax)]
+
+    ; -- Even part
+
+    pxor        xmm4, xmm4
+    punpcklwd   xmm4, xmm1               ; xmm4=tmp0
+    psrad       xmm4, (16-CONST_BITS-1)  ; psrad xmm4,16 & pslld xmm4,CONST_BITS+1
+
+    ; -- Odd part
+
+    punpckhwd   xmm1, xmm0
+    punpckhwd   xmm6, xmm3
+    movdqa      xmm5, xmm1
+    movdqa      xmm2, xmm6
+    pmaddwd     xmm1, [GOTOFF(ebx,PW_F256_F089)]    ; xmm1=(tmp2)
+    pmaddwd     xmm6, [GOTOFF(ebx,PW_MF060_MF050)]  ; xmm6=(tmp2)
+    pmaddwd     xmm5, [GOTOFF(ebx,PW_F106_MF217)]   ; xmm5=(tmp0)
+    pmaddwd     xmm2, [GOTOFF(ebx,PW_F145_MF021)]   ; xmm2=(tmp0)
+
+    paddd       xmm6, xmm1              ; xmm6=tmp2
+    paddd       xmm2, xmm5              ; xmm2=tmp0
+
+    ; -- Even part
+
+    punpcklwd   xmm0, xmm3
+    pmaddwd     xmm0, [GOTOFF(ebx,PW_F184_MF076)]  ; xmm0=tmp2
+
+    movdqa      xmm7, xmm4
+    paddd       xmm4, xmm0              ; xmm4=tmp10
+    psubd       xmm7, xmm0              ; xmm7=tmp12
+
+    ; -- Final output stage
+
+    movdqa      xmm1, [GOTOFF(ebx,PD_DESCALE_P2_4)]  ; xmm1=[PD_DESCALE_P2_4]
+
+    movdqa      xmm5, xmm4
+    movdqa      xmm3, xmm7
+    paddd       xmm4, xmm6              ; xmm4=data0=(00 10 20 30)
+    paddd       xmm7, xmm2              ; xmm7=data1=(01 11 21 31)
+    psubd       xmm5, xmm6              ; xmm5=data3=(03 13 23 33)
+    psubd       xmm3, xmm2              ; xmm3=data2=(02 12 22 32)
+
+    paddd       xmm4, xmm1
+    paddd       xmm7, xmm1
+    psrad       xmm4, DESCALE_P2_4
+    psrad       xmm7, DESCALE_P2_4
+    paddd       xmm5, xmm1
+    paddd       xmm3, xmm1
+    psrad       xmm5, DESCALE_P2_4
+    psrad       xmm3, DESCALE_P2_4
+
+    packssdw    xmm4, xmm3              ; xmm4=(00 10 20 30 02 12 22 32)
+    packssdw    xmm7, xmm5              ; xmm7=(01 11 21 31 03 13 23 33)
+
+    movdqa      xmm0, xmm4              ; transpose coefficients(phase 1)
+    punpcklwd   xmm4, xmm7              ; xmm4=(00 01 10 11 20 21 30 31)
+    punpckhwd   xmm0, xmm7              ; xmm0=(02 03 12 13 22 23 32 33)
+
+    movdqa      xmm6, xmm4              ; transpose coefficients(phase 2)
+    punpckldq   xmm4, xmm0              ; xmm4=(00 01 02 03 10 11 12 13)
+    punpckhdq   xmm6, xmm0              ; xmm6=(20 21 22 23 30 31 32 33)
+
+    packsswb    xmm4, xmm6              ; xmm4=(00 01 02 03 10 11 12 13 20 ..)
+    paddb       xmm4, [GOTOFF(ebx,PB_CENTERJSAMP)]
+
+    pshufd      xmm2, xmm4, 0x39        ; xmm2=(10 11 12 13 20 21 22 23 30 ..)
+    pshufd      xmm1, xmm4, 0x4E        ; xmm1=(20 21 22 23 30 31 32 33 00 ..)
+    pshufd      xmm3, xmm4, 0x93        ; xmm3=(30 31 32 33 00 01 02 03 10 ..)
+
+    mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+    mov         esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+    movd        XMM_DWORD [edx+eax*SIZEOF_JSAMPLE], xmm4
+    movd        XMM_DWORD [esi+eax*SIZEOF_JSAMPLE], xmm2
+    mov         edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
+    mov         esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
+    movd        XMM_DWORD [edx+eax*SIZEOF_JSAMPLE], xmm1
+    movd        XMM_DWORD [esi+eax*SIZEOF_JSAMPLE], xmm3
+
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; unused
+    poppic      ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Perform dequantization and inverse DCT on one block of coefficients,
+; producing a reduced-size 2x2 output block.
+;
+; GLOBAL(void)
+; jsimd_idct_2x2_sse2(void *dct_table, JCOEFPTR coef_block,
+;                     JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define dct_table(b)   (b) + 8          ; void *dct_table
+%define coef_block(b)  (b) + 12         ; JCOEFPTR coef_block
+%define output_buf(b)  (b) + 16         ; JSAMPARRAY output_buf
+%define output_col(b)  (b) + 20         ; JDIMENSION output_col
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_idct_2x2_sse2)
+
+EXTN(jsimd_idct_2x2_sse2):
+    push        ebp
+    mov         ebp, esp
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+
+    ; ---- Pass 1: process columns from input.
+
+    mov         edx, POINTER [dct_table(ebp)]    ; quantptr
+    mov         esi, JCOEFPTR [coef_block(ebp)]  ; inptr
+
+    ; | input:                  | result:        |
+    ; | 00 01 ** 03 ** 05 ** 07 |                |
+    ; | 10 11 ** 13 ** 15 ** 17 |                |
+    ; | ** ** ** ** ** ** ** ** |                |
+    ; | 30 31 ** 33 ** 35 ** 37 | A0 A1 A3 A5 A7 |
+    ; | ** ** ** ** ** ** ** ** | B0 B1 B3 B5 B7 |
+    ; | 50 51 ** 53 ** 55 ** 57 |                |
+    ; | ** ** ** ** ** ** ** ** |                |
+    ; | 70 71 ** 73 ** 75 ** 77 |                |
+
+    ; -- Odd part
+
+    movdqa      xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+    pmullw      xmm0, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    movdqa      xmm2, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+    movdqa      xmm3, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+    pmullw      xmm2, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    ; xmm0=(10 11 ** 13 ** 15 ** 17), xmm1=(30 31 ** 33 ** 35 ** 37)
+    ; xmm2=(50 51 ** 53 ** 55 ** 57), xmm3=(70 71 ** 73 ** 75 ** 77)
+
+    pcmpeqd     xmm7, xmm7
+    pslld       xmm7, WORD_BIT          ; xmm7={0x0000 0xFFFF 0x0000 0xFFFF ..}
+
+    movdqa      xmm4, xmm0              ; xmm4=(10 11 ** 13 ** 15 ** 17)
+    movdqa      xmm5, xmm2              ; xmm5=(50 51 ** 53 ** 55 ** 57)
+    punpcklwd   xmm4, xmm1              ; xmm4=(10 30 11 31 ** ** 13 33)
+    punpcklwd   xmm5, xmm3              ; xmm5=(50 70 51 71 ** ** 53 73)
+    pmaddwd     xmm4, [GOTOFF(ebx,PW_F362_MF127)]
+    pmaddwd     xmm5, [GOTOFF(ebx,PW_F085_MF072)]
+
+    psrld       xmm0, WORD_BIT          ; xmm0=(11 -- 13 -- 15 -- 17 --)
+    pand        xmm1, xmm7              ; xmm1=(-- 31 -- 33 -- 35 -- 37)
+    psrld       xmm2, WORD_BIT          ; xmm2=(51 -- 53 -- 55 -- 57 --)
+    pand        xmm3, xmm7              ; xmm3=(-- 71 -- 73 -- 75 -- 77)
+    por         xmm0, xmm1              ; xmm0=(11 31 13 33 15 35 17 37)
+    por         xmm2, xmm3              ; xmm2=(51 71 53 73 55 75 57 77)
+    pmaddwd     xmm0, [GOTOFF(ebx,PW_F362_MF127)]
+    pmaddwd     xmm2, [GOTOFF(ebx,PW_F085_MF072)]
+
+    paddd       xmm4, xmm5              ; xmm4=tmp0[col0 col1 **** col3]
+    paddd       xmm0, xmm2              ; xmm0=tmp0[col1 col3 col5 col7]
+
+    ; -- Even part
+
+    movdqa      xmm6, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+    pmullw      xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    ; xmm6=(00 01 ** 03 ** 05 ** 07)
+
+    movdqa      xmm1, xmm6              ; xmm1=(00 01 ** 03 ** 05 ** 07)
+    pslld       xmm6, WORD_BIT          ; xmm6=(-- 00 -- ** -- ** -- **)
+    pand        xmm1, xmm7              ; xmm1=(-- 01 -- 03 -- 05 -- 07)
+    psrad       xmm6, (WORD_BIT-CONST_BITS-2)  ; xmm6=tmp10[col0 **** **** ****]
+    psrad       xmm1, (WORD_BIT-CONST_BITS-2)  ; xmm1=tmp10[col1 col3 col5 col7]
+
+    ; -- Final output stage
+
+    movdqa      xmm3, xmm6
+    movdqa      xmm5, xmm1
+    paddd       xmm6, xmm4      ; xmm6=data0[col0 **** **** ****]=(A0 ** ** **)
+    paddd       xmm1, xmm0      ; xmm1=data0[col1 col3 col5 col7]=(A1 A3 A5 A7)
+    psubd       xmm3, xmm4      ; xmm3=data1[col0 **** **** ****]=(B0 ** ** **)
+    psubd       xmm5, xmm0      ; xmm5=data1[col1 col3 col5 col7]=(B1 B3 B5 B7)
+
+    movdqa      xmm2, [GOTOFF(ebx,PD_DESCALE_P1_2)]  ; xmm2=[PD_DESCALE_P1_2]
+
+    punpckldq   xmm6, xmm3              ; xmm6=(A0 B0 ** **)
+
+    movdqa      xmm7, xmm1
+    punpcklqdq  xmm1, xmm5              ; xmm1=(A1 A3 B1 B3)
+    punpckhqdq  xmm7, xmm5              ; xmm7=(A5 A7 B5 B7)
+
+    paddd       xmm6, xmm2
+    psrad       xmm6, DESCALE_P1_2
+
+    paddd       xmm1, xmm2
+    paddd       xmm7, xmm2
+    psrad       xmm1, DESCALE_P1_2
+    psrad       xmm7, DESCALE_P1_2
+
+    ; -- Prefetch the next coefficient block
+
+    prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+    prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+    prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+    prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+    ; ---- Pass 2: process rows, store into output array.
+
+    mov         edi, JSAMPARRAY [output_buf(ebp)]  ; (JSAMPROW *)
+    mov         eax, JDIMENSION [output_col(ebp)]
+
+    ; | input:| result:|
+    ; | A0 B0 |        |
+    ; | A1 B1 | C0 C1  |
+    ; | A3 B3 | D0 D1  |
+    ; | A5 B5 |        |
+    ; | A7 B7 |        |
+
+    ; -- Odd part
+
+    packssdw    xmm1, xmm1              ; xmm1=(A1 A3 B1 B3 A1 A3 B1 B3)
+    packssdw    xmm7, xmm7              ; xmm7=(A5 A7 B5 B7 A5 A7 B5 B7)
+    pmaddwd     xmm1, [GOTOFF(ebx,PW_F362_MF127)]
+    pmaddwd     xmm7, [GOTOFF(ebx,PW_F085_MF072)]
+
+    paddd       xmm1, xmm7              ; xmm1=tmp0[row0 row1 row0 row1]
+
+    ; -- Even part
+
+    pslld       xmm6, (CONST_BITS+2)    ; xmm6=tmp10[row0 row1 **** ****]
+
+    ; -- Final output stage
+
+    movdqa      xmm4, xmm6
+    paddd       xmm6, xmm1     ; xmm6=data0[row0 row1 **** ****]=(C0 C1 ** **)
+    psubd       xmm4, xmm1     ; xmm4=data1[row0 row1 **** ****]=(D0 D1 ** **)
+
+    punpckldq   xmm6, xmm4     ; xmm6=(C0 D0 C1 D1)
+
+    paddd       xmm6, [GOTOFF(ebx,PD_DESCALE_P2_2)]
+    psrad       xmm6, DESCALE_P2_2
+
+    packssdw    xmm6, xmm6              ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1)
+    packsswb    xmm6, xmm6              ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1 ..)
+    paddb       xmm6, [GOTOFF(ebx,PB_CENTERJSAMP)]
+
+    pextrw      ebx, xmm6, 0x00         ; ebx=(C0 D0 -- --)
+    pextrw      ecx, xmm6, 0x01         ; ecx=(C1 D1 -- --)
+
+    mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+    mov         esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+    mov         word [edx+eax*SIZEOF_JSAMPLE], bx
+    mov         word [esi+eax*SIZEOF_JSAMPLE], cx
+
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/i386/jquant-3dn.asm b/media/libjpeg/simd/i386/jquant-3dn.asm
new file mode 100644
index 0000000000..5cb60caa94
--- /dev/null
+++ b/media/libjpeg/simd/i386/jquant-3dn.asm
@@ -0,0 +1,230 @@
+;
+; jquant.asm - sample data conversion and quantization (3DNow! & MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Load data into workspace, applying unsigned->signed conversion
+;
+; GLOBAL(void)
+; jsimd_convsamp_float_3dnow(JSAMPARRAY sample_data, JDIMENSION start_col,
+;                            FAST_FLOAT *workspace);
+;
+
+%define sample_data  ebp + 8            ; JSAMPARRAY sample_data
+%define start_col    ebp + 12           ; JDIMENSION start_col
+%define workspace    ebp + 16           ; FAST_FLOAT *workspace
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_convsamp_float_3dnow)
+
+EXTN(jsimd_convsamp_float_3dnow):
+    push        ebp
+    mov         ebp, esp
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    pcmpeqw     mm7, mm7
+    psllw       mm7, 7
+    packsswb    mm7, mm7                ; mm7 = PB_CENTERJSAMPLE (0x808080..)
+
+    mov         esi, JSAMPARRAY [sample_data]  ; (JSAMPROW *)
+    mov         eax, JDIMENSION [start_col]
+    mov         edi, POINTER [workspace]       ; (DCTELEM *)
+    mov         ecx, DCTSIZE/2
+    alignx      16, 7
+.convloop:
+    mov         ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+
+    movq        mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE]
+    movq        mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE]
+
+    psubb       mm0, mm7                ; mm0=(01234567)
+    psubb       mm1, mm7                ; mm1=(89ABCDEF)
+
+    punpcklbw   mm2, mm0                ; mm2=(*0*1*2*3)
+    punpckhbw   mm0, mm0                ; mm0=(*4*5*6*7)
+    punpcklbw   mm3, mm1                ; mm3=(*8*9*A*B)
+    punpckhbw   mm1, mm1                ; mm1=(*C*D*E*F)
+
+    punpcklwd   mm4, mm2                ; mm4=(***0***1)
+    punpckhwd   mm2, mm2                ; mm2=(***2***3)
+    punpcklwd   mm5, mm0                ; mm5=(***4***5)
+    punpckhwd   mm0, mm0                ; mm0=(***6***7)
+
+    psrad       mm4, (DWORD_BIT-BYTE_BIT)  ; mm4=(01)
+    psrad       mm2, (DWORD_BIT-BYTE_BIT)  ; mm2=(23)
+    pi2fd       mm4, mm4
+    pi2fd       mm2, mm2
+    psrad       mm5, (DWORD_BIT-BYTE_BIT)  ; mm5=(45)
+    psrad       mm0, (DWORD_BIT-BYTE_BIT)  ; mm0=(67)
+    pi2fd       mm5, mm5
+    pi2fd       mm0, mm0
+
+    movq        MMWORD [MMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], mm4
+    movq        MMWORD [MMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], mm2
+    movq        MMWORD [MMBLOCK(0,2,edi,SIZEOF_FAST_FLOAT)], mm5
+    movq        MMWORD [MMBLOCK(0,3,edi,SIZEOF_FAST_FLOAT)], mm0
+
+    punpcklwd   mm6, mm3                ; mm6=(***8***9)
+    punpckhwd   mm3, mm3                ; mm3=(***A***B)
+    punpcklwd   mm4, mm1                ; mm4=(***C***D)
+    punpckhwd   mm1, mm1                ; mm1=(***E***F)
+
+    psrad       mm6, (DWORD_BIT-BYTE_BIT)  ; mm6=(89)
+    psrad       mm3, (DWORD_BIT-BYTE_BIT)  ; mm3=(AB)
+    pi2fd       mm6, mm6
+    pi2fd       mm3, mm3
+    psrad       mm4, (DWORD_BIT-BYTE_BIT)  ; mm4=(CD)
+    psrad       mm1, (DWORD_BIT-BYTE_BIT)  ; mm1=(EF)
+    pi2fd       mm4, mm4
+    pi2fd       mm1, mm1
+
+    movq        MMWORD [MMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], mm6
+    movq        MMWORD [MMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], mm3
+    movq        MMWORD [MMBLOCK(1,2,edi,SIZEOF_FAST_FLOAT)], mm4
+    movq        MMWORD [MMBLOCK(1,3,edi,SIZEOF_FAST_FLOAT)], mm1
+
+    add         esi, byte 2*SIZEOF_JSAMPROW
+    add         edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
+    dec         ecx
+    jnz         near .convloop
+
+    femms                               ; empty MMX/3DNow! state
+
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    pop         ebp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; GLOBAL(void)
+; jsimd_quantize_float_3dnow(JCOEFPTR coef_block, FAST_FLOAT *divisors,
+;                            FAST_FLOAT *workspace);
+;
+
+%define coef_block  ebp + 8             ; JCOEFPTR coef_block
+%define divisors    ebp + 12            ; FAST_FLOAT *divisors
+%define workspace   ebp + 16            ; FAST_FLOAT *workspace
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_quantize_float_3dnow)
+
+EXTN(jsimd_quantize_float_3dnow):
+    push        ebp
+    mov         ebp, esp
+;   push        ebx                     ; unused
+;   push        ecx                     ; unused
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    mov         eax, 0x4B400000         ; (float)0x00C00000 (rndint_magic)
+    movd        mm7, eax
+    punpckldq   mm7, mm7                ; mm7={12582912.0F 12582912.0F}
+
+    mov         esi, POINTER [workspace]
+    mov         edx, POINTER [divisors]
+    mov         edi, JCOEFPTR [coef_block]
+    mov         eax, DCTSIZE2/16
+    alignx      16, 7
+.quantloop:
+    movq        mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
+    movq        mm1, MMWORD [MMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
+    pfmul       mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
+    pfmul       mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
+    movq        mm2, MMWORD [MMBLOCK(0,2,esi,SIZEOF_FAST_FLOAT)]
+    movq        mm3, MMWORD [MMBLOCK(0,3,esi,SIZEOF_FAST_FLOAT)]
+    pfmul       mm2, MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)]
+    pfmul       mm3, MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)]
+
+    pfadd       mm0, mm7                ; mm0=(00 ** 01 **)
+    pfadd       mm1, mm7                ; mm1=(02 ** 03 **)
+    pfadd       mm2, mm7                ; mm0=(04 ** 05 **)
+    pfadd       mm3, mm7                ; mm1=(06 ** 07 **)
+
+    movq        mm4, mm0
+    punpcklwd   mm0, mm1                ; mm0=(00 02 ** **)
+    punpckhwd   mm4, mm1                ; mm4=(01 03 ** **)
+    movq        mm5, mm2
+    punpcklwd   mm2, mm3                ; mm2=(04 06 ** **)
+    punpckhwd   mm5, mm3                ; mm5=(05 07 ** **)
+
+    punpcklwd   mm0, mm4                ; mm0=(00 01 02 03)
+    punpcklwd   mm2, mm5                ; mm2=(04 05 06 07)
+
+    movq        mm6, MMWORD [MMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
+    movq        mm1, MMWORD [MMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)]
+    pfmul       mm6, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
+    pfmul       mm1, MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
+    movq        mm3, MMWORD [MMBLOCK(1,2,esi,SIZEOF_FAST_FLOAT)]
+    movq        mm4, MMWORD [MMBLOCK(1,3,esi,SIZEOF_FAST_FLOAT)]
+    pfmul       mm3, MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)]
+    pfmul       mm4, MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)]
+
+    pfadd       mm6, mm7                ; mm0=(10 ** 11 **)
+    pfadd       mm1, mm7                ; mm4=(12 ** 13 **)
+    pfadd       mm3, mm7                ; mm0=(14 ** 15 **)
+    pfadd       mm4, mm7                ; mm4=(16 ** 17 **)
+
+    movq        mm5, mm6
+    punpcklwd   mm6, mm1                ; mm6=(10 12 ** **)
+    punpckhwd   mm5, mm1                ; mm5=(11 13 ** **)
+    movq        mm1, mm3
+    punpcklwd   mm3, mm4                ; mm3=(14 16 ** **)
+    punpckhwd   mm1, mm4                ; mm1=(15 17 ** **)
+
+    punpcklwd   mm6, mm5                ; mm6=(10 11 12 13)
+    punpcklwd   mm3, mm1                ; mm3=(14 15 16 17)
+
+    movq        MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
+    movq        MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm2
+    movq        MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm6
+    movq        MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm3
+
+    add         esi, byte 16*SIZEOF_FAST_FLOAT
+    add         edx, byte 16*SIZEOF_FAST_FLOAT
+    add         edi, byte 16*SIZEOF_JCOEF
+    dec         eax
+    jnz         near .quantloop
+
+    femms                               ; empty MMX/3DNow! state
+
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; unused
+;   pop         ebx                     ; unused
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/i386/jquant-mmx.asm b/media/libjpeg/simd/i386/jquant-mmx.asm
new file mode 100644
index 0000000000..61305c625d
--- /dev/null
+++ b/media/libjpeg/simd/i386/jquant-mmx.asm
@@ -0,0 +1,276 @@
+;
+; jquant.asm - sample data conversion and quantization (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Load data into workspace, applying unsigned->signed conversion
+;
+; GLOBAL(void)
+; jsimd_convsamp_mmx(JSAMPARRAY sample_data, JDIMENSION start_col,
+;                    DCTELEM *workspace);
+;
+
+%define sample_data  ebp + 8            ; JSAMPARRAY sample_data
+%define start_col    ebp + 12           ; JDIMENSION start_col
+%define workspace    ebp + 16           ; DCTELEM *workspace
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_convsamp_mmx)
+
+EXTN(jsimd_convsamp_mmx):
+    push        ebp
+    mov         ebp, esp
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    pxor        mm6, mm6                ; mm6=(all 0's)
+    pcmpeqw     mm7, mm7
+    psllw       mm7, 7                  ; mm7={0xFF80 0xFF80 0xFF80 0xFF80}
+
+    mov         esi, JSAMPARRAY [sample_data]  ; (JSAMPROW *)
+    mov         eax, JDIMENSION [start_col]
+    mov         edi, POINTER [workspace]       ; (DCTELEM *)
+    mov         ecx, DCTSIZE/4
+    alignx      16, 7
+.convloop:
+    mov         ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+
+    movq        mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE]   ; mm0=(01234567)
+    movq        mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE]   ; mm1=(89ABCDEF)
+
+    mov         ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+
+    movq        mm2, MMWORD [ebx+eax*SIZEOF_JSAMPLE]   ; mm2=(GHIJKLMN)
+    movq        mm3, MMWORD [edx+eax*SIZEOF_JSAMPLE]   ; mm3=(OPQRSTUV)
+
+    movq        mm4, mm0
+    punpcklbw   mm0, mm6                ; mm0=(0123)
+    punpckhbw   mm4, mm6                ; mm4=(4567)
+    movq        mm5, mm1
+    punpcklbw   mm1, mm6                ; mm1=(89AB)
+    punpckhbw   mm5, mm6                ; mm5=(CDEF)
+
+    paddw       mm0, mm7
+    paddw       mm4, mm7
+    paddw       mm1, mm7
+    paddw       mm5, mm7
+
+    movq        MMWORD [MMBLOCK(0,0,edi,SIZEOF_DCTELEM)], mm0
+    movq        MMWORD [MMBLOCK(0,1,edi,SIZEOF_DCTELEM)], mm4
+    movq        MMWORD [MMBLOCK(1,0,edi,SIZEOF_DCTELEM)], mm1
+    movq        MMWORD [MMBLOCK(1,1,edi,SIZEOF_DCTELEM)], mm5
+
+    movq        mm0, mm2
+    punpcklbw   mm2, mm6                ; mm2=(GHIJ)
+    punpckhbw   mm0, mm6                ; mm0=(KLMN)
+    movq        mm4, mm3
+    punpcklbw   mm3, mm6                ; mm3=(OPQR)
+    punpckhbw   mm4, mm6                ; mm4=(STUV)
+
+    paddw       mm2, mm7
+    paddw       mm0, mm7
+    paddw       mm3, mm7
+    paddw       mm4, mm7
+
+    movq        MMWORD [MMBLOCK(2,0,edi,SIZEOF_DCTELEM)], mm2
+    movq        MMWORD [MMBLOCK(2,1,edi,SIZEOF_DCTELEM)], mm0
+    movq        MMWORD [MMBLOCK(3,0,edi,SIZEOF_DCTELEM)], mm3
+    movq        MMWORD [MMBLOCK(3,1,edi,SIZEOF_DCTELEM)], mm4
+
+    add         esi, byte 4*SIZEOF_JSAMPROW
+    add         edi, byte 4*DCTSIZE*SIZEOF_DCTELEM
+    dec         ecx
+    jnz         short .convloop
+
+    emms                                ; empty MMX state
+
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    pop         ebp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; This implementation is based on an algorithm described in
+;   "How to optimize for the Pentium family of microprocessors"
+;   (http://www.agner.org/assem/).
+;
+; GLOBAL(void)
+; jsimd_quantize_mmx(JCOEFPTR coef_block, DCTELEM *divisors,
+;                    DCTELEM *workspace);
+;
+
+%define RECIPROCAL(m, n, b) \
+  MMBLOCK(DCTSIZE * 0 + (m), (n), (b), SIZEOF_DCTELEM)
+%define CORRECTION(m, n, b) \
+  MMBLOCK(DCTSIZE * 1 + (m), (n), (b), SIZEOF_DCTELEM)
+%define SCALE(m, n, b) \
+  MMBLOCK(DCTSIZE * 2 + (m), (n), (b), SIZEOF_DCTELEM)
+%define SHIFT(m, n, b) \
+  MMBLOCK(DCTSIZE * 3 + (m), (n), (b), SIZEOF_DCTELEM)
+
+%define coef_block  ebp + 8             ; JCOEFPTR coef_block
+%define divisors    ebp + 12            ; DCTELEM *divisors
+%define workspace   ebp + 16            ; DCTELEM *workspace
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_quantize_mmx)
+
+EXTN(jsimd_quantize_mmx):
+    push        ebp
+    mov         ebp, esp
+;   push        ebx                     ; unused
+;   push        ecx                     ; unused
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    mov         esi, POINTER [workspace]
+    mov         edx, POINTER [divisors]
+    mov         edi, JCOEFPTR [coef_block]
+    mov         ah, 2
+    alignx      16, 7
+.quantloop1:
+    mov         al, DCTSIZE2/8/2
+    alignx      16, 7
+.quantloop2:
+    movq        mm2, MMWORD [MMBLOCK(0,0,esi,SIZEOF_DCTELEM)]
+    movq        mm3, MMWORD [MMBLOCK(0,1,esi,SIZEOF_DCTELEM)]
+
+    movq        mm0, mm2
+    movq        mm1, mm3
+
+    psraw       mm2, (WORD_BIT-1)       ; -1 if value < 0, 0 otherwise
+    psraw       mm3, (WORD_BIT-1)
+
+    pxor        mm0, mm2                ; val = -val
+    pxor        mm1, mm3
+    psubw       mm0, mm2
+    psubw       mm1, mm3
+
+    ;
+    ; MMX is an annoyingly crappy instruction set. It has two
+    ; misfeatures that are causing problems here:
+    ;
+    ; - All multiplications are signed.
+    ;
+    ; - The second operand for the shifts is not treated as packed.
+    ;
+    ;
+    ; We work around the first problem by implementing this algorithm:
+    ;
+    ; unsigned long unsigned_multiply(unsigned short x, unsigned short y)
+    ; {
+    ;   enum { SHORT_BIT = 16 };
+    ;   signed short sx = (signed short)x;
+    ;   signed short sy = (signed short)y;
+    ;   signed long sz;
+    ;
+    ;   sz = (long)sx * (long)sy;    /* signed multiply */
+    ;
+    ;   if (sx < 0) sz += (long)sy << SHORT_BIT;
+    ;   if (sy < 0) sz += (long)sx << SHORT_BIT;
+    ;
+    ;   return (unsigned long)sz;
+    ; }
+    ;
+    ; (note that a negative sx adds _sy_ and vice versa)
+    ;
+    ; For the second problem, we replace the shift by a multiplication.
+    ; Unfortunately that means we have to deal with the signed issue again.
+    ;
+
+    paddw       mm0, MMWORD [CORRECTION(0,0,edx)]  ; correction + roundfactor
+    paddw       mm1, MMWORD [CORRECTION(0,1,edx)]
+
+    movq        mm4, mm0                ; store current value for later
+    movq        mm5, mm1
+    pmulhw      mm0, MMWORD [RECIPROCAL(0,0,edx)]  ; reciprocal
+    pmulhw      mm1, MMWORD [RECIPROCAL(0,1,edx)]
+    paddw       mm0, mm4  ; reciprocal is always negative (MSB=1),
+    paddw       mm1, mm5  ; so we always need to add the initial value
+                          ; (input value is never negative as we
+                          ; inverted it at the start of this routine)
+
+    ; here it gets a bit tricky as both scale
+    ; and mm0/mm1 can be negative
+    movq        mm6, MMWORD [SCALE(0,0,edx)]  ; scale
+    movq        mm7, MMWORD [SCALE(0,1,edx)]
+    movq        mm4, mm0
+    movq        mm5, mm1
+    pmulhw      mm0, mm6
+    pmulhw      mm1, mm7
+
+    psraw       mm6, (WORD_BIT-1)       ; determine if scale is negative
+    psraw       mm7, (WORD_BIT-1)
+
+    pand        mm6, mm4                ; and add input if it is
+    pand        mm7, mm5
+    paddw       mm0, mm6
+    paddw       mm1, mm7
+
+    psraw       mm4, (WORD_BIT-1)       ; then check if negative input
+    psraw       mm5, (WORD_BIT-1)
+
+    pand        mm4, MMWORD [SCALE(0,0,edx)]  ; and add scale if it is
+    pand        mm5, MMWORD [SCALE(0,1,edx)]
+    paddw       mm0, mm4
+    paddw       mm1, mm5
+
+    pxor        mm0, mm2                ; val = -val
+    pxor        mm1, mm3
+    psubw       mm0, mm2
+    psubw       mm1, mm3
+
+    movq        MMWORD [MMBLOCK(0,0,edi,SIZEOF_DCTELEM)], mm0
+    movq        MMWORD [MMBLOCK(0,1,edi,SIZEOF_DCTELEM)], mm1
+
+    add         esi, byte 8*SIZEOF_DCTELEM
+    add         edx, byte 8*SIZEOF_DCTELEM
+    add         edi, byte 8*SIZEOF_JCOEF
+    dec         al
+    jnz         near .quantloop2
+    dec         ah
+    jnz         near .quantloop1        ; to avoid branch misprediction
+
+    emms                                ; empty MMX state
+
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; unused
+;   pop         ebx                     ; unused
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/i386/jquant-sse.asm b/media/libjpeg/simd/i386/jquant-sse.asm
new file mode 100644
index 0000000000..218adc976f
--- /dev/null
+++ b/media/libjpeg/simd/i386/jquant-sse.asm
@@ -0,0 +1,208 @@
+;
+; jquant.asm - sample data conversion and quantization (SSE & MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Load data into workspace, applying unsigned->signed conversion
+;
+; GLOBAL(void)
+; jsimd_convsamp_float_sse(JSAMPARRAY sample_data, JDIMENSION start_col,
+;                          FAST_FLOAT *workspace);
+;
+
+%define sample_data  ebp + 8            ; JSAMPARRAY sample_data
+%define start_col    ebp + 12           ; JDIMENSION start_col
+%define workspace    ebp + 16           ; FAST_FLOAT *workspace
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_convsamp_float_sse)
+
+EXTN(jsimd_convsamp_float_sse):
+    push        ebp
+    mov         ebp, esp
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    pcmpeqw     mm7, mm7
+    psllw       mm7, 7
+    packsswb    mm7, mm7                ; mm7 = PB_CENTERJSAMPLE (0x808080..)
+
+    mov         esi, JSAMPARRAY [sample_data]  ; (JSAMPROW *)
+    mov         eax, JDIMENSION [start_col]
+    mov         edi, POINTER [workspace]       ; (DCTELEM *)
+    mov         ecx, DCTSIZE/2
+    alignx      16, 7
+.convloop:
+    mov         ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+
+    movq        mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE]
+    movq        mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE]
+
+    psubb       mm0, mm7                ; mm0=(01234567)
+    psubb       mm1, mm7                ; mm1=(89ABCDEF)
+
+    punpcklbw   mm2, mm0                ; mm2=(*0*1*2*3)
+    punpckhbw   mm0, mm0                ; mm0=(*4*5*6*7)
+    punpcklbw   mm3, mm1                ; mm3=(*8*9*A*B)
+    punpckhbw   mm1, mm1                ; mm1=(*C*D*E*F)
+
+    punpcklwd   mm4, mm2                ; mm4=(***0***1)
+    punpckhwd   mm2, mm2                ; mm2=(***2***3)
+    punpcklwd   mm5, mm0                ; mm5=(***4***5)
+    punpckhwd   mm0, mm0                ; mm0=(***6***7)
+
+    psrad       mm4, (DWORD_BIT-BYTE_BIT)  ; mm4=(01)
+    psrad       mm2, (DWORD_BIT-BYTE_BIT)  ; mm2=(23)
+    cvtpi2ps    xmm0, mm4                  ; xmm0=(01**)
+    cvtpi2ps    xmm1, mm2                  ; xmm1=(23**)
+    psrad       mm5, (DWORD_BIT-BYTE_BIT)  ; mm5=(45)
+    psrad       mm0, (DWORD_BIT-BYTE_BIT)  ; mm0=(67)
+    cvtpi2ps    xmm2, mm5                  ; xmm2=(45**)
+    cvtpi2ps    xmm3, mm0                  ; xmm3=(67**)
+
+    punpcklwd   mm6, mm3                ; mm6=(***8***9)
+    punpckhwd   mm3, mm3                ; mm3=(***A***B)
+    punpcklwd   mm4, mm1                ; mm4=(***C***D)
+    punpckhwd   mm1, mm1                ; mm1=(***E***F)
+
+    psrad       mm6, (DWORD_BIT-BYTE_BIT)  ; mm6=(89)
+    psrad       mm3, (DWORD_BIT-BYTE_BIT)  ; mm3=(AB)
+    cvtpi2ps    xmm4, mm6                  ; xmm4=(89**)
+    cvtpi2ps    xmm5, mm3                  ; xmm5=(AB**)
+    psrad       mm4, (DWORD_BIT-BYTE_BIT)  ; mm4=(CD)
+    psrad       mm1, (DWORD_BIT-BYTE_BIT)  ; mm1=(EF)
+    cvtpi2ps    xmm6, mm4                  ; xmm6=(CD**)
+    cvtpi2ps    xmm7, mm1                  ; xmm7=(EF**)
+
+    movlhps     xmm0, xmm1              ; xmm0=(0123)
+    movlhps     xmm2, xmm3              ; xmm2=(4567)
+    movlhps     xmm4, xmm5              ; xmm4=(89AB)
+    movlhps     xmm6, xmm7              ; xmm6=(CDEF)
+
+    movaps      XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0
+    movaps      XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm2
+    movaps      XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm4
+    movaps      XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6
+
+    add         esi, byte 2*SIZEOF_JSAMPROW
+    add         edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
+    dec         ecx
+    jnz         near .convloop
+
+    emms                                ; empty MMX state
+
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    pop         ebp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; GLOBAL(void)
+; jsimd_quantize_float_sse(JCOEFPTR coef_block, FAST_FLOAT *divisors,
+;                          FAST_FLOAT *workspace);
+;
+
+%define coef_block  ebp + 8             ; JCOEFPTR coef_block
+%define divisors    ebp + 12            ; FAST_FLOAT *divisors
+%define workspace   ebp + 16            ; FAST_FLOAT *workspace
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_quantize_float_sse)
+
+EXTN(jsimd_quantize_float_sse):
+    push        ebp
+    mov         ebp, esp
+;   push        ebx                     ; unused
+;   push        ecx                     ; unused
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    mov         esi, POINTER [workspace]
+    mov         edx, POINTER [divisors]
+    mov         edi, JCOEFPTR [coef_block]
+    mov         eax, DCTSIZE2/16
+    alignx      16, 7
+.quantloop:
+    movaps      xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
+    movaps      xmm1, XMMWORD [XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
+    mulps       xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
+    mulps       xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
+    movaps      xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
+    movaps      xmm3, XMMWORD [XMMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)]
+    mulps       xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
+    mulps       xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
+
+    movhlps     xmm4, xmm0
+    movhlps     xmm5, xmm1
+
+    cvtps2pi    mm0, xmm0
+    cvtps2pi    mm1, xmm1
+    cvtps2pi    mm4, xmm4
+    cvtps2pi    mm5, xmm5
+
+    movhlps     xmm6, xmm2
+    movhlps     xmm7, xmm3
+
+    cvtps2pi    mm2, xmm2
+    cvtps2pi    mm3, xmm3
+    cvtps2pi    mm6, xmm6
+    cvtps2pi    mm7, xmm7
+
+    packssdw    mm0, mm4
+    packssdw    mm1, mm5
+    packssdw    mm2, mm6
+    packssdw    mm3, mm7
+
+    movq        MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
+    movq        MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm1
+    movq        MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm2
+    movq        MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm3
+
+    add         esi, byte 16*SIZEOF_FAST_FLOAT
+    add         edx, byte 16*SIZEOF_FAST_FLOAT
+    add         edi, byte 16*SIZEOF_JCOEF
+    dec         eax
+    jnz         short .quantloop
+
+    emms                                ; empty MMX state
+
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; unused
+;   pop         ebx                     ; unused
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/i386/jquantf-sse2.asm b/media/libjpeg/simd/i386/jquantf-sse2.asm
new file mode 100644
index 0000000000..a881ab50f9
--- /dev/null
+++ b/media/libjpeg/simd/i386/jquantf-sse2.asm
@@ -0,0 +1,168 @@
+;
+; jquantf.asm - sample data conversion and quantization (SSE & SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Load data into workspace, applying unsigned->signed conversion
+;
+; GLOBAL(void)
+; jsimd_convsamp_float_sse2(JSAMPARRAY sample_data, JDIMENSION start_col,
+;                           FAST_FLOAT *workspace);
+;
+
+%define sample_data  ebp + 8            ; JSAMPARRAY sample_data
+%define start_col    ebp + 12           ; JDIMENSION start_col
+%define workspace    ebp + 16           ; FAST_FLOAT *workspace
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_convsamp_float_sse2)
+
+EXTN(jsimd_convsamp_float_sse2):
+    push        ebp
+    mov         ebp, esp
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    pcmpeqw     xmm7, xmm7
+    psllw       xmm7, 7
+    packsswb    xmm7, xmm7              ; xmm7 = PB_CENTERJSAMPLE (0x808080..)
+
+    mov         esi, JSAMPARRAY [sample_data]  ; (JSAMPROW *)
+    mov         eax, JDIMENSION [start_col]
+    mov         edi, POINTER [workspace]       ; (DCTELEM *)
+    mov         ecx, DCTSIZE/2
+    alignx      16, 7
+.convloop:
+    mov         ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+
+    movq        xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]
+    movq        xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]
+
+    psubb       xmm0, xmm7              ; xmm0=(01234567)
+    psubb       xmm1, xmm7              ; xmm1=(89ABCDEF)
+
+    punpcklbw   xmm0, xmm0              ; xmm0=(*0*1*2*3*4*5*6*7)
+    punpcklbw   xmm1, xmm1              ; xmm1=(*8*9*A*B*C*D*E*F)
+
+    punpcklwd   xmm2, xmm0              ; xmm2=(***0***1***2***3)
+    punpckhwd   xmm0, xmm0              ; xmm0=(***4***5***6***7)
+    punpcklwd   xmm3, xmm1              ; xmm3=(***8***9***A***B)
+    punpckhwd   xmm1, xmm1              ; xmm1=(***C***D***E***F)
+
+    psrad       xmm2, (DWORD_BIT-BYTE_BIT)  ; xmm2=(0123)
+    psrad       xmm0, (DWORD_BIT-BYTE_BIT)  ; xmm0=(4567)
+    cvtdq2ps    xmm2, xmm2                  ; xmm2=(0123)
+    cvtdq2ps    xmm0, xmm0                  ; xmm0=(4567)
+    psrad       xmm3, (DWORD_BIT-BYTE_BIT)  ; xmm3=(89AB)
+    psrad       xmm1, (DWORD_BIT-BYTE_BIT)  ; xmm1=(CDEF)
+    cvtdq2ps    xmm3, xmm3                  ; xmm3=(89AB)
+    cvtdq2ps    xmm1, xmm1                  ; xmm1=(CDEF)
+
+    movaps      XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm2
+    movaps      XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0
+    movaps      XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3
+    movaps      XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1
+
+    add         esi, byte 2*SIZEOF_JSAMPROW
+    add         edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
+    dec         ecx
+    jnz         short .convloop
+
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    pop         ebp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; GLOBAL(void)
+; jsimd_quantize_float_sse2(JCOEFPTR coef_block, FAST_FLOAT *divisors,
+;                           FAST_FLOAT *workspace);
+;
+
+%define coef_block  ebp + 8             ; JCOEFPTR coef_block
+%define divisors    ebp + 12            ; FAST_FLOAT *divisors
+%define workspace   ebp + 16            ; FAST_FLOAT *workspace
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_quantize_float_sse2)
+
+EXTN(jsimd_quantize_float_sse2):
+    push        ebp
+    mov         ebp, esp
+;   push        ebx                     ; unused
+;   push        ecx                     ; unused
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    mov         esi, POINTER [workspace]
+    mov         edx, POINTER [divisors]
+    mov         edi, JCOEFPTR [coef_block]
+    mov         eax, DCTSIZE2/16
+    alignx      16, 7
+.quantloop:
+    movaps      xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
+    movaps      xmm1, XMMWORD [XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
+    mulps       xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
+    mulps       xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
+    movaps      xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
+    movaps      xmm3, XMMWORD [XMMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)]
+    mulps       xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
+    mulps       xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
+
+    cvtps2dq    xmm0, xmm0
+    cvtps2dq    xmm1, xmm1
+    cvtps2dq    xmm2, xmm2
+    cvtps2dq    xmm3, xmm3
+
+    packssdw    xmm0, xmm1
+    packssdw    xmm2, xmm3
+
+    movdqa      XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_JCOEF)], xmm0
+    movdqa      XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_JCOEF)], xmm2
+
+    add         esi, byte 16*SIZEOF_FAST_FLOAT
+    add         edx, byte 16*SIZEOF_FAST_FLOAT
+    add         edi, byte 16*SIZEOF_JCOEF
+    dec         eax
+    jnz         short .quantloop
+
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; unused
+;   pop         ebx                     ; unused
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/i386/jquanti-avx2.asm b/media/libjpeg/simd/i386/jquanti-avx2.asm
new file mode 100644
index 0000000000..5ed6bec246
--- /dev/null
+++ b/media/libjpeg/simd/i386/jquanti-avx2.asm
@@ -0,0 +1,188 @@
+;
+; jquanti.asm - sample data conversion and quantization (AVX2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, 2018, D. R. Commander.
+; Copyright (C) 2016, Matthieu Darbois.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Load data into workspace, applying unsigned->signed conversion
+;
+; GLOBAL(void)
+; jsimd_convsamp_avx2(JSAMPARRAY sample_data, JDIMENSION start_col,
+;                     DCTELEM *workspace);
+;
+
+%define sample_data  ebp + 8            ; JSAMPARRAY sample_data
+%define start_col    ebp + 12           ; JDIMENSION start_col
+%define workspace    ebp + 16           ; DCTELEM *workspace
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_convsamp_avx2)
+
+EXTN(jsimd_convsamp_avx2):
+    push        ebp
+    mov         ebp, esp
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    mov         esi, JSAMPARRAY [sample_data]  ; (JSAMPROW *)
+    mov         eax, JDIMENSION [start_col]
+    mov         edi, POINTER [workspace]       ; (DCTELEM *)
+
+    mov         ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    movq        xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]
+    movq        xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]
+
+    mov         ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    movq        xmm2, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]
+    movq        xmm3, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]
+
+    mov         ebx, JSAMPROW [esi+4*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         edx, JSAMPROW [esi+5*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    movq        xmm4, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]
+    movq        xmm5, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]
+
+    mov         ebx, JSAMPROW [esi+6*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         edx, JSAMPROW [esi+7*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    movq        xmm6, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]
+    movq        xmm7, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]
+
+    vinserti128 ymm0, ymm0, xmm1, 1
+    vinserti128 ymm2, ymm2, xmm3, 1
+    vinserti128 ymm4, ymm4, xmm5, 1
+    vinserti128 ymm6, ymm6, xmm7, 1
+
+    vpxor       ymm1, ymm1, ymm1        ; ymm1=(all 0's)
+    vpunpcklbw  ymm0, ymm0, ymm1
+    vpunpcklbw  ymm2, ymm2, ymm1
+    vpunpcklbw  ymm4, ymm4, ymm1
+    vpunpcklbw  ymm6, ymm6, ymm1
+
+    vpcmpeqw    ymm7, ymm7, ymm7
+    vpsllw      ymm7, ymm7, 7           ; ymm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+
+    vpaddw      ymm0, ymm0, ymm7
+    vpaddw      ymm2, ymm2, ymm7
+    vpaddw      ymm4, ymm4, ymm7
+    vpaddw      ymm6, ymm6, ymm7
+
+    vmovdqu     YMMWORD [YMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], ymm0
+    vmovdqu     YMMWORD [YMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], ymm2
+    vmovdqu     YMMWORD [YMMBLOCK(4,0,edi,SIZEOF_DCTELEM)], ymm4
+    vmovdqu     YMMWORD [YMMBLOCK(6,0,edi,SIZEOF_DCTELEM)], ymm6
+
+    vzeroupper
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    pop         ebp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; This implementation is based on an algorithm described in
+;   "How to optimize for the Pentium family of microprocessors"
+;   (http://www.agner.org/assem/).
+;
+; GLOBAL(void)
+; jsimd_quantize_avx2(JCOEFPTR coef_block, DCTELEM *divisors,
+;                     DCTELEM *workspace);
+;
+
+%define RECIPROCAL(m, n, b) \
+  YMMBLOCK(DCTSIZE * 0 + (m), (n), (b), SIZEOF_DCTELEM)
+%define CORRECTION(m, n, b) \
+  YMMBLOCK(DCTSIZE * 1 + (m), (n), (b), SIZEOF_DCTELEM)
+%define SCALE(m, n, b) \
+  YMMBLOCK(DCTSIZE * 2 + (m), (n), (b), SIZEOF_DCTELEM)
+
+%define coef_block  ebp + 8             ; JCOEFPTR coef_block
+%define divisors    ebp + 12            ; DCTELEM *divisors
+%define workspace   ebp + 16            ; DCTELEM *workspace
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_quantize_avx2)
+
+EXTN(jsimd_quantize_avx2):
+    push        ebp
+    mov         ebp, esp
+;   push        ebx                     ; unused
+;   push        ecx                     ; unused
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    mov         esi, POINTER [workspace]
+    mov         edx, POINTER [divisors]
+    mov         edi, JCOEFPTR [coef_block]
+
+    vmovdqu     ymm4, [YMMBLOCK(0,0,esi,SIZEOF_DCTELEM)]
+    vmovdqu     ymm5, [YMMBLOCK(2,0,esi,SIZEOF_DCTELEM)]
+    vmovdqu     ymm6, [YMMBLOCK(4,0,esi,SIZEOF_DCTELEM)]
+    vmovdqu     ymm7, [YMMBLOCK(6,0,esi,SIZEOF_DCTELEM)]
+    vpabsw      ymm0, ymm4
+    vpabsw      ymm1, ymm5
+    vpabsw      ymm2, ymm6
+    vpabsw      ymm3, ymm7
+
+    vpaddw      ymm0, YMMWORD [CORRECTION(0,0,edx)]  ; correction + roundfactor
+    vpaddw      ymm1, YMMWORD [CORRECTION(2,0,edx)]
+    vpaddw      ymm2, YMMWORD [CORRECTION(4,0,edx)]
+    vpaddw      ymm3, YMMWORD [CORRECTION(6,0,edx)]
+    vpmulhuw    ymm0, YMMWORD [RECIPROCAL(0,0,edx)]  ; reciprocal
+    vpmulhuw    ymm1, YMMWORD [RECIPROCAL(2,0,edx)]
+    vpmulhuw    ymm2, YMMWORD [RECIPROCAL(4,0,edx)]
+    vpmulhuw    ymm3, YMMWORD [RECIPROCAL(6,0,edx)]
+    vpmulhuw    ymm0, YMMWORD [SCALE(0,0,edx)]       ; scale
+    vpmulhuw    ymm1, YMMWORD [SCALE(2,0,edx)]
+    vpmulhuw    ymm2, YMMWORD [SCALE(4,0,edx)]
+    vpmulhuw    ymm3, YMMWORD [SCALE(6,0,edx)]
+
+    vpsignw     ymm0, ymm0, ymm4
+    vpsignw     ymm1, ymm1, ymm5
+    vpsignw     ymm2, ymm2, ymm6
+    vpsignw     ymm3, ymm3, ymm7
+
+    vmovdqu     [YMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], ymm0
+    vmovdqu     [YMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], ymm1
+    vmovdqu     [YMMBLOCK(4,0,edi,SIZEOF_DCTELEM)], ymm2
+    vmovdqu     [YMMBLOCK(6,0,edi,SIZEOF_DCTELEM)], ymm3
+
+    vzeroupper
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; unused
+;   pop         ebx                     ; unused
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/i386/jquanti-sse2.asm b/media/libjpeg/simd/i386/jquanti-sse2.asm
new file mode 100644
index 0000000000..0a509408aa
--- /dev/null
+++ b/media/libjpeg/simd/i386/jquanti-sse2.asm
@@ -0,0 +1,201 @@
+;
+; jquanti.asm - sample data conversion and quantization (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Load data into workspace, applying unsigned->signed conversion
+;
+; GLOBAL(void)
+; jsimd_convsamp_sse2(JSAMPARRAY sample_data, JDIMENSION start_col,
+;                     DCTELEM *workspace);
+;
+
+%define sample_data  ebp + 8            ; JSAMPARRAY sample_data
+%define start_col    ebp + 12           ; JDIMENSION start_col
+%define workspace    ebp + 16           ; DCTELEM *workspace
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_convsamp_sse2)
+
+EXTN(jsimd_convsamp_sse2):
+    push        ebp
+    mov         ebp, esp
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    pxor        xmm6, xmm6              ; xmm6=(all 0's)
+    pcmpeqw     xmm7, xmm7
+    psllw       xmm7, 7                 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+
+    mov         esi, JSAMPARRAY [sample_data]  ; (JSAMPROW *)
+    mov         eax, JDIMENSION [start_col]
+    mov         edi, POINTER [workspace]       ; (DCTELEM *)
+    mov         ecx, DCTSIZE/4
+    alignx      16, 7
+.convloop:
+    mov         ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+
+    movq        xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]  ; xmm0=(01234567)
+    movq        xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]  ; xmm1=(89ABCDEF)
+
+    mov         ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+
+    movq        xmm2, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]  ; xmm2=(GHIJKLMN)
+    movq        xmm3, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]  ; xmm3=(OPQRSTUV)
+
+    punpcklbw   xmm0, xmm6              ; xmm0=(01234567)
+    punpcklbw   xmm1, xmm6              ; xmm1=(89ABCDEF)
+    paddw       xmm0, xmm7
+    paddw       xmm1, xmm7
+    punpcklbw   xmm2, xmm6              ; xmm2=(GHIJKLMN)
+    punpcklbw   xmm3, xmm6              ; xmm3=(OPQRSTUV)
+    paddw       xmm2, xmm7
+    paddw       xmm3, xmm7
+
+    movdqa      XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], xmm0
+    movdqa      XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_DCTELEM)], xmm1
+    movdqa      XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], xmm2
+    movdqa      XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_DCTELEM)], xmm3
+
+    add         esi, byte 4*SIZEOF_JSAMPROW
+    add         edi, byte 4*DCTSIZE*SIZEOF_DCTELEM
+    dec         ecx
+    jnz         short .convloop
+
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    pop         ebp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; This implementation is based on an algorithm described in
+;   "How to optimize for the Pentium family of microprocessors"
+;   (http://www.agner.org/assem/).
+;
+; GLOBAL(void)
+; jsimd_quantize_sse2(JCOEFPTR coef_block, DCTELEM *divisors,
+;                     DCTELEM *workspace);
+;
+
+%define RECIPROCAL(m, n, b) \
+  XMMBLOCK(DCTSIZE * 0 + (m), (n), (b), SIZEOF_DCTELEM)
+%define CORRECTION(m, n, b) \
+  XMMBLOCK(DCTSIZE * 1 + (m), (n), (b), SIZEOF_DCTELEM)
+%define SCALE(m, n, b) \
+  XMMBLOCK(DCTSIZE * 2 + (m), (n), (b), SIZEOF_DCTELEM)
+
+%define coef_block  ebp + 8             ; JCOEFPTR coef_block
+%define divisors    ebp + 12            ; DCTELEM *divisors
+%define workspace   ebp + 16            ; DCTELEM *workspace
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_quantize_sse2)
+
+EXTN(jsimd_quantize_sse2):
+    push        ebp
+    mov         ebp, esp
+;   push        ebx                     ; unused
+;   push        ecx                     ; unused
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    mov         esi, POINTER [workspace]
+    mov         edx, POINTER [divisors]
+    mov         edi, JCOEFPTR [coef_block]
+    mov         eax, DCTSIZE2/32
+    alignx      16, 7
+.quantloop:
+    movdqa      xmm4, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_DCTELEM)]
+    movdqa      xmm5, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_DCTELEM)]
+    movdqa      xmm6, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_DCTELEM)]
+    movdqa      xmm7, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_DCTELEM)]
+    movdqa      xmm0, xmm4
+    movdqa      xmm1, xmm5
+    movdqa      xmm2, xmm6
+    movdqa      xmm3, xmm7
+    psraw       xmm4, (WORD_BIT-1)
+    psraw       xmm5, (WORD_BIT-1)
+    psraw       xmm6, (WORD_BIT-1)
+    psraw       xmm7, (WORD_BIT-1)
+    pxor        xmm0, xmm4
+    pxor        xmm1, xmm5
+    pxor        xmm2, xmm6
+    pxor        xmm3, xmm7
+    psubw       xmm0, xmm4              ; if (xmm0 < 0) xmm0 = -xmm0;
+    psubw       xmm1, xmm5              ; if (xmm1 < 0) xmm1 = -xmm1;
+    psubw       xmm2, xmm6              ; if (xmm2 < 0) xmm2 = -xmm2;
+    psubw       xmm3, xmm7              ; if (xmm3 < 0) xmm3 = -xmm3;
+
+    paddw       xmm0, XMMWORD [CORRECTION(0,0,edx)]  ; correction + roundfactor
+    paddw       xmm1, XMMWORD [CORRECTION(1,0,edx)]
+    paddw       xmm2, XMMWORD [CORRECTION(2,0,edx)]
+    paddw       xmm3, XMMWORD [CORRECTION(3,0,edx)]
+    pmulhuw     xmm0, XMMWORD [RECIPROCAL(0,0,edx)]  ; reciprocal
+    pmulhuw     xmm1, XMMWORD [RECIPROCAL(1,0,edx)]
+    pmulhuw     xmm2, XMMWORD [RECIPROCAL(2,0,edx)]
+    pmulhuw     xmm3, XMMWORD [RECIPROCAL(3,0,edx)]
+    pmulhuw     xmm0, XMMWORD [SCALE(0,0,edx)]       ; scale
+    pmulhuw     xmm1, XMMWORD [SCALE(1,0,edx)]
+    pmulhuw     xmm2, XMMWORD [SCALE(2,0,edx)]
+    pmulhuw     xmm3, XMMWORD [SCALE(3,0,edx)]
+
+    pxor        xmm0, xmm4
+    pxor        xmm1, xmm5
+    pxor        xmm2, xmm6
+    pxor        xmm3, xmm7
+    psubw       xmm0, xmm4
+    psubw       xmm1, xmm5
+    psubw       xmm2, xmm6
+    psubw       xmm3, xmm7
+    movdqa      XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], xmm0
+    movdqa      XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_DCTELEM)], xmm1
+    movdqa      XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], xmm2
+    movdqa      XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_DCTELEM)], xmm3
+
+    add         esi, byte 32*SIZEOF_DCTELEM
+    add         edx, byte 32*SIZEOF_DCTELEM
+    add         edi, byte 32*SIZEOF_JCOEF
+    dec         eax
+    jnz         near .quantloop
+
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; unused
+;   pop         ebx                     ; unused
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/i386/jsimd.c b/media/libjpeg/simd/i386/jsimd.c
new file mode 100644
index 0000000000..b429b0a532
--- /dev/null
+++ b/media/libjpeg/simd/i386/jsimd.c
@@ -0,0 +1,1312 @@
+/*
+ * jsimd_i386.c
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2009-2011, 2013-2014, 2016, 2018, 2022-2023, D. R. Commander.
+ * Copyright (C) 2015-2016, 2018, 2022, Matthieu Darbois.
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library,
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * For conditions of distribution and use, see copyright notice in jsimdext.inc
+ *
+ * This file contains the interface between the "normal" portions
+ * of the library and the SIMD implementations when running on a
+ * 32-bit x86 architecture.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+
+/*
+ * In the PIC cases, we have no guarantee that constants will keep
+ * their alignment. This macro allows us to verify it at runtime.
+ */
+#define IS_ALIGNED(ptr, order)  (((unsigned)ptr & ((1 << order) - 1)) == 0)
+
+#define IS_ALIGNED_SSE(ptr)  (IS_ALIGNED(ptr, 4)) /* 16 byte alignment */
+#define IS_ALIGNED_AVX(ptr)  (IS_ALIGNED(ptr, 5)) /* 32 byte alignment */
+
+static THREAD_LOCAL unsigned int simd_support = (unsigned int)(~0);
+static THREAD_LOCAL unsigned int simd_huffman = 1;
+
+/*
+ * Check what SIMD accelerations are supported.
+ */
+LOCAL(void)
+init_simd(void)
+{
+#ifndef NO_GETENV
+  char env[2] = { 0 };
+#endif
+
+  if (simd_support != ~0U)
+    return;
+
+  simd_support = jpeg_simd_cpu_support();
+
+#ifndef NO_GETENV
+  /* Force different settings through environment variables */
+  if (!GETENV_S(env, 2, "JSIMD_FORCEMMX") && !strcmp(env, "1"))
+    simd_support &= JSIMD_MMX;
+  if (!GETENV_S(env, 2, "JSIMD_FORCE3DNOW") && !strcmp(env, "1"))
+    simd_support &= JSIMD_3DNOW | JSIMD_MMX;
+  if (!GETENV_S(env, 2, "JSIMD_FORCESSE") && !strcmp(env, "1"))
+    simd_support &= JSIMD_SSE | JSIMD_MMX;
+  if (!GETENV_S(env, 2, "JSIMD_FORCESSE2") && !strcmp(env, "1"))
+    simd_support &= JSIMD_SSE2;
+  if (!GETENV_S(env, 2, "JSIMD_FORCEAVX2") && !strcmp(env, "1"))
+    simd_support &= JSIMD_AVX2;
+  if (!GETENV_S(env, 2, "JSIMD_FORCENONE") && !strcmp(env, "1"))
+    simd_support = 0;
+  if (!GETENV_S(env, 2, "JSIMD_NOHUFFENC") && !strcmp(env, "1"))
+    simd_huffman = 0;
+#endif
+}
+
+GLOBAL(int)
+jsimd_can_rgb_ycc(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+
+  if ((simd_support & JSIMD_AVX2) &&
+      IS_ALIGNED_AVX(jconst_rgb_ycc_convert_avx2))
+    return 1;
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_rgb_ycc_convert_sse2))
+    return 1;
+  if (simd_support & JSIMD_MMX)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_rgb_gray(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+
+  if ((simd_support & JSIMD_AVX2) &&
+      IS_ALIGNED_AVX(jconst_rgb_gray_convert_avx2))
+    return 1;
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_rgb_gray_convert_sse2))
+    return 1;
+  if (simd_support & JSIMD_MMX)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+
+  if ((simd_support & JSIMD_AVX2) &&
+      IS_ALIGNED_AVX(jconst_ycc_rgb_convert_avx2))
+    return 1;
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_ycc_rgb_convert_sse2))
+    return 1;
+  if (simd_support & JSIMD_MMX)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb565(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                      JSAMPIMAGE output_buf, JDIMENSION output_row,
+                      int num_rows)
+{
+  void (*avx2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+  void (*sse2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+  void (*mmxfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+  if (simd_support == ~0U)
+    init_simd();
+
+  switch (cinfo->in_color_space) {
+  case JCS_EXT_RGB:
+    avx2fct = jsimd_extrgb_ycc_convert_avx2;
+    sse2fct = jsimd_extrgb_ycc_convert_sse2;
+    mmxfct = jsimd_extrgb_ycc_convert_mmx;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    avx2fct = jsimd_extrgbx_ycc_convert_avx2;
+    sse2fct = jsimd_extrgbx_ycc_convert_sse2;
+    mmxfct = jsimd_extrgbx_ycc_convert_mmx;
+    break;
+  case JCS_EXT_BGR:
+    avx2fct = jsimd_extbgr_ycc_convert_avx2;
+    sse2fct = jsimd_extbgr_ycc_convert_sse2;
+    mmxfct = jsimd_extbgr_ycc_convert_mmx;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    avx2fct = jsimd_extbgrx_ycc_convert_avx2;
+    sse2fct = jsimd_extbgrx_ycc_convert_sse2;
+    mmxfct = jsimd_extbgrx_ycc_convert_mmx;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    avx2fct = jsimd_extxbgr_ycc_convert_avx2;
+    sse2fct = jsimd_extxbgr_ycc_convert_sse2;
+    mmxfct = jsimd_extxbgr_ycc_convert_mmx;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    avx2fct = jsimd_extxrgb_ycc_convert_avx2;
+    sse2fct = jsimd_extxrgb_ycc_convert_sse2;
+    mmxfct = jsimd_extxrgb_ycc_convert_mmx;
+    break;
+  default:
+    avx2fct = jsimd_rgb_ycc_convert_avx2;
+    sse2fct = jsimd_rgb_ycc_convert_sse2;
+    mmxfct = jsimd_rgb_ycc_convert_mmx;
+    break;
+  }
+
+  if (simd_support & JSIMD_AVX2)
+    avx2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+  else if (simd_support & JSIMD_SSE2)
+    sse2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+  else
+    mmxfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
+
+GLOBAL(void)
+jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                       JSAMPIMAGE output_buf, JDIMENSION output_row,
+                       int num_rows)
+{
+  void (*avx2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+  void (*sse2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+  void (*mmxfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+  if (simd_support == ~0U)
+    init_simd();
+
+  switch (cinfo->in_color_space) {
+  case JCS_EXT_RGB:
+    avx2fct = jsimd_extrgb_gray_convert_avx2;
+    sse2fct = jsimd_extrgb_gray_convert_sse2;
+    mmxfct = jsimd_extrgb_gray_convert_mmx;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    avx2fct = jsimd_extrgbx_gray_convert_avx2;
+    sse2fct = jsimd_extrgbx_gray_convert_sse2;
+    mmxfct = jsimd_extrgbx_gray_convert_mmx;
+    break;
+  case JCS_EXT_BGR:
+    avx2fct = jsimd_extbgr_gray_convert_avx2;
+    sse2fct = jsimd_extbgr_gray_convert_sse2;
+    mmxfct = jsimd_extbgr_gray_convert_mmx;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    avx2fct = jsimd_extbgrx_gray_convert_avx2;
+    sse2fct = jsimd_extbgrx_gray_convert_sse2;
+    mmxfct = jsimd_extbgrx_gray_convert_mmx;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    avx2fct = jsimd_extxbgr_gray_convert_avx2;
+    sse2fct = jsimd_extxbgr_gray_convert_sse2;
+    mmxfct = jsimd_extxbgr_gray_convert_mmx;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    avx2fct = jsimd_extxrgb_gray_convert_avx2;
+    sse2fct = jsimd_extxrgb_gray_convert_sse2;
+    mmxfct = jsimd_extxrgb_gray_convert_mmx;
+    break;
+  default:
+    avx2fct = jsimd_rgb_gray_convert_avx2;
+    sse2fct = jsimd_rgb_gray_convert_sse2;
+    mmxfct = jsimd_rgb_gray_convert_mmx;
+    break;
+  }
+
+  if (simd_support & JSIMD_AVX2)
+    avx2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+  else if (simd_support & JSIMD_SSE2)
+    sse2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+  else
+    mmxfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                      JDIMENSION input_row, JSAMPARRAY output_buf,
+                      int num_rows)
+{
+  void (*avx2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
+  void (*sse2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
+  void (*mmxfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
+
+  if (simd_support == ~0U)
+    init_simd();
+
+  switch (cinfo->out_color_space) {
+  case JCS_EXT_RGB:
+    avx2fct = jsimd_ycc_extrgb_convert_avx2;
+    sse2fct = jsimd_ycc_extrgb_convert_sse2;
+    mmxfct = jsimd_ycc_extrgb_convert_mmx;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    avx2fct = jsimd_ycc_extrgbx_convert_avx2;
+    sse2fct = jsimd_ycc_extrgbx_convert_sse2;
+    mmxfct = jsimd_ycc_extrgbx_convert_mmx;
+    break;
+  case JCS_EXT_BGR:
+    avx2fct = jsimd_ycc_extbgr_convert_avx2;
+    sse2fct = jsimd_ycc_extbgr_convert_sse2;
+    mmxfct = jsimd_ycc_extbgr_convert_mmx;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    avx2fct = jsimd_ycc_extbgrx_convert_avx2;
+    sse2fct = jsimd_ycc_extbgrx_convert_sse2;
+    mmxfct = jsimd_ycc_extbgrx_convert_mmx;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    avx2fct = jsimd_ycc_extxbgr_convert_avx2;
+    sse2fct = jsimd_ycc_extxbgr_convert_sse2;
+    mmxfct = jsimd_ycc_extxbgr_convert_mmx;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    avx2fct = jsimd_ycc_extxrgb_convert_avx2;
+    sse2fct = jsimd_ycc_extxrgb_convert_sse2;
+    mmxfct = jsimd_ycc_extxrgb_convert_mmx;
+    break;
+  default:
+    avx2fct = jsimd_ycc_rgb_convert_avx2;
+    sse2fct = jsimd_ycc_rgb_convert_sse2;
+    mmxfct = jsimd_ycc_rgb_convert_mmx;
+    break;
+  }
+
+  if (simd_support & JSIMD_AVX2)
+    avx2fct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
+  else if (simd_support & JSIMD_SSE2)
+    sse2fct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
+  else
+    mmxfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                         JDIMENSION input_row, JSAMPARRAY output_buf,
+                         int num_rows)
+{
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_downsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_AVX2)
+    return 1;
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+  if (simd_support & JSIMD_MMX)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_downsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_AVX2)
+    return 1;
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+  if (simd_support & JSIMD_MMX)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+                      JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  if (simd_support == ~0U)
+    init_simd();
+
+  if (simd_support & JSIMD_AVX2)
+    jsimd_h2v2_downsample_avx2(cinfo->image_width, cinfo->max_v_samp_factor,
+                               compptr->v_samp_factor,
+                               compptr->width_in_blocks, input_data,
+                               output_data);
+  else if (simd_support & JSIMD_SSE2)
+    jsimd_h2v2_downsample_sse2(cinfo->image_width, cinfo->max_v_samp_factor,
+                               compptr->v_samp_factor,
+                               compptr->width_in_blocks, input_data,
+                               output_data);
+  else
+    jsimd_h2v2_downsample_mmx(cinfo->image_width, cinfo->max_v_samp_factor,
+                              compptr->v_samp_factor, compptr->width_in_blocks,
+                              input_data, output_data);
+}
+
+GLOBAL(void)
+jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+                      JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  if (simd_support == ~0U)
+    init_simd();
+
+  if (simd_support & JSIMD_AVX2)
+    jsimd_h2v1_downsample_avx2(cinfo->image_width, cinfo->max_v_samp_factor,
+                               compptr->v_samp_factor,
+                               compptr->width_in_blocks, input_data,
+                               output_data);
+  else if (simd_support & JSIMD_SSE2)
+    jsimd_h2v1_downsample_sse2(cinfo->image_width, cinfo->max_v_samp_factor,
+                               compptr->v_samp_factor,
+                               compptr->width_in_blocks, input_data,
+                               output_data);
+  else
+    jsimd_h2v1_downsample_mmx(cinfo->image_width, cinfo->max_v_samp_factor,
+                              compptr->v_samp_factor, compptr->width_in_blocks,
+                              input_data, output_data);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_AVX2)
+    return 1;
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+  if (simd_support & JSIMD_MMX)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_AVX2)
+    return 1;
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+  if (simd_support & JSIMD_MMX)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                    JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  if (simd_support == ~0U)
+    init_simd();
+
+  if (simd_support & JSIMD_AVX2)
+    jsimd_h2v2_upsample_avx2(cinfo->max_v_samp_factor, cinfo->output_width,
+                             input_data, output_data_ptr);
+  else if (simd_support & JSIMD_SSE2)
+    jsimd_h2v2_upsample_sse2(cinfo->max_v_samp_factor, cinfo->output_width,
+                             input_data, output_data_ptr);
+  else
+    jsimd_h2v2_upsample_mmx(cinfo->max_v_samp_factor, cinfo->output_width,
+                            input_data, output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                    JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  if (simd_support == ~0U)
+    init_simd();
+
+  if (simd_support & JSIMD_AVX2)
+    jsimd_h2v1_upsample_avx2(cinfo->max_v_samp_factor, cinfo->output_width,
+                             input_data, output_data_ptr);
+  else if (simd_support & JSIMD_SSE2)
+    jsimd_h2v1_upsample_sse2(cinfo->max_v_samp_factor, cinfo->output_width,
+                             input_data, output_data_ptr);
+  else
+    jsimd_h2v1_upsample_mmx(cinfo->max_v_samp_factor, cinfo->output_width,
+                            input_data, output_data_ptr);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_fancy_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if ((simd_support & JSIMD_AVX2) &&
+      IS_ALIGNED_AVX(jconst_fancy_upsample_avx2))
+    return 1;
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_fancy_upsample_sse2))
+    return 1;
+  if (simd_support & JSIMD_MMX)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_fancy_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if ((simd_support & JSIMD_AVX2) &&
+      IS_ALIGNED_AVX(jconst_fancy_upsample_avx2))
+    return 1;
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_fancy_upsample_sse2))
+    return 1;
+  if (simd_support & JSIMD_MMX)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  if (simd_support == ~0U)
+    init_simd();
+
+  if (simd_support & JSIMD_AVX2)
+    jsimd_h2v2_fancy_upsample_avx2(cinfo->max_v_samp_factor,
+                                   compptr->downsampled_width, input_data,
+                                   output_data_ptr);
+  else if (simd_support & JSIMD_SSE2)
+    jsimd_h2v2_fancy_upsample_sse2(cinfo->max_v_samp_factor,
+                                   compptr->downsampled_width, input_data,
+                                   output_data_ptr);
+  else
+    jsimd_h2v2_fancy_upsample_mmx(cinfo->max_v_samp_factor,
+                                  compptr->downsampled_width, input_data,
+                                  output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  if (simd_support == ~0U)
+    init_simd();
+
+  if (simd_support & JSIMD_AVX2)
+    jsimd_h2v1_fancy_upsample_avx2(cinfo->max_v_samp_factor,
+                                   compptr->downsampled_width, input_data,
+                                   output_data_ptr);
+  else if (simd_support & JSIMD_SSE2)
+    jsimd_h2v1_fancy_upsample_sse2(cinfo->max_v_samp_factor,
+                                   compptr->downsampled_width, input_data,
+                                   output_data_ptr);
+  else
+    jsimd_h2v1_fancy_upsample_mmx(cinfo->max_v_samp_factor,
+                                  compptr->downsampled_width, input_data,
+                                  output_data_ptr);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_merged_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if ((simd_support & JSIMD_AVX2) &&
+      IS_ALIGNED_AVX(jconst_merged_upsample_avx2))
+    return 1;
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_merged_upsample_sse2))
+    return 1;
+  if (simd_support & JSIMD_MMX)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_merged_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if ((simd_support & JSIMD_AVX2) &&
+      IS_ALIGNED_AVX(jconst_merged_upsample_avx2))
+    return 1;
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_merged_upsample_sse2))
+    return 1;
+  if (simd_support & JSIMD_MMX)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                           JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+  void (*avx2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+  void (*sse2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+  void (*mmxfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+
+  if (simd_support == ~0U)
+    init_simd();
+
+  switch (cinfo->out_color_space) {
+  case JCS_EXT_RGB:
+    avx2fct = jsimd_h2v2_extrgb_merged_upsample_avx2;
+    sse2fct = jsimd_h2v2_extrgb_merged_upsample_sse2;
+    mmxfct = jsimd_h2v2_extrgb_merged_upsample_mmx;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    avx2fct = jsimd_h2v2_extrgbx_merged_upsample_avx2;
+    sse2fct = jsimd_h2v2_extrgbx_merged_upsample_sse2;
+    mmxfct = jsimd_h2v2_extrgbx_merged_upsample_mmx;
+    break;
+  case JCS_EXT_BGR:
+    avx2fct = jsimd_h2v2_extbgr_merged_upsample_avx2;
+    sse2fct = jsimd_h2v2_extbgr_merged_upsample_sse2;
+    mmxfct = jsimd_h2v2_extbgr_merged_upsample_mmx;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    avx2fct = jsimd_h2v2_extbgrx_merged_upsample_avx2;
+    sse2fct = jsimd_h2v2_extbgrx_merged_upsample_sse2;
+    mmxfct = jsimd_h2v2_extbgrx_merged_upsample_mmx;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    avx2fct = jsimd_h2v2_extxbgr_merged_upsample_avx2;
+    sse2fct = jsimd_h2v2_extxbgr_merged_upsample_sse2;
+    mmxfct = jsimd_h2v2_extxbgr_merged_upsample_mmx;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    avx2fct = jsimd_h2v2_extxrgb_merged_upsample_avx2;
+    sse2fct = jsimd_h2v2_extxrgb_merged_upsample_sse2;
+    mmxfct = jsimd_h2v2_extxrgb_merged_upsample_mmx;
+    break;
+  default:
+    avx2fct = jsimd_h2v2_merged_upsample_avx2;
+    sse2fct = jsimd_h2v2_merged_upsample_sse2;
+    mmxfct = jsimd_h2v2_merged_upsample_mmx;
+    break;
+  }
+
+  if (simd_support & JSIMD_AVX2)
+    avx2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+  else if (simd_support & JSIMD_SSE2)
+    sse2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+  else
+    mmxfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+}
+
+GLOBAL(void)
+jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                           JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+  void (*avx2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+  void (*sse2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+  void (*mmxfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+
+  if (simd_support == ~0U)
+    init_simd();
+
+  switch (cinfo->out_color_space) {
+  case JCS_EXT_RGB:
+    avx2fct = jsimd_h2v1_extrgb_merged_upsample_avx2;
+    sse2fct = jsimd_h2v1_extrgb_merged_upsample_sse2;
+    mmxfct = jsimd_h2v1_extrgb_merged_upsample_mmx;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    avx2fct = jsimd_h2v1_extrgbx_merged_upsample_avx2;
+    sse2fct = jsimd_h2v1_extrgbx_merged_upsample_sse2;
+    mmxfct = jsimd_h2v1_extrgbx_merged_upsample_mmx;
+    break;
+  case JCS_EXT_BGR:
+    avx2fct = jsimd_h2v1_extbgr_merged_upsample_avx2;
+    sse2fct = jsimd_h2v1_extbgr_merged_upsample_sse2;
+    mmxfct = jsimd_h2v1_extbgr_merged_upsample_mmx;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    avx2fct = jsimd_h2v1_extbgrx_merged_upsample_avx2;
+    sse2fct = jsimd_h2v1_extbgrx_merged_upsample_sse2;
+    mmxfct = jsimd_h2v1_extbgrx_merged_upsample_mmx;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    avx2fct = jsimd_h2v1_extxbgr_merged_upsample_avx2;
+    sse2fct = jsimd_h2v1_extxbgr_merged_upsample_sse2;
+    mmxfct = jsimd_h2v1_extxbgr_merged_upsample_mmx;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    avx2fct = jsimd_h2v1_extxrgb_merged_upsample_avx2;
+    sse2fct = jsimd_h2v1_extxrgb_merged_upsample_sse2;
+    mmxfct = jsimd_h2v1_extxrgb_merged_upsample_mmx;
+    break;
+  default:
+    avx2fct = jsimd_h2v1_merged_upsample_avx2;
+    sse2fct = jsimd_h2v1_merged_upsample_sse2;
+    mmxfct = jsimd_h2v1_merged_upsample_mmx;
+    break;
+  }
+
+  if (simd_support & JSIMD_AVX2)
+    avx2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+  else if (simd_support & JSIMD_SSE2)
+    sse2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+  else
+    mmxfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+}
+
+GLOBAL(int)
+jsimd_can_convsamp(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_AVX2)
+    return 1;
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+  if (simd_support & JSIMD_MMX)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_convsamp_float(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(FAST_FLOAT) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+  if (simd_support & JSIMD_SSE)
+    return 1;
+  if (simd_support & JSIMD_3DNOW)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_convsamp(JSAMPARRAY sample_data, JDIMENSION start_col,
+               DCTELEM *workspace)
+{
+  if (simd_support == ~0U)
+    init_simd();
+
+  if (simd_support & JSIMD_AVX2)
+    jsimd_convsamp_avx2(sample_data, start_col, workspace);
+  else if (simd_support & JSIMD_SSE2)
+    jsimd_convsamp_sse2(sample_data, start_col, workspace);
+  else
+    jsimd_convsamp_mmx(sample_data, start_col, workspace);
+}
+
+GLOBAL(void)
+jsimd_convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col,
+                     FAST_FLOAT *workspace)
+{
+  if (simd_support == ~0U)
+    init_simd();
+
+  if (simd_support & JSIMD_SSE2)
+    jsimd_convsamp_float_sse2(sample_data, start_col, workspace);
+  else if (simd_support & JSIMD_SSE)
+    jsimd_convsamp_float_sse(sample_data, start_col, workspace);
+  else
+    jsimd_convsamp_float_3dnow(sample_data, start_col, workspace);
+}
+
+GLOBAL(int)
+jsimd_can_fdct_islow(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if ((simd_support & JSIMD_AVX2) && IS_ALIGNED_AVX(jconst_fdct_islow_avx2))
+    return 1;
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_islow_sse2))
+    return 1;
+  if (simd_support & JSIMD_MMX)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_ifast(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_ifast_sse2))
+    return 1;
+  if (simd_support & JSIMD_MMX)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_float(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(FAST_FLOAT) != 4)
+    return 0;
+
+  if ((simd_support & JSIMD_SSE) && IS_ALIGNED_SSE(jconst_fdct_float_sse))
+    return 1;
+  if (simd_support & JSIMD_3DNOW)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_fdct_islow(DCTELEM *data)
+{
+  if (simd_support == ~0U)
+    init_simd();
+
+  if (simd_support & JSIMD_AVX2)
+    jsimd_fdct_islow_avx2(data);
+  else if (simd_support & JSIMD_SSE2)
+    jsimd_fdct_islow_sse2(data);
+  else
+    jsimd_fdct_islow_mmx(data);
+}
+
+GLOBAL(void)
+jsimd_fdct_ifast(DCTELEM *data)
+{
+  if (simd_support == ~0U)
+    init_simd();
+
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_islow_sse2))
+    jsimd_fdct_ifast_sse2(data);
+  else
+    jsimd_fdct_ifast_mmx(data);
+}
+
+GLOBAL(void)
+jsimd_fdct_float(FAST_FLOAT *data)
+{
+  if (simd_support == ~0U)
+    init_simd();
+
+  if ((simd_support & JSIMD_SSE) && IS_ALIGNED_SSE(jconst_fdct_float_sse))
+    jsimd_fdct_float_sse(data);
+  else if (simd_support & JSIMD_3DNOW)
+    jsimd_fdct_float_3dnow(data);
+}
+
+GLOBAL(int)
+jsimd_can_quantize(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_AVX2)
+    return 1;
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+  if (simd_support & JSIMD_MMX)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_quantize_float(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (sizeof(FAST_FLOAT) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+  if (simd_support & JSIMD_SSE)
+    return 1;
+  if (simd_support & JSIMD_3DNOW)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace)
+{
+  if (simd_support == ~0U)
+    init_simd();
+
+  if (simd_support & JSIMD_AVX2)
+    jsimd_quantize_avx2(coef_block, divisors, workspace);
+  else if (simd_support & JSIMD_SSE2)
+    jsimd_quantize_sse2(coef_block, divisors, workspace);
+  else
+    jsimd_quantize_mmx(coef_block, divisors, workspace);
+}
+
+GLOBAL(void)
+jsimd_quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors,
+                     FAST_FLOAT *workspace)
+{
+  if (simd_support == ~0U)
+    init_simd();
+
+  if (simd_support & JSIMD_SSE2)
+    jsimd_quantize_float_sse2(coef_block, divisors, workspace);
+  else if (simd_support & JSIMD_SSE)
+    jsimd_quantize_float_sse(coef_block, divisors, workspace);
+  else
+    jsimd_quantize_float_3dnow(coef_block, divisors, workspace);
+}
+
+GLOBAL(int)
+jsimd_can_idct_2x2(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2))
+    return 1;
+  if (simd_support & JSIMD_MMX)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_4x4(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2))
+    return 1;
+  if (simd_support & JSIMD_MMX)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_idct_2x2(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+               JCOEFPTR coef_block, JSAMPARRAY output_buf,
+               JDIMENSION output_col)
+{
+  if (simd_support == ~0U)
+    init_simd();
+
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2))
+    jsimd_idct_2x2_sse2(compptr->dct_table, coef_block, output_buf,
+                        output_col);
+  else
+    jsimd_idct_2x2_mmx(compptr->dct_table, coef_block, output_buf, output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_4x4(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+               JCOEFPTR coef_block, JSAMPARRAY output_buf,
+               JDIMENSION output_col)
+{
+  if (simd_support == ~0U)
+    init_simd();
+
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2))
+    jsimd_idct_4x4_sse2(compptr->dct_table, coef_block, output_buf,
+                        output_col);
+  else
+    jsimd_idct_4x4_mmx(compptr->dct_table, coef_block, output_buf, output_col);
+}
+
+GLOBAL(int)
+jsimd_can_idct_islow(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  if ((simd_support & JSIMD_AVX2) && IS_ALIGNED_AVX(jconst_idct_islow_avx2))
+    return 1;
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_islow_sse2))
+    return 1;
+  if (simd_support & JSIMD_MMX)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_ifast(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(IFAST_MULT_TYPE) != 2)
+    return 0;
+  if (IFAST_SCALE_BITS != 2)
+    return 0;
+
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_ifast_sse2))
+    return 1;
+  if (simd_support & JSIMD_MMX)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_float(void)
+{
+  init_simd();
+
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(FAST_FLOAT) != 4)
+    return 0;
+  if (sizeof(FLOAT_MULT_TYPE) != 4)
+    return 0;
+
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_float_sse2))
+    return 1;
+  if ((simd_support & JSIMD_SSE) && IS_ALIGNED_SSE(jconst_idct_float_sse))
+    return 1;
+  if (simd_support & JSIMD_3DNOW)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+  if (simd_support == ~0U)
+    init_simd();
+
+  if (simd_support & JSIMD_AVX2)
+    jsimd_idct_islow_avx2(compptr->dct_table, coef_block, output_buf,
+                          output_col);
+  else if (simd_support & JSIMD_SSE2)
+    jsimd_idct_islow_sse2(compptr->dct_table, coef_block, output_buf,
+                          output_col);
+  else
+    jsimd_idct_islow_mmx(compptr->dct_table, coef_block, output_buf,
+                         output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+  if (simd_support == ~0U)
+    init_simd();
+
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_ifast_sse2))
+    jsimd_idct_ifast_sse2(compptr->dct_table, coef_block, output_buf,
+                          output_col);
+  else
+    jsimd_idct_ifast_mmx(compptr->dct_table, coef_block, output_buf,
+                         output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_float(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+  if (simd_support == ~0U)
+    init_simd();
+
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_float_sse2))
+    jsimd_idct_float_sse2(compptr->dct_table, coef_block, output_buf,
+                          output_col);
+  else if ((simd_support & JSIMD_SSE) && IS_ALIGNED_SSE(jconst_idct_float_sse))
+    jsimd_idct_float_sse(compptr->dct_table, coef_block, output_buf,
+                         output_col);
+  else
+    jsimd_idct_float_3dnow(compptr->dct_table, coef_block, output_buf,
+                           output_col);
+}
+
+GLOBAL(int)
+jsimd_can_huff_encode_one_block(void)
+{
+  init_simd();
+
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+
+  if ((simd_support & JSIMD_SSE2) && simd_huffman &&
+      IS_ALIGNED_SSE(jconst_huff_encode_one_block))
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(JOCTET *)
+jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
+                            int last_dc_val, c_derived_tbl *dctbl,
+                            c_derived_tbl *actbl)
+{
+  return jsimd_huff_encode_one_block_sse2(state, buffer, block, last_dc_val,
+                                          dctbl, actbl);
+}
+
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_first_prepare(void)
+{
+  init_simd();
+
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (SIZEOF_SIZE_T != 4)
+    return 0;
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_encode_mcu_AC_first_prepare(const JCOEF *block,
+                                  const int *jpeg_natural_order_start, int Sl,
+                                  int Al, UJCOEF *values, size_t *zerobits)
+{
+  jsimd_encode_mcu_AC_first_prepare_sse2(block, jpeg_natural_order_start,
+                                         Sl, Al, values, zerobits);
+}
+
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_refine_prepare(void)
+{
+  init_simd();
+
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (SIZEOF_SIZE_T != 4)
+    return 0;
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
+                                   const int *jpeg_natural_order_start, int Sl,
+                                   int Al, UJCOEF *absvalues, size_t *bits)
+{
+  return jsimd_encode_mcu_AC_refine_prepare_sse2(block,
+                                                 jpeg_natural_order_start,
+                                                 Sl, Al, absvalues, bits);
+}
diff --git a/media/libjpeg/simd/i386/jsimdcpu.asm b/media/libjpeg/simd/i386/jsimdcpu.asm
new file mode 100644
index 0000000000..ddcafa9e21
--- /dev/null
+++ b/media/libjpeg/simd/i386/jsimdcpu.asm
@@ -0,0 +1,135 @@
+;
+; jsimdcpu.asm - SIMD instruction support check
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Check if the CPU supports SIMD instructions
+;
+; GLOBAL(unsigned int)
+; jpeg_simd_cpu_support(void)
+;
+
+    align       32
+    GLOBAL_FUNCTION(jpeg_simd_cpu_support)
+
+EXTN(jpeg_simd_cpu_support):
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+;   push        esi                     ; unused
+    push        edi
+
+    xor         edi, edi                ; simd support flag
+
+    pushfd
+    pop         eax
+    mov         edx, eax
+    xor         eax, 1<<21              ; flip ID bit in EFLAGS
+    push        eax
+    popfd
+    pushfd
+    pop         eax
+    xor         eax, edx
+    jz          near .return            ; CPUID is not supported
+
+    ; Check whether CPUID leaf 07H is supported
+    ; (leaf 07H is used to check for AVX2 instruction support)
+    xor         eax, eax
+    cpuid
+    test        eax, eax
+    jz          near .return
+    cmp         eax, 7
+    jl          short .no_avx2          ; Maximum leaf < 07H
+
+    ; Check for AVX2 instruction support
+    mov         eax, 7
+    xor         ecx, ecx
+    cpuid
+    mov         eax, ebx
+    test        eax, 1<<5               ; bit5:AVX2
+    jz          short .no_avx2
+
+    ; Check for AVX2 O/S support
+    mov         eax, 1
+    xor         ecx, ecx
+    cpuid
+    test        ecx, 1<<27
+    jz          short .no_avx2          ; O/S does not support XSAVE
+    test        ecx, 1<<28
+    jz          short .no_avx2          ; CPU does not support AVX2
+
+    xor         ecx, ecx
+    xgetbv
+    and         eax, 6
+    cmp         eax, 6                  ; O/S does not manage XMM/YMM state
+                                        ; using XSAVE
+    jnz         short .no_avx2
+
+    or          edi, JSIMD_AVX2
+.no_avx2:
+
+    ; Check CPUID leaf 01H for MMX, SSE, and SSE2 support
+    xor         eax, eax
+    inc         eax
+    cpuid
+    mov         eax, edx                ; eax = Standard feature flags
+
+    ; Check for MMX instruction support
+    test        eax, 1<<23              ; bit23:MMX
+    jz          short .no_mmx
+    or          edi, byte JSIMD_MMX
+.no_mmx:
+    test        eax, 1<<25              ; bit25:SSE
+    jz          short .no_sse
+    or          edi, byte JSIMD_SSE
+.no_sse:
+    test        eax, 1<<26              ; bit26:SSE2
+    jz          short .no_sse2
+    or          edi, byte JSIMD_SSE2
+.no_sse2:
+
+    ; Check for 3DNow! instruction support
+    mov         eax, 0x80000000
+    cpuid
+    cmp         eax, 0x80000000
+    jbe         short .return
+
+    mov         eax, 0x80000001
+    cpuid
+    mov         eax, edx                ; eax = Extended feature flags
+
+    test        eax, 1<<31              ; bit31:3DNow!(vendor independent)
+    jz          short .no_3dnow
+    or          edi, byte JSIMD_3DNOW
+.no_3dnow:
+
+.return:
+    mov         eax, edi
+
+    pop         edi
+;   pop         esi                     ; unused
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/jsimd.h b/media/libjpeg/simd/jsimd.h
new file mode 100644
index 0000000000..a28754adb9
--- /dev/null
+++ b/media/libjpeg/simd/jsimd.h
@@ -0,0 +1,1258 @@
+/*
+ * simd/jsimd.h
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2011, 2014-2016, 2018, 2020, 2022, D. R. Commander.
+ * Copyright (C) 2013-2014, MIPS Technologies, Inc., California.
+ * Copyright (C) 2014, Linaro Limited.
+ * Copyright (C) 2015-2016, 2018, 2022, Matthieu Darbois.
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
+ * Copyright (C) 2020, Arm Limited.
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library,
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * For conditions of distribution and use, see copyright notice in jsimdext.inc
+ *
+ */
+
+/* Bitmask for supported acceleration methods */
+
+#define JSIMD_NONE     0x00
+#define JSIMD_MMX      0x01
+#define JSIMD_3DNOW    0x02
+#define JSIMD_SSE      0x04
+#define JSIMD_SSE2     0x08
+#define JSIMD_NEON     0x10
+#define JSIMD_DSPR2    0x20
+#define JSIMD_ALTIVEC  0x40
+#define JSIMD_AVX2     0x80
+#define JSIMD_MMI      0x100
+
+/* SIMD Ext: retrieve SIMD/CPU information */
+EXTERN(unsigned int) jpeg_simd_cpu_support(void);
+
+/* RGB & extended RGB --> YCC Colorspace Conversion */
+EXTERN(void) jsimd_rgb_ycc_convert_mmx
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgb_ycc_convert_mmx
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgbx_ycc_convert_mmx
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgr_ycc_convert_mmx
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgrx_ycc_convert_mmx
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxbgr_ycc_convert_mmx
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxrgb_ycc_convert_mmx
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+
+extern const int jconst_rgb_ycc_convert_sse2[];
+EXTERN(void) jsimd_rgb_ycc_convert_sse2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgb_ycc_convert_sse2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgbx_ycc_convert_sse2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgr_ycc_convert_sse2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgrx_ycc_convert_sse2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxbgr_ycc_convert_sse2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxrgb_ycc_convert_sse2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+
+extern const int jconst_rgb_ycc_convert_avx2[];
+EXTERN(void) jsimd_rgb_ycc_convert_avx2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgb_ycc_convert_avx2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgbx_ycc_convert_avx2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgr_ycc_convert_avx2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgrx_ycc_convert_avx2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxbgr_ycc_convert_avx2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxrgb_ycc_convert_avx2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+
+EXTERN(void) jsimd_rgb_ycc_convert_neon
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgb_ycc_convert_neon
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgbx_ycc_convert_neon
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgr_ycc_convert_neon
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgrx_ycc_convert_neon
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxbgr_ycc_convert_neon
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxrgb_ycc_convert_neon
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+
+#ifndef NEON_INTRINSICS
+
+EXTERN(void) jsimd_extrgb_ycc_convert_neon_slowld3
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgr_ycc_convert_neon_slowld3
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+
+#endif
+
+EXTERN(void) jsimd_rgb_ycc_convert_dspr2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgb_ycc_convert_dspr2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgbx_ycc_convert_dspr2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgr_ycc_convert_dspr2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgrx_ycc_convert_dspr2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxbgr_ycc_convert_dspr2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxrgb_ycc_convert_dspr2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+
+EXTERN(void) jsimd_rgb_ycc_convert_mmi
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgb_ycc_convert_mmi
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgbx_ycc_convert_mmi
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgr_ycc_convert_mmi
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgrx_ycc_convert_mmi
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxbgr_ycc_convert_mmi
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxrgb_ycc_convert_mmi
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+
+EXTERN(void) jsimd_rgb_ycc_convert_altivec
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgb_ycc_convert_altivec
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgbx_ycc_convert_altivec
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgr_ycc_convert_altivec
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgrx_ycc_convert_altivec
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxbgr_ycc_convert_altivec
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxrgb_ycc_convert_altivec
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+
+/* RGB & extended RGB --> Grayscale Colorspace Conversion */
+EXTERN(void) jsimd_rgb_gray_convert_mmx
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgb_gray_convert_mmx
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgbx_gray_convert_mmx
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgr_gray_convert_mmx
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgrx_gray_convert_mmx
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxbgr_gray_convert_mmx
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxrgb_gray_convert_mmx
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+
+extern const int jconst_rgb_gray_convert_sse2[];
+EXTERN(void) jsimd_rgb_gray_convert_sse2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgb_gray_convert_sse2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgbx_gray_convert_sse2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgr_gray_convert_sse2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgrx_gray_convert_sse2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxbgr_gray_convert_sse2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxrgb_gray_convert_sse2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+
+extern const int jconst_rgb_gray_convert_avx2[];
+EXTERN(void) jsimd_rgb_gray_convert_avx2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgb_gray_convert_avx2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgbx_gray_convert_avx2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgr_gray_convert_avx2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgrx_gray_convert_avx2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxbgr_gray_convert_avx2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxrgb_gray_convert_avx2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+
+EXTERN(void) jsimd_rgb_gray_convert_neon
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgb_gray_convert_neon
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgbx_gray_convert_neon
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgr_gray_convert_neon
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgrx_gray_convert_neon
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxbgr_gray_convert_neon
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxrgb_gray_convert_neon
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+
+EXTERN(void) jsimd_rgb_gray_convert_dspr2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgb_gray_convert_dspr2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgbx_gray_convert_dspr2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgr_gray_convert_dspr2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgrx_gray_convert_dspr2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxbgr_gray_convert_dspr2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxrgb_gray_convert_dspr2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+
+EXTERN(void) jsimd_rgb_gray_convert_mmi
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgb_gray_convert_mmi
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgbx_gray_convert_mmi
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgr_gray_convert_mmi
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgrx_gray_convert_mmi
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxbgr_gray_convert_mmi
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxrgb_gray_convert_mmi
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+
+EXTERN(void) jsimd_rgb_gray_convert_altivec
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgb_gray_convert_altivec
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgbx_gray_convert_altivec
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgr_gray_convert_altivec
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgrx_gray_convert_altivec
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxbgr_gray_convert_altivec
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxrgb_gray_convert_altivec
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+
+/* YCC --> RGB & extended RGB Colorspace Conversion */
+EXTERN(void) jsimd_ycc_rgb_convert_mmx
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extrgb_convert_mmx
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extrgbx_convert_mmx
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extbgr_convert_mmx
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extbgrx_convert_mmx
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extxbgr_convert_mmx
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extxrgb_convert_mmx
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+
+extern const int jconst_ycc_rgb_convert_sse2[];
+EXTERN(void) jsimd_ycc_rgb_convert_sse2
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extrgb_convert_sse2
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extrgbx_convert_sse2
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extbgr_convert_sse2
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extbgrx_convert_sse2
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extxbgr_convert_sse2
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extxrgb_convert_sse2
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+
+extern const int jconst_ycc_rgb_convert_avx2[];
+EXTERN(void) jsimd_ycc_rgb_convert_avx2
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extrgb_convert_avx2
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extrgbx_convert_avx2
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extbgr_convert_avx2
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extbgrx_convert_avx2
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extxbgr_convert_avx2
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extxrgb_convert_avx2
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+
+EXTERN(void) jsimd_ycc_rgb_convert_neon
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extrgb_convert_neon
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extrgbx_convert_neon
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extbgr_convert_neon
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extbgrx_convert_neon
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extxbgr_convert_neon
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extxrgb_convert_neon
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_rgb565_convert_neon
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+
+#ifndef NEON_INTRINSICS
+
+EXTERN(void) jsimd_ycc_extrgb_convert_neon_slowst3
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extbgr_convert_neon_slowst3
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+
+#endif
+
+EXTERN(void) jsimd_ycc_rgb_convert_dspr2
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extrgb_convert_dspr2
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extrgbx_convert_dspr2
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extbgr_convert_dspr2
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extbgrx_convert_dspr2
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extxbgr_convert_dspr2
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extxrgb_convert_dspr2
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+
+EXTERN(void) jsimd_ycc_rgb_convert_mmi
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extrgb_convert_mmi
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extrgbx_convert_mmi
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extbgr_convert_mmi
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extbgrx_convert_mmi
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extxbgr_convert_mmi
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extxrgb_convert_mmi
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+
+EXTERN(void) jsimd_ycc_rgb_convert_altivec
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extrgb_convert_altivec
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extrgbx_convert_altivec
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extbgr_convert_altivec
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extbgrx_convert_altivec
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extxbgr_convert_altivec
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extxrgb_convert_altivec
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+
+/* NULL Colorspace Conversion */
+EXTERN(void) jsimd_c_null_convert_dspr2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows, int num_components);
+
+/* h2v1 Downsampling */
+EXTERN(void) jsimd_h2v1_downsample_mmx
+  (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor,
+   JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data);
+
+EXTERN(void) jsimd_h2v1_downsample_sse2
+  (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor,
+   JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data);
+
+EXTERN(void) jsimd_h2v1_downsample_avx2
+  (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor,
+   JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data);
+
+EXTERN(void) jsimd_h2v1_downsample_neon
+  (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor,
+   JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data);
+
+EXTERN(void) jsimd_h2v1_downsample_dspr2
+  (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor,
+   JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data);
+
+EXTERN(void) jsimd_h2v1_downsample_altivec
+  (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor,
+   JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data);
+
+/* h2v2 Downsampling */
+EXTERN(void) jsimd_h2v2_downsample_mmx
+  (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor,
+   JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data);
+
+EXTERN(void) jsimd_h2v2_downsample_sse2
+  (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor,
+   JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data);
+
+EXTERN(void) jsimd_h2v2_downsample_avx2
+  (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor,
+   JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data);
+
+EXTERN(void) jsimd_h2v2_downsample_neon
+  (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor,
+   JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data);
+
+EXTERN(void) jsimd_h2v2_downsample_dspr2
+  (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor,
+   JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data);
+
+EXTERN(void) jsimd_h2v2_downsample_mmi
+  (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor,
+   JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data);
+
+EXTERN(void) jsimd_h2v2_downsample_altivec
+  (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor,
+   JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data);
+
+/* h2v2 Smooth Downsampling */
+EXTERN(void) jsimd_h2v2_smooth_downsample_dspr2
+  (JSAMPARRAY input_data, JSAMPARRAY output_data, JDIMENSION v_samp_factor,
+   int max_v_samp_factor, int smoothing_factor, JDIMENSION width_in_blocks,
+   JDIMENSION image_width);
+
+
+/* Upsampling */
+EXTERN(void) jsimd_h2v1_upsample_mmx
+  (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h2v2_upsample_mmx
+  (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
+
+EXTERN(void) jsimd_h2v1_upsample_sse2
+  (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h2v2_upsample_sse2
+  (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
+
+EXTERN(void) jsimd_h2v1_upsample_avx2
+  (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h2v2_upsample_avx2
+  (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
+
+EXTERN(void) jsimd_h2v1_upsample_neon
+  (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h2v2_upsample_neon
+  (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
+
+EXTERN(void) jsimd_h2v1_upsample_dspr2
+  (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h2v2_upsample_dspr2
+  (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
+
+EXTERN(void) jsimd_int_upsample_dspr2
+  (UINT8 h_expand, UINT8 v_expand, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr, JDIMENSION output_width,
+   int max_v_samp_factor);
+
+EXTERN(void) jsimd_h2v1_upsample_altivec
+  (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h2v2_upsample_altivec
+  (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
+
+/* Fancy Upsampling */
+EXTERN(void) jsimd_h2v1_fancy_upsample_mmx
+  (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h2v2_fancy_upsample_mmx
+  (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
+
+extern const int jconst_fancy_upsample_sse2[];
+EXTERN(void) jsimd_h2v1_fancy_upsample_sse2
+  (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h2v2_fancy_upsample_sse2
+  (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
+
+extern const int jconst_fancy_upsample_avx2[];
+EXTERN(void) jsimd_h2v1_fancy_upsample_avx2
+  (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h2v2_fancy_upsample_avx2
+  (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
+
+EXTERN(void) jsimd_h2v1_fancy_upsample_neon
+  (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h2v2_fancy_upsample_neon
+  (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h1v2_fancy_upsample_neon
+  (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
+
+EXTERN(void) jsimd_h2v1_fancy_upsample_dspr2
+  (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h2v2_fancy_upsample_dspr2
+  (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
+
+EXTERN(void) jsimd_h2v1_fancy_upsample_mmi
+  (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h2v2_fancy_upsample_mmi
+  (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
+
+EXTERN(void) jsimd_h2v1_fancy_upsample_altivec
+  (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h2v2_fancy_upsample_altivec
+  (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
+
+/* Merged Upsampling */
+EXTERN(void) jsimd_h2v1_merged_upsample_mmx
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extrgb_merged_upsample_mmx
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extrgbx_merged_upsample_mmx
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extbgr_merged_upsample_mmx
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extbgrx_merged_upsample_mmx
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extxbgr_merged_upsample_mmx
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extxrgb_merged_upsample_mmx
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+
+EXTERN(void) jsimd_h2v2_merged_upsample_mmx
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extrgb_merged_upsample_mmx
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extrgbx_merged_upsample_mmx
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extbgr_merged_upsample_mmx
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extbgrx_merged_upsample_mmx
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extxbgr_merged_upsample_mmx
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extxrgb_merged_upsample_mmx
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+
+extern const int jconst_merged_upsample_sse2[];
+EXTERN(void) jsimd_h2v1_merged_upsample_sse2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extrgb_merged_upsample_sse2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extrgbx_merged_upsample_sse2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extbgr_merged_upsample_sse2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extbgrx_merged_upsample_sse2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extxbgr_merged_upsample_sse2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extxrgb_merged_upsample_sse2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+
+EXTERN(void) jsimd_h2v2_merged_upsample_sse2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extrgb_merged_upsample_sse2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extrgbx_merged_upsample_sse2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extbgr_merged_upsample_sse2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extbgrx_merged_upsample_sse2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extxbgr_merged_upsample_sse2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extxrgb_merged_upsample_sse2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+
+extern const int jconst_merged_upsample_avx2[];
+EXTERN(void) jsimd_h2v1_merged_upsample_avx2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extrgb_merged_upsample_avx2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extrgbx_merged_upsample_avx2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extbgr_merged_upsample_avx2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extbgrx_merged_upsample_avx2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extxbgr_merged_upsample_avx2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extxrgb_merged_upsample_avx2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+
+EXTERN(void) jsimd_h2v2_merged_upsample_avx2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extrgb_merged_upsample_avx2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extrgbx_merged_upsample_avx2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extbgr_merged_upsample_avx2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extbgrx_merged_upsample_avx2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extxbgr_merged_upsample_avx2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extxrgb_merged_upsample_avx2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+
+EXTERN(void) jsimd_h2v1_merged_upsample_neon
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extrgb_merged_upsample_neon
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extrgbx_merged_upsample_neon
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extbgr_merged_upsample_neon
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extbgrx_merged_upsample_neon
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extxbgr_merged_upsample_neon
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extxrgb_merged_upsample_neon
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+
+EXTERN(void) jsimd_h2v2_merged_upsample_neon
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extrgb_merged_upsample_neon
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extrgbx_merged_upsample_neon
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extbgr_merged_upsample_neon
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extbgrx_merged_upsample_neon
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extxbgr_merged_upsample_neon
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extxrgb_merged_upsample_neon
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+
+EXTERN(void) jsimd_h2v1_merged_upsample_dspr2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf, JSAMPLE *range);
+EXTERN(void) jsimd_h2v1_extrgb_merged_upsample_dspr2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf, JSAMPLE *range);
+EXTERN(void) jsimd_h2v1_extrgbx_merged_upsample_dspr2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf, JSAMPLE *range);
+EXTERN(void) jsimd_h2v1_extbgr_merged_upsample_dspr2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf, JSAMPLE *range);
+EXTERN(void) jsimd_h2v1_extbgrx_merged_upsample_dspr2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf, JSAMPLE *range);
+EXTERN(void) jsimd_h2v1_extxbgr_merged_upsample_dspr2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf, JSAMPLE *range);
+EXTERN(void) jsimd_h2v1_extxrgb_merged_upsample_dspr2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf, JSAMPLE *range);
+
+EXTERN(void) jsimd_h2v2_merged_upsample_dspr2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf, JSAMPLE *range);
+EXTERN(void) jsimd_h2v2_extrgb_merged_upsample_dspr2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf, JSAMPLE *range);
+EXTERN(void) jsimd_h2v2_extrgbx_merged_upsample_dspr2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf, JSAMPLE *range);
+EXTERN(void) jsimd_h2v2_extbgr_merged_upsample_dspr2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf, JSAMPLE *range);
+EXTERN(void) jsimd_h2v2_extbgrx_merged_upsample_dspr2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf, JSAMPLE *range);
+EXTERN(void) jsimd_h2v2_extxbgr_merged_upsample_dspr2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf, JSAMPLE *range);
+EXTERN(void) jsimd_h2v2_extxrgb_merged_upsample_dspr2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf, JSAMPLE *range);
+
+EXTERN(void) jsimd_h2v1_merged_upsample_mmi
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extrgb_merged_upsample_mmi
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extrgbx_merged_upsample_mmi
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extbgr_merged_upsample_mmi
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extbgrx_merged_upsample_mmi
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extxbgr_merged_upsample_mmi
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extxrgb_merged_upsample_mmi
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+
+EXTERN(void) jsimd_h2v2_merged_upsample_mmi
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extrgb_merged_upsample_mmi
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extrgbx_merged_upsample_mmi
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extbgr_merged_upsample_mmi
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extbgrx_merged_upsample_mmi
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extxbgr_merged_upsample_mmi
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extxrgb_merged_upsample_mmi
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+
+EXTERN(void) jsimd_h2v1_merged_upsample_altivec
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extrgb_merged_upsample_altivec
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extrgbx_merged_upsample_altivec
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extbgr_merged_upsample_altivec
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extbgrx_merged_upsample_altivec
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extxbgr_merged_upsample_altivec
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extxrgb_merged_upsample_altivec
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+
+EXTERN(void) jsimd_h2v2_merged_upsample_altivec
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extrgb_merged_upsample_altivec
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extrgbx_merged_upsample_altivec
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extbgr_merged_upsample_altivec
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extbgrx_merged_upsample_altivec
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extxbgr_merged_upsample_altivec
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extxrgb_merged_upsample_altivec
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+
+/* Sample Conversion */
+EXTERN(void) jsimd_convsamp_mmx
+  (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace);
+
+EXTERN(void) jsimd_convsamp_sse2
+  (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace);
+
+EXTERN(void) jsimd_convsamp_avx2
+  (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace);
+
+EXTERN(void) jsimd_convsamp_neon
+  (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace);
+
+EXTERN(void) jsimd_convsamp_dspr2
+  (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace);
+
+EXTERN(void) jsimd_convsamp_altivec
+  (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace);
+
+/* Floating Point Sample Conversion */
+EXTERN(void) jsimd_convsamp_float_3dnow
+  (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT *workspace);
+
+EXTERN(void) jsimd_convsamp_float_sse
+  (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT *workspace);
+
+EXTERN(void) jsimd_convsamp_float_sse2
+  (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT *workspace);
+
+EXTERN(void) jsimd_convsamp_float_dspr2
+  (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT *workspace);
+
+/* Accurate Integer Forward DCT */
+EXTERN(void) jsimd_fdct_islow_mmx(DCTELEM *data);
+
+extern const int jconst_fdct_islow_sse2[];
+EXTERN(void) jsimd_fdct_islow_sse2(DCTELEM *data);
+
+extern const int jconst_fdct_islow_avx2[];
+EXTERN(void) jsimd_fdct_islow_avx2(DCTELEM *data);
+
+EXTERN(void) jsimd_fdct_islow_neon(DCTELEM *data);
+
+EXTERN(void) jsimd_fdct_islow_dspr2(DCTELEM *data);
+
+EXTERN(void) jsimd_fdct_islow_mmi(DCTELEM *data);
+
+EXTERN(void) jsimd_fdct_islow_altivec(DCTELEM *data);
+
+/* Fast Integer Forward DCT */
+EXTERN(void) jsimd_fdct_ifast_mmx(DCTELEM *data);
+
+extern const int jconst_fdct_ifast_sse2[];
+EXTERN(void) jsimd_fdct_ifast_sse2(DCTELEM *data);
+
+EXTERN(void) jsimd_fdct_ifast_neon(DCTELEM *data);
+
+EXTERN(void) jsimd_fdct_ifast_dspr2(DCTELEM *data);
+
+EXTERN(void) jsimd_fdct_ifast_mmi(DCTELEM *data);
+
+EXTERN(void) jsimd_fdct_ifast_altivec(DCTELEM *data);
+
+/* Floating Point Forward DCT */
+EXTERN(void) jsimd_fdct_float_3dnow(FAST_FLOAT *data);
+
+extern const int jconst_fdct_float_sse[];
+EXTERN(void) jsimd_fdct_float_sse(FAST_FLOAT *data);
+
+/* Quantization */
+EXTERN(void) jsimd_quantize_mmx
+  (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace);
+
+EXTERN(void) jsimd_quantize_sse2
+  (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace);
+
+EXTERN(void) jsimd_quantize_avx2
+  (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace);
+
+EXTERN(void) jsimd_quantize_neon
+  (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace);
+
+EXTERN(void) jsimd_quantize_dspr2
+  (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace);
+
+EXTERN(void) jsimd_quantize_mmi
+  (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace);
+
+EXTERN(void) jsimd_quantize_altivec
+  (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace);
+
+/* Floating Point Quantization */
+EXTERN(void) jsimd_quantize_float_3dnow
+  (JCOEFPTR coef_block, FAST_FLOAT *divisors, FAST_FLOAT *workspace);
+
+EXTERN(void) jsimd_quantize_float_sse
+  (JCOEFPTR coef_block, FAST_FLOAT *divisors, FAST_FLOAT *workspace);
+
+EXTERN(void) jsimd_quantize_float_sse2
+  (JCOEFPTR coef_block, FAST_FLOAT *divisors, FAST_FLOAT *workspace);
+
+EXTERN(void) jsimd_quantize_float_dspr2
+  (JCOEFPTR coef_block, FAST_FLOAT *divisors, FAST_FLOAT *workspace);
+
+/* Scaled Inverse DCT */
+EXTERN(void) jsimd_idct_2x2_mmx
+  (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+   JDIMENSION output_col);
+EXTERN(void) jsimd_idct_4x4_mmx
+  (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+   JDIMENSION output_col);
+
+extern const int jconst_idct_red_sse2[];
+EXTERN(void) jsimd_idct_2x2_sse2
+  (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+   JDIMENSION output_col);
+EXTERN(void) jsimd_idct_4x4_sse2
+  (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+   JDIMENSION output_col);
+
+EXTERN(void) jsimd_idct_2x2_neon
+  (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+   JDIMENSION output_col);
+EXTERN(void) jsimd_idct_4x4_neon
+  (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+   JDIMENSION output_col);
+
+EXTERN(void) jsimd_idct_2x2_dspr2
+  (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+   JDIMENSION output_col);
+EXTERN(void) jsimd_idct_4x4_dspr2
+  (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+   JDIMENSION output_col, int *workspace);
+EXTERN(void) jsimd_idct_6x6_dspr2
+  (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+   JDIMENSION output_col);
+EXTERN(void) jsimd_idct_12x12_pass1_dspr2
+  (JCOEFPTR coef_block, void *dct_table, int *workspace);
+EXTERN(void) jsimd_idct_12x12_pass2_dspr2
+  (int *workspace, int *output);
+
+/* Accurate Integer Inverse DCT */
+EXTERN(void) jsimd_idct_islow_mmx
+  (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+   JDIMENSION output_col);
+
+extern const int jconst_idct_islow_sse2[];
+EXTERN(void) jsimd_idct_islow_sse2
+  (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+   JDIMENSION output_col);
+
+extern const int jconst_idct_islow_avx2[];
+EXTERN(void) jsimd_idct_islow_avx2
+  (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+   JDIMENSION output_col);
+
+EXTERN(void) jsimd_idct_islow_neon
+  (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+   JDIMENSION output_col);
+
+EXTERN(void) jsimd_idct_islow_dspr2
+  (void *dct_table, JCOEFPTR coef_block, int *output_buf, JSAMPLE *output_col);
+
+EXTERN(void) jsimd_idct_islow_mmi
+  (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+   JDIMENSION output_col);
+
+EXTERN(void) jsimd_idct_islow_altivec
+  (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+   JDIMENSION output_col);
+
+/* Fast Integer Inverse DCT */
+EXTERN(void) jsimd_idct_ifast_mmx
+  (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+   JDIMENSION output_col);
+
+extern const int jconst_idct_ifast_sse2[];
+EXTERN(void) jsimd_idct_ifast_sse2
+  (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+   JDIMENSION output_col);
+
+EXTERN(void) jsimd_idct_ifast_neon
+  (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+   JDIMENSION output_col);
+
+EXTERN(void) jsimd_idct_ifast_cols_dspr2
+  (JCOEF *inptr, IFAST_MULT_TYPE *quantptr, DCTELEM *wsptr,
+   const int *idct_coefs);
+EXTERN(void) jsimd_idct_ifast_rows_dspr2
+  (DCTELEM *wsptr, JSAMPARRAY output_buf, JDIMENSION output_col,
+   const int *idct_coefs);
+
+EXTERN(void) jsimd_idct_ifast_mmi
+  (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+   JDIMENSION output_col);
+
+EXTERN(void) jsimd_idct_ifast_altivec
+  (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+   JDIMENSION output_col);
+
+/* Floating Point Inverse DCT */
+EXTERN(void) jsimd_idct_float_3dnow
+  (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+   JDIMENSION output_col);
+
+extern const int jconst_idct_float_sse[];
+EXTERN(void) jsimd_idct_float_sse
+  (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+   JDIMENSION output_col);
+
+extern const int jconst_idct_float_sse2[];
+EXTERN(void) jsimd_idct_float_sse2
+  (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+   JDIMENSION output_col);
+
+/* Huffman coding */
+extern const int jconst_huff_encode_one_block[];
+EXTERN(JOCTET *) jsimd_huff_encode_one_block_sse2
+  (void *state, JOCTET *buffer, JCOEFPTR block, int last_dc_val,
+   c_derived_tbl *dctbl, c_derived_tbl *actbl);
+
+EXTERN(JOCTET *) jsimd_huff_encode_one_block_neon
+  (void *state, JOCTET *buffer, JCOEFPTR block, int last_dc_val,
+   c_derived_tbl *dctbl, c_derived_tbl *actbl);
+
+#ifndef NEON_INTRINSICS
+
+EXTERN(JOCTET *) jsimd_huff_encode_one_block_neon_slowtbl
+  (void *state, JOCTET *buffer, JCOEFPTR block, int last_dc_val,
+   c_derived_tbl *dctbl, c_derived_tbl *actbl);
+
+#endif
+
+/* Progressive Huffman encoding */
+EXTERN(void) jsimd_encode_mcu_AC_first_prepare_sse2
+  (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
+   UJCOEF *values, size_t *zerobits);
+
+EXTERN(void) jsimd_encode_mcu_AC_first_prepare_neon
+  (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
+   UJCOEF *values, size_t *zerobits);
+
+EXTERN(int) jsimd_encode_mcu_AC_refine_prepare_sse2
+  (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
+   UJCOEF *absvalues, size_t *bits);
+
+EXTERN(int) jsimd_encode_mcu_AC_refine_prepare_neon
+  (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
+   UJCOEF *absvalues, size_t *bits);
diff --git a/media/libjpeg/simd/mips/jsimd.c b/media/libjpeg/simd/mips/jsimd.c
new file mode 100644
index 0000000000..c6e789aa2f
--- /dev/null
+++ b/media/libjpeg/simd/mips/jsimd.c
@@ -0,0 +1,1143 @@
+/*
+ * jsimd_mips.c
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2009-2011, 2014, 2016, 2018, 2020, 2022, D. R. Commander.
+ * Copyright (C) 2013-2014, MIPS Technologies, Inc., California.
+ * Copyright (C) 2015-2016, 2018, 2022, Matthieu Darbois.
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library,
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * For conditions of distribution and use, see copyright notice in jsimdext.inc
+ *
+ * This file contains the interface between the "normal" portions
+ * of the library and the SIMD implementations when running on a
+ * MIPS architecture.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+
+#include <ctype.h>
+
+static THREAD_LOCAL unsigned int simd_support = ~0;
+
+#if !(defined(__mips_dsp) && (__mips_dsp_rev >= 2)) && defined(__linux__)
+
+LOCAL(void)
+parse_proc_cpuinfo(const char *search_string)
+{
+  const char *file_name = "/proc/cpuinfo";
+  char cpuinfo_line[256];
+  FILE *f = NULL;
+
+  simd_support = 0;
+
+  if ((f = fopen(file_name, "r")) != NULL) {
+    while (fgets(cpuinfo_line, sizeof(cpuinfo_line), f) != NULL) {
+      if (strstr(cpuinfo_line, search_string) != NULL) {
+        fclose(f);
+        simd_support |= JSIMD_DSPR2;
+        return;
+      }
+    }
+    fclose(f);
+  }
+  /* Did not find string in the proc file, or not Linux ELF. */
+}
+
+#endif
+
+/*
+ * Check what SIMD accelerations are supported.
+ */
+LOCAL(void)
+init_simd(void)
+{
+#ifndef NO_GETENV
+  char *env = NULL;
+#endif
+
+  if (simd_support != ~0U)
+    return;
+
+  simd_support = 0;
+
+#if defined(__mips_dsp) && (__mips_dsp_rev >= 2)
+  simd_support |= JSIMD_DSPR2;
+#elif defined(__linux__)
+  /* We still have a chance to use MIPS DSPR2 regardless of globally used
+   * -mdspr2 options passed to gcc by performing runtime detection via
+   * /proc/cpuinfo parsing on linux */
+  parse_proc_cpuinfo("MIPS 74K");
+#endif
+
+#ifndef NO_GETENV
+  /* Force different settings through environment variables */
+  env = getenv("JSIMD_FORCEDSPR2");
+  if ((env != NULL) && (strcmp(env, "1") == 0))
+    simd_support = JSIMD_DSPR2;
+  env = getenv("JSIMD_FORCENONE");
+  if ((env != NULL) && (strcmp(env, "1") == 0))
+    simd_support = 0;
+#endif
+}
+
+static const int mips_idct_ifast_coefs[4] = {
+  0x45404540,           /* FIX( 1.082392200 / 2) =  17734 = 0x4546 */
+  0x5A805A80,           /* FIX( 1.414213562 / 2) =  23170 = 0x5A82 */
+  0x76407640,           /* FIX( 1.847759065 / 2) =  30274 = 0x7642 */
+  0xAC60AC60            /* FIX(-2.613125930 / 4) = -21407 = 0xAC61 */
+};
+
+/* The following struct is borrowed from jdsample.c */
+typedef void (*upsample1_ptr) (j_decompress_ptr cinfo,
+                               jpeg_component_info *compptr,
+                               JSAMPARRAY input_data,
+                               JSAMPARRAY *output_data_ptr);
+typedef struct {
+  struct jpeg_upsampler pub;
+  JSAMPARRAY color_buf[MAX_COMPONENTS];
+  upsample1_ptr methods[MAX_COMPONENTS];
+  int next_row_out;
+  JDIMENSION rows_to_go;
+  int rowgroup_height[MAX_COMPONENTS];
+  UINT8 h_expand[MAX_COMPONENTS];
+  UINT8 v_expand[MAX_COMPONENTS];
+} my_upsampler;
+
+typedef my_upsampler *my_upsample_ptr;
+
+GLOBAL(int)
+jsimd_can_rgb_ycc(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+
+  if (simd_support & JSIMD_DSPR2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_rgb_gray(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+
+  if (simd_support & JSIMD_DSPR2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+
+  if (simd_support & JSIMD_DSPR2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb565(void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_c_can_null_convert(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_DSPR2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                      JSAMPIMAGE output_buf, JDIMENSION output_row,
+                      int num_rows)
+{
+  void (*dspr2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+  switch (cinfo->in_color_space) {
+  case JCS_EXT_RGB:
+    dspr2fct = jsimd_extrgb_ycc_convert_dspr2;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    dspr2fct = jsimd_extrgbx_ycc_convert_dspr2;
+    break;
+  case JCS_EXT_BGR:
+    dspr2fct = jsimd_extbgr_ycc_convert_dspr2;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    dspr2fct = jsimd_extbgrx_ycc_convert_dspr2;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    dspr2fct = jsimd_extxbgr_ycc_convert_dspr2;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    dspr2fct = jsimd_extxrgb_ycc_convert_dspr2;
+    break;
+  default:
+    dspr2fct = jsimd_extrgb_ycc_convert_dspr2;
+    break;
+  }
+
+  dspr2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
+
+GLOBAL(void)
+jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                       JSAMPIMAGE output_buf, JDIMENSION output_row,
+                       int num_rows)
+{
+  void (*dspr2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+  switch (cinfo->in_color_space) {
+  case JCS_EXT_RGB:
+    dspr2fct = jsimd_extrgb_gray_convert_dspr2;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    dspr2fct = jsimd_extrgbx_gray_convert_dspr2;
+    break;
+  case JCS_EXT_BGR:
+    dspr2fct = jsimd_extbgr_gray_convert_dspr2;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    dspr2fct = jsimd_extbgrx_gray_convert_dspr2;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    dspr2fct = jsimd_extxbgr_gray_convert_dspr2;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    dspr2fct = jsimd_extxrgb_gray_convert_dspr2;
+    break;
+  default:
+    dspr2fct = jsimd_extrgb_gray_convert_dspr2;
+    break;
+  }
+
+  dspr2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                      JDIMENSION input_row, JSAMPARRAY output_buf,
+                      int num_rows)
+{
+  void (*dspr2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
+
+  switch (cinfo->out_color_space) {
+  case JCS_EXT_RGB:
+    dspr2fct = jsimd_ycc_extrgb_convert_dspr2;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    dspr2fct = jsimd_ycc_extrgbx_convert_dspr2;
+    break;
+  case JCS_EXT_BGR:
+    dspr2fct = jsimd_ycc_extbgr_convert_dspr2;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    dspr2fct = jsimd_ycc_extbgrx_convert_dspr2;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    dspr2fct = jsimd_ycc_extxbgr_convert_dspr2;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    dspr2fct = jsimd_ycc_extxrgb_convert_dspr2;
+    break;
+  default:
+    dspr2fct = jsimd_ycc_extrgb_convert_dspr2;
+    break;
+  }
+
+  dspr2fct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                         JDIMENSION input_row, JSAMPARRAY output_buf,
+                         int num_rows)
+{
+}
+
+GLOBAL(void)
+jsimd_c_null_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                     JSAMPIMAGE output_buf, JDIMENSION output_row,
+                     int num_rows)
+{
+  jsimd_c_null_convert_dspr2(cinfo->image_width, input_buf, output_buf,
+                             output_row, num_rows, cinfo->num_components);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_downsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  /* FIXME: jsimd_h2v2_downsample_dspr2() fails some of the TJBench tiling
+   * regression tests, probably because the DSPr2 SIMD implementation predates
+   * those tests. */
+#if 0
+  if (simd_support & JSIMD_DSPR2)
+    return 1;
+#endif
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_smooth_downsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (DCTSIZE != 8)
+    return 0;
+
+  if (simd_support & JSIMD_DSPR2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_downsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  /* FIXME: jsimd_h2v1_downsample_dspr2() fails some of the TJBench tiling
+   * regression tests, probably because the DSPr2 SIMD implementation predates
+   * those tests. */
+#if 0
+  if (simd_support & JSIMD_DSPR2)
+    return 1;
+#endif
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+                      JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  jsimd_h2v2_downsample_dspr2(cinfo->image_width, cinfo->max_v_samp_factor,
+                              compptr->v_samp_factor, compptr->width_in_blocks,
+                              input_data, output_data);
+}
+
+GLOBAL(void)
+jsimd_h2v2_smooth_downsample(j_compress_ptr cinfo,
+                             jpeg_component_info *compptr,
+                             JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  jsimd_h2v2_smooth_downsample_dspr2(input_data, output_data,
+                                     compptr->v_samp_factor,
+                                     cinfo->max_v_samp_factor,
+                                     cinfo->smoothing_factor,
+                                     compptr->width_in_blocks,
+                                     cinfo->image_width);
+}
+
+GLOBAL(void)
+jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+                      JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  jsimd_h2v1_downsample_dspr2(cinfo->image_width, cinfo->max_v_samp_factor,
+                              compptr->v_samp_factor, compptr->width_in_blocks,
+                              input_data, output_data);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_DSPR2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+#if defined(__MIPSEL__)
+  if (simd_support & JSIMD_DSPR2)
+    return 1;
+#endif
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_int_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_DSPR2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                    JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  jsimd_h2v2_upsample_dspr2(cinfo->max_v_samp_factor, cinfo->output_width,
+                            input_data, output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                    JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  jsimd_h2v1_upsample_dspr2(cinfo->max_v_samp_factor, cinfo->output_width,
+                            input_data, output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_int_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                   JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  my_upsample_ptr upsample = (my_upsample_ptr)cinfo->upsample;
+
+  jsimd_int_upsample_dspr2(upsample->h_expand[compptr->component_index],
+                           upsample->v_expand[compptr->component_index],
+                           input_data, output_data_ptr, cinfo->output_width,
+                           cinfo->max_v_samp_factor);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_fancy_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+#if defined(__MIPSEL__)
+  if (simd_support & JSIMD_DSPR2)
+    return 1;
+#endif
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_fancy_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+#if defined(__MIPSEL__)
+  if (simd_support & JSIMD_DSPR2)
+    return 1;
+#endif
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  jsimd_h2v2_fancy_upsample_dspr2(cinfo->max_v_samp_factor,
+                                  compptr->downsampled_width, input_data,
+                                  output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  jsimd_h2v1_fancy_upsample_dspr2(cinfo->max_v_samp_factor,
+                                  compptr->downsampled_width, input_data,
+                                  output_data_ptr);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_merged_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_DSPR2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_merged_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_DSPR2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                           JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+  void (*dspr2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, JSAMPLE *);
+
+  switch (cinfo->out_color_space) {
+  case JCS_EXT_RGB:
+    dspr2fct = jsimd_h2v2_extrgb_merged_upsample_dspr2;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    dspr2fct = jsimd_h2v2_extrgbx_merged_upsample_dspr2;
+    break;
+  case JCS_EXT_BGR:
+    dspr2fct = jsimd_h2v2_extbgr_merged_upsample_dspr2;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    dspr2fct = jsimd_h2v2_extbgrx_merged_upsample_dspr2;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    dspr2fct = jsimd_h2v2_extxbgr_merged_upsample_dspr2;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    dspr2fct = jsimd_h2v2_extxrgb_merged_upsample_dspr2;
+    break;
+  default:
+    dspr2fct = jsimd_h2v2_extrgb_merged_upsample_dspr2;
+    break;
+  }
+
+  dspr2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf,
+           cinfo->sample_range_limit);
+}
+
+GLOBAL(void)
+jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                           JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+  void (*dspr2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, JSAMPLE *);
+
+  switch (cinfo->out_color_space) {
+  case JCS_EXT_RGB:
+    dspr2fct = jsimd_h2v1_extrgb_merged_upsample_dspr2;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    dspr2fct = jsimd_h2v1_extrgbx_merged_upsample_dspr2;
+    break;
+  case JCS_EXT_BGR:
+    dspr2fct = jsimd_h2v1_extbgr_merged_upsample_dspr2;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    dspr2fct = jsimd_h2v1_extbgrx_merged_upsample_dspr2;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    dspr2fct = jsimd_h2v1_extxbgr_merged_upsample_dspr2;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    dspr2fct = jsimd_h2v1_extxrgb_merged_upsample_dspr2;
+    break;
+  default:
+    dspr2fct = jsimd_h2v1_extrgb_merged_upsample_dspr2;
+    break;
+  }
+
+  dspr2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf,
+           cinfo->sample_range_limit);
+}
+
+GLOBAL(int)
+jsimd_can_convsamp(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+#if defined(__MIPSEL__)
+  if (simd_support & JSIMD_DSPR2)
+    return 1;
+#endif
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_convsamp_float(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+#ifndef __mips_soft_float
+  if (simd_support & JSIMD_DSPR2)
+    return 1;
+#endif
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_convsamp(JSAMPARRAY sample_data, JDIMENSION start_col,
+               DCTELEM *workspace)
+{
+  jsimd_convsamp_dspr2(sample_data, start_col, workspace);
+}
+
+GLOBAL(void)
+jsimd_convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col,
+                     FAST_FLOAT *workspace)
+{
+#ifndef __mips_soft_float
+  jsimd_convsamp_float_dspr2(sample_data, start_col, workspace);
+#endif
+}
+
+GLOBAL(int)
+jsimd_can_fdct_islow(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+#if defined(__MIPSEL__)
+  if (simd_support & JSIMD_DSPR2)
+    return 1;
+#endif
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_ifast(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+#if defined(__MIPSEL__)
+  if (simd_support & JSIMD_DSPR2)
+    return 1;
+#endif
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_float(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_fdct_islow(DCTELEM *data)
+{
+  jsimd_fdct_islow_dspr2(data);
+}
+
+GLOBAL(void)
+jsimd_fdct_ifast(DCTELEM *data)
+{
+  jsimd_fdct_ifast_dspr2(data);
+}
+
+GLOBAL(void)
+jsimd_fdct_float(FAST_FLOAT *data)
+{
+}
+
+GLOBAL(int)
+jsimd_can_quantize(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_DSPR2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_quantize_float(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+#ifndef __mips_soft_float
+  if (simd_support & JSIMD_DSPR2)
+    return 1;
+#endif
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace)
+{
+  jsimd_quantize_dspr2(coef_block, divisors, workspace);
+}
+
+GLOBAL(void)
+jsimd_quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors,
+                     FAST_FLOAT *workspace)
+{
+#ifndef __mips_soft_float
+  jsimd_quantize_float_dspr2(coef_block, divisors, workspace);
+#endif
+}
+
+GLOBAL(int)
+jsimd_can_idct_2x2(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_DSPR2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_4x4(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+#if defined(__MIPSEL__)
+  if (simd_support & JSIMD_DSPR2)
+    return 1;
+#endif
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_6x6(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_DSPR2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_12x12(void)
+{
+  init_simd();
+
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_DSPR2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_idct_2x2(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+               JCOEFPTR coef_block, JSAMPARRAY output_buf,
+               JDIMENSION output_col)
+{
+  jsimd_idct_2x2_dspr2(compptr->dct_table, coef_block, output_buf, output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_4x4(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+               JCOEFPTR coef_block, JSAMPARRAY output_buf,
+               JDIMENSION output_col)
+{
+  int workspace[DCTSIZE * 4]; /* buffers data between passes */
+
+  jsimd_idct_4x4_dspr2(compptr->dct_table, coef_block, output_buf, output_col,
+                       workspace);
+}
+
+GLOBAL(void)
+jsimd_idct_6x6(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+               JCOEFPTR coef_block, JSAMPARRAY output_buf,
+               JDIMENSION output_col)
+{
+  jsimd_idct_6x6_dspr2(compptr->dct_table, coef_block, output_buf, output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_12x12(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+  int workspace[96];
+  int output[12] = {
+    (int)(output_buf[0] + output_col),
+    (int)(output_buf[1] + output_col),
+    (int)(output_buf[2] + output_col),
+    (int)(output_buf[3] + output_col),
+    (int)(output_buf[4] + output_col),
+    (int)(output_buf[5] + output_col),
+    (int)(output_buf[6] + output_col),
+    (int)(output_buf[7] + output_col),
+    (int)(output_buf[8] + output_col),
+    (int)(output_buf[9] + output_col),
+    (int)(output_buf[10] + output_col),
+    (int)(output_buf[11] + output_col)
+  };
+
+  jsimd_idct_12x12_pass1_dspr2(coef_block, compptr->dct_table, workspace);
+  jsimd_idct_12x12_pass2_dspr2(workspace, output);
+}
+
+GLOBAL(int)
+jsimd_can_idct_islow(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_DSPR2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_ifast(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(IFAST_MULT_TYPE) != 2)
+    return 0;
+  if (IFAST_SCALE_BITS != 2)
+    return 0;
+
+#if defined(__MIPSEL__)
+  if (simd_support & JSIMD_DSPR2)
+    return 1;
+#endif
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_float(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+  int output[8] = {
+    (int)(output_buf[0] + output_col),
+    (int)(output_buf[1] + output_col),
+    (int)(output_buf[2] + output_col),
+    (int)(output_buf[3] + output_col),
+    (int)(output_buf[4] + output_col),
+    (int)(output_buf[5] + output_col),
+    (int)(output_buf[6] + output_col),
+    (int)(output_buf[7] + output_col)
+  };
+
+  jsimd_idct_islow_dspr2(coef_block, compptr->dct_table, output,
+                         IDCT_range_limit(cinfo));
+}
+
+GLOBAL(void)
+jsimd_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+  JCOEFPTR inptr;
+  IFAST_MULT_TYPE *quantptr;
+  DCTELEM workspace[DCTSIZE2];  /* buffers data between passes */
+
+  /* Pass 1: process columns from input, store into work array. */
+
+  inptr = coef_block;
+  quantptr = (IFAST_MULT_TYPE *)compptr->dct_table;
+
+  jsimd_idct_ifast_cols_dspr2(inptr, quantptr, workspace,
+                              mips_idct_ifast_coefs);
+
+  /* Pass 2: process rows from work array, store into output array. */
+  /* Note that we must descale the results by a factor of 8 == 2**3, */
+  /* and also undo the PASS1_BITS scaling. */
+
+  jsimd_idct_ifast_rows_dspr2(workspace, output_buf, output_col,
+                              mips_idct_ifast_coefs);
+}
+
+GLOBAL(void)
+jsimd_idct_float(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+}
+
+GLOBAL(int)
+jsimd_can_huff_encode_one_block(void)
+{
+  return 0;
+}
+
+GLOBAL(JOCTET *)
+jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
+                            int last_dc_val, c_derived_tbl *dctbl,
+                            c_derived_tbl *actbl)
+{
+  return NULL;
+}
+
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_first_prepare(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_encode_mcu_AC_first_prepare(const JCOEF *block,
+                                  const int *jpeg_natural_order_start, int Sl,
+                                  int Al, UJCOEF *values, size_t *zerobits)
+{
+}
+
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_refine_prepare(void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
+                                   const int *jpeg_natural_order_start, int Sl,
+                                   int Al, UJCOEF *absvalues, size_t *bits)
+{
+  return 0;
+}
diff --git a/media/libjpeg/simd/mips/jsimd_dspr2.S b/media/libjpeg/simd/mips/jsimd_dspr2.S
new file mode 100644
index 0000000000..c99288a8d1
--- /dev/null
+++ b/media/libjpeg/simd/mips/jsimd_dspr2.S
@@ -0,0 +1,4543 @@
+/*
+ * MIPS DSPr2 optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2013-2014, MIPS Technologies, Inc., California.
+ *                          All Rights Reserved.
+ * Authors:  Teodora Novkovic <teodora.novkovic@imgtec.com>
+ *           Darko Laus       <darko.laus@imgtec.com>
+ * Copyright (C) 2015, D. R. Commander.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#include "jsimd_dspr2_asm.h"
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_c_null_convert_dspr2)
+/*
+ * a0     = cinfo->image_width
+ * a1     = input_buf
+ * a2     = output_buf
+ * a3     = output_row
+ * 16(sp) = num_rows
+ * 20(sp) = cinfo->num_components
+ *
+ * Null conversion for compression
+ */
+    SAVE_REGS_ON_STACK 8, s0, s1
+
+    lw          t9, 24(sp)      /* t9 = num_rows */
+    lw          s0, 28(sp)      /* s0 = cinfo->num_components */
+    andi        t0, a0, 3       /* t0 = cinfo->image_width & 3 */
+    beqz        t0, 4f          /* no residual */
+     nop
+0:
+    addiu       t9, t9, -1
+    bltz        t9, 7f
+     li         t1, 0
+1:
+    sll         t3, t1, 2
+    lwx         t5, t3(a2)      /* t5 = outptr = output_buf[ci] */
+    lw          t2, 0(a1)       /* t2 = inptr = *input_buf */
+    sll         t4, a3, 2
+    lwx         t5, t4(t5)      /* t5 = outptr = output_buf[ci][output_row] */
+    addu        t2, t2, t1
+    addu        s1, t5, a0
+    addu        t6, t5, t0
+2:
+    lbu         t3, 0(t2)
+    addiu       t5, t5, 1
+    sb          t3, -1(t5)
+    bne         t6, t5, 2b
+     addu       t2, t2, s0
+3:
+    lbu         t3, 0(t2)
+    addu        t4, t2, s0
+    addu        t7, t4, s0
+    addu        t8, t7, s0
+    addu        t2, t8, s0
+    lbu         t4, 0(t4)
+    lbu         t7, 0(t7)
+    lbu         t8, 0(t8)
+    addiu       t5, t5, 4
+    sb          t3, -4(t5)
+    sb          t4, -3(t5)
+    sb          t7, -2(t5)
+    bne         s1, t5, 3b
+     sb         t8, -1(t5)
+    addiu       t1, t1, 1
+    bne         t1, s0, 1b
+     nop
+    addiu       a1, a1, 4
+    bgez        t9, 0b
+     addiu      a3, a3, 1
+    b           7f
+     nop
+4:
+    addiu       t9, t9, -1
+    bltz        t9, 7f
+     li         t1, 0
+5:
+    sll         t3, t1, 2
+    lwx         t5, t3(a2)      /* t5 = outptr = output_buf[ci] */
+    lw          t2, 0(a1)       /* t2 = inptr = *input_buf */
+    sll         t4, a3, 2
+    lwx         t5, t4(t5)      /* t5 = outptr = output_buf[ci][output_row] */
+    addu        t2, t2, t1
+    addu        s1, t5, a0
+    addu        t6, t5, t0
+6:
+    lbu         t3, 0(t2)
+    addu        t4, t2, s0
+    addu        t7, t4, s0
+    addu        t8, t7, s0
+    addu        t2, t8, s0
+    lbu         t4, 0(t4)
+    lbu         t7, 0(t7)
+    lbu         t8, 0(t8)
+    addiu       t5, t5, 4
+    sb          t3, -4(t5)
+    sb          t4, -3(t5)
+    sb          t7, -2(t5)
+    bne         s1, t5, 6b
+     sb         t8, -1(t5)
+    addiu       t1, t1, 1
+    bne         t1, s0, 5b
+     nop
+    addiu       a1, a1, 4
+    bgez        t9, 4b
+     addiu      a3, a3, 1
+7:
+    RESTORE_REGS_FROM_STACK 8, s0, s1
+
+    j           ra
+     nop
+
+END(jsimd_c_null_convert_dspr2)
+
+
+/*****************************************************************************/
+/*
+ * jsimd_extrgb_ycc_convert_dspr2
+ * jsimd_extbgr_ycc_convert_dspr2
+ * jsimd_extrgbx_ycc_convert_dspr2
+ * jsimd_extbgrx_ycc_convert_dspr2
+ * jsimd_extxbgr_ycc_convert_dspr2
+ * jsimd_extxrgb_ycc_convert_dspr2
+ *
+ * Colorspace conversion RGB -> YCbCr
+ */
+
+.macro GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2  colorid, pixel_size, \
+                                             r_offs, g_offs, b_offs
+
+.macro DO_RGB_TO_YCC  r, g, b, inptr
+    lbu         \r, \r_offs(\inptr)
+    lbu         \g, \g_offs(\inptr)
+    lbu         \b, \b_offs(\inptr)
+    addiu       \inptr, \pixel_size
+.endm
+
+LEAF_DSPR2(jsimd_\colorid\()_ycc_convert_dspr2)
+/*
+ * a0     = cinfo->image_width
+ * a1     = input_buf
+ * a2     = output_buf
+ * a3     = output_row
+ * 16(sp) = num_rows
+ */
+    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    lw          t7, 48(sp)      /* t7 = num_rows */
+    li          s0, 0x4c8b      /* FIX(0.29900) */
+    li          s1, 0x9646      /* FIX(0.58700) */
+    li          s2, 0x1d2f      /* FIX(0.11400) */
+    li          s3, 0xffffd4cd  /* -FIX(0.16874) */
+    li          s4, 0xffffab33  /* -FIX(0.33126) */
+    li          s5, 0x8000      /* FIX(0.50000) */
+    li          s6, 0xffff94d1  /* -FIX(0.41869) */
+    li          s7, 0xffffeb2f  /* -FIX(0.08131) */
+    li          t8, 0x807fff    /* CBCR_OFFSET + ONE_HALF-1 */
+
+0:
+    addiu       t7, -1          /* --num_rows */
+    lw          t6, 0(a1)       /* t6 = input_buf[0] */
+    lw          t0, 0(a2)
+    lw          t1, 4(a2)
+    lw          t2, 8(a2)
+    sll         t3, a3, 2
+    lwx         t0, t3(t0)      /* t0 = output_buf[0][output_row] */
+    lwx         t1, t3(t1)      /* t1 = output_buf[1][output_row] */
+    lwx         t2, t3(t2)      /* t2 = output_buf[2][output_row] */
+
+    addu        t9, t2, a0      /* t9 = end address */
+    addiu       a3, 1
+
+1:
+    DO_RGB_TO_YCC t3, t4, t5, t6
+
+    mtlo        s5, $ac0
+    mtlo        t8, $ac1
+    mtlo        t8, $ac2
+    maddu       $ac0, s2, t5
+    maddu       $ac1, s5, t5
+    maddu       $ac2, s5, t3
+    maddu       $ac0, s0, t3
+    maddu       $ac1, s3, t3
+    maddu       $ac2, s6, t4
+    maddu       $ac0, s1, t4
+    maddu       $ac1, s4, t4
+    maddu       $ac2, s7, t5
+    extr.w      t3, $ac0, 16
+    extr.w      t4, $ac1, 16
+    extr.w      t5, $ac2, 16
+    sb          t3, 0(t0)
+    sb          t4, 0(t1)
+    sb          t5, 0(t2)
+    addiu       t0, 1
+    addiu       t2, 1
+    bne         t2, t9, 1b
+     addiu      t1, 1
+    bgtz        t7, 0b
+     addiu      a1, 4
+
+    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    j           ra
+     nop
+END(jsimd_\colorid\()_ycc_convert_dspr2)
+
+.purgem DO_RGB_TO_YCC
+
+.endm
+
+/*-------------------------------------id -- pix R  G  B */
+GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extrgb,  3, 0, 1, 2
+GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extbgr,  3, 2, 1, 0
+GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extrgbx, 4, 0, 1, 2
+GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extbgrx, 4, 2, 1, 0
+GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extxbgr, 4, 3, 2, 1
+GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extxrgb, 4, 1, 2, 3
+
+
+/*****************************************************************************/
+/*
+ * jsimd_ycc_extrgb_convert_dspr2
+ * jsimd_ycc_extbgr_convert_dspr2
+ * jsimd_ycc_extrgbx_convert_dspr2
+ * jsimd_ycc_extbgrx_convert_dspr2
+ * jsimd_ycc_extxbgr_convert_dspr2
+ * jsimd_ycc_extxrgb_convert_dspr2
+ *
+ * Colorspace conversion YCbCr -> RGB
+ */
+
+.macro GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2  colorid, pixel_size, \
+                                             r_offs, g_offs, b_offs, a_offs
+
+.macro STORE_YCC_TO_RGB  scratch0 scratch1 scratch2 outptr
+    sb          \scratch0, \r_offs(\outptr)
+    sb          \scratch1, \g_offs(\outptr)
+    sb          \scratch2, \b_offs(\outptr)
+.if (\pixel_size == 4)
+    li          t0, 0xFF
+    sb          t0, \a_offs(\outptr)
+.endif
+    addiu       \outptr, \pixel_size
+.endm
+
+LEAF_DSPR2(jsimd_ycc_\colorid\()_convert_dspr2)
+/*
+ * a0     = cinfo->image_width
+ * a1     = input_buf
+ * a2     = input_row
+ * a3     = output_buf
+ * 16(sp) = num_rows
+ */
+    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    lw          s1, 48(sp)
+    li          t3, 0x8000
+    li          t4, 0x166e9     /* FIX(1.40200) */
+    li          t5, 0x1c5a2     /* FIX(1.77200) */
+    li          t6, 0xffff492e  /* -FIX(0.71414) */
+    li          t7, 0xffffa7e6  /* -FIX(0.34414) */
+    repl.ph     t8, 128
+
+0:
+    lw          s0, 0(a3)
+    lw          t0, 0(a1)
+    lw          t1, 4(a1)
+    lw          t2, 8(a1)
+    sll         s5, a2, 2
+    addiu       s1, -1
+    lwx         s2, s5(t0)
+    lwx         s3, s5(t1)
+    lwx         s4, s5(t2)
+    addu        t9, s2, a0
+    addiu       a2, 1
+
+1:
+    lbu         s7, 0(s4)       /* cr */
+    lbu         s6, 0(s3)       /* cb */
+    lbu         s5, 0(s2)       /* y */
+    addiu       s2, 1
+    addiu       s4, 1
+    addiu       s7, -128
+    addiu       s6, -128
+    mul         t2, t7, s6
+    mul         t0, t6, s7      /* Crgtab[cr] */
+    sll         s7, 15
+    mulq_rs.w   t1, t4, s7      /* Crrtab[cr] */
+    sll         s6, 15
+    addu        t2, t3          /* Cbgtab[cb] */
+    addu        t2, t0
+
+    mulq_rs.w   t0, t5, s6      /* Cbbtab[cb] */
+    sra         t2, 16
+    addu        t1, s5
+    addu        t2, s5          /* add y */
+    ins         t2, t1, 16, 16
+    subu.ph     t2, t2, t8
+    addu        t0, s5
+    shll_s.ph   t2, t2, 8
+    subu        t0, 128
+    shra.ph     t2, t2, 8
+    shll_s.w    t0, t0, 24
+    addu.ph     t2, t2, t8      /* clip & store */
+    sra         t0, t0, 24
+    sra         t1, t2, 16
+    addiu       t0, 128
+
+    STORE_YCC_TO_RGB t1, t2, t0, s0
+
+    bne         s2, t9, 1b
+     addiu      s3, 1
+    bgtz        s1, 0b
+     addiu      a3, 4
+
+    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    j           ra
+     nop
+END(jsimd_ycc_\colorid\()_convert_dspr2)
+
+.purgem STORE_YCC_TO_RGB
+
+.endm
+
+/*-------------------------------------id -- pix R  G  B  A */
+GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extrgb,  3, 0, 1, 2, 3
+GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extbgr,  3, 2, 1, 0, 3
+GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extrgbx, 4, 0, 1, 2, 3
+GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extbgrx, 4, 2, 1, 0, 3
+GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extxbgr, 4, 3, 2, 1, 0
+GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extxrgb, 4, 1, 2, 3, 0
+
+
+/*****************************************************************************/
+/*
+ * jsimd_extrgb_gray_convert_dspr2
+ * jsimd_extbgr_gray_convert_dspr2
+ * jsimd_extrgbx_gray_convert_dspr2
+ * jsimd_extbgrx_gray_convert_dspr2
+ * jsimd_extxbgr_gray_convert_dspr2
+ * jsimd_extxrgb_gray_convert_dspr2
+ *
+ * Colorspace conversion RGB -> GRAY
+ */
+
+.macro GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2  colorid, pixel_size, \
+                                              r_offs, g_offs, b_offs
+
+.macro DO_RGB_TO_GRAY  r, g, b, inptr
+    lbu         \r, \r_offs(\inptr)
+    lbu         \g, \g_offs(\inptr)
+    lbu         \b, \b_offs(\inptr)
+    addiu       \inptr, \pixel_size
+.endm
+
+LEAF_DSPR2(jsimd_\colorid\()_gray_convert_dspr2)
+/*
+ * a0     = cinfo->image_width
+ * a1     = input_buf
+ * a2     = output_buf
+ * a3     = output_row
+ * 16(sp) = num_rows
+ */
+    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    li          s0, 0x4c8b      /* s0 = FIX(0.29900) */
+    li          s1, 0x9646      /* s1 = FIX(0.58700) */
+    li          s2, 0x1d2f      /* s2 = FIX(0.11400) */
+    li          s7, 0x8000      /* s7 = FIX(0.50000) */
+    lw          s6, 48(sp)
+    andi        t7, a0, 3
+
+0:
+    addiu       s6, -1          /* s6 = num_rows */
+    lw          t0, 0(a1)
+    lw          t1, 0(a2)
+    sll         t3, a3, 2
+    lwx         t1, t3(t1)
+    addiu       a3, 1
+    addu        t9, t1, a0
+    subu        t8, t9, t7
+    beq         t1, t8, 2f
+     nop
+
+1:
+    DO_RGB_TO_GRAY t3, t4, t5, t0
+    DO_RGB_TO_GRAY s3, s4, s5, t0
+
+    mtlo        s7, $ac0
+    maddu       $ac0, s2, t5
+    maddu       $ac0, s1, t4
+    maddu       $ac0, s0, t3
+    mtlo        s7, $ac1
+    maddu       $ac1, s2, s5
+    maddu       $ac1, s1, s4
+    maddu       $ac1, s0, s3
+    extr.w      t6, $ac0, 16
+
+    DO_RGB_TO_GRAY t3, t4, t5, t0
+    DO_RGB_TO_GRAY s3, s4, s5, t0
+
+    mtlo        s7, $ac0
+    maddu       $ac0, s2, t5
+    maddu       $ac0, s1, t4
+    extr.w      t2, $ac1, 16
+    maddu       $ac0, s0, t3
+    mtlo        s7, $ac1
+    maddu       $ac1, s2, s5
+    maddu       $ac1, s1, s4
+    maddu       $ac1, s0, s3
+    extr.w      t5, $ac0, 16
+    sb          t6, 0(t1)
+    sb          t2, 1(t1)
+    extr.w      t3, $ac1, 16
+    addiu       t1, 4
+    sb          t5, -2(t1)
+    sb          t3, -1(t1)
+    bne         t1, t8, 1b
+     nop
+
+2:
+    beqz        t7, 4f
+     nop
+
+3:
+    DO_RGB_TO_GRAY t3, t4, t5, t0
+
+    mtlo        s7, $ac0
+    maddu       $ac0, s2, t5
+    maddu       $ac0, s1, t4
+    maddu       $ac0, s0, t3
+    extr.w      t6, $ac0, 16
+    sb          t6, 0(t1)
+    addiu       t1, 1
+    bne         t1, t9, 3b
+     nop
+
+4:
+    bgtz        s6, 0b
+     addiu      a1, 4
+
+    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    j           ra
+     nop
+END(jsimd_\colorid\()_gray_convert_dspr2)
+
+.purgem DO_RGB_TO_GRAY
+
+.endm
+
+/*-------------------------------------id --  pix R  G  B */
+GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extrgb,  3, 0, 1, 2
+GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extbgr,  3, 2, 1, 0
+GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extrgbx, 4, 0, 1, 2
+GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extbgrx, 4, 2, 1, 0
+GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extxbgr, 4, 3, 2, 1
+GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extxrgb, 4, 1, 2, 3
+
+
+/*****************************************************************************/
+/*
+ * jsimd_h2v2_merged_upsample_dspr2
+ * jsimd_h2v2_extrgb_merged_upsample_dspr2
+ * jsimd_h2v2_extrgbx_merged_upsample_dspr2
+ * jsimd_h2v2_extbgr_merged_upsample_dspr2
+ * jsimd_h2v2_extbgrx_merged_upsample_dspr2
+ * jsimd_h2v2_extxbgr_merged_upsample_dspr2
+ * jsimd_h2v2_extxrgb_merged_upsample_dspr2
+ *
+ * Merged h2v2 upsample routines
+ */
+.macro GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2  colorid, pixel_size, \
+                                            r1_offs, g1_offs, \
+                                            b1_offs, a1_offs, \
+                                            r2_offs, g2_offs, \
+                                            b2_offs, a2_offs
+
+.macro STORE_H2V2_2_PIXELS  scratch0 scratch1 scratch2 scratch3 scratch4 \
+                            scratch5 outptr
+    sb          \scratch0, \r1_offs(\outptr)
+    sb          \scratch1, \g1_offs(\outptr)
+    sb          \scratch2, \b1_offs(\outptr)
+    sb          \scratch3, \r2_offs(\outptr)
+    sb          \scratch4, \g2_offs(\outptr)
+    sb          \scratch5, \b2_offs(\outptr)
+.if (\pixel_size == 8)
+    li          \scratch0, 0xFF
+    sb          \scratch0, \a1_offs(\outptr)
+    sb          \scratch0, \a2_offs(\outptr)
+.endif
+    addiu       \outptr, \pixel_size
+.endm
+
+.macro STORE_H2V2_1_PIXEL  scratch0 scratch1 scratch2 outptr
+    sb          \scratch0, \r1_offs(\outptr)
+    sb          \scratch1, \g1_offs(\outptr)
+    sb          \scratch2, \b1_offs(\outptr)
+
+.if (\pixel_size == 8)
+    li          t0, 0xFF
+    sb          t0, \a1_offs(\outptr)
+.endif
+.endm
+
+LEAF_DSPR2(jsimd_h2v2_\colorid\()_merged_upsample_dspr2)
+/*
+ * a0     = cinfo->output_width
+ * a1     = input_buf
+ * a2     = in_row_group_ctr
+ * a3     = output_buf
+ * 16(sp) = cinfo->sample_range_limit
+ */
+    SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra
+
+    lw          t9, 56(sp)      /* cinfo->sample_range_limit */
+    lw          v0, 0(a1)
+    lw          v1, 4(a1)
+    lw          t0, 8(a1)
+    sll         t1, a2, 3
+    addiu       t2, t1, 4
+    sll         t3, a2, 2
+    lw          t4, 0(a3)       /* t4 = output_buf[0] */
+    lwx         t1, t1(v0)      /* t1 = input_buf[0][in_row_group_ctr*2] */
+    lwx         t2, t2(v0)      /* t2 = input_buf[0][in_row_group_ctr*2 + 1] */
+    lwx         t5, t3(v1)      /* t5 = input_buf[1][in_row_group_ctr] */
+    lwx         t6, t3(t0)      /* t6 = input_buf[2][in_row_group_ctr] */
+    lw          t7, 4(a3)       /* t7 = output_buf[1] */
+    li          s1, 0xe6ea
+    addiu       t8, s1, 0x7fff    /* t8 = 0x166e9 [FIX(1.40200)] */
+    addiu       s0, t8, 0x5eb9    /* s0 = 0x1c5a2 [FIX(1.77200)] */
+    addiu       s1, zero, 0xa7e6  /* s4 = 0xffffa7e6 [-FIX(0.34414)] */
+    xori        s2, s1, 0xeec8    /* s3 = 0xffff492e [-FIX(0.71414)] */
+    srl         t3, a0, 1
+    blez        t3, 2f
+     addu       t0, t5, t3      /* t0 = end address */
+ 1:
+    lbu         t3, 0(t5)
+    lbu         s3, 0(t6)
+    addiu       t5, t5, 1
+    addiu       t3, t3, -128    /* (cb - 128) */
+    addiu       s3, s3, -128    /* (cr - 128) */
+    mult        $ac1, s1, t3
+    madd        $ac1, s2, s3
+    sll         s3, s3, 15
+    sll         t3, t3, 15
+    mulq_rs.w   s4, t8, s3      /* s4 = (C1 * cr + ONE_HALF)>> SCALEBITS */
+    extr_r.w    s5, $ac1, 16
+    mulq_rs.w   s6, s0, t3      /* s6 = (C2 * cb + ONE_HALF)>> SCALEBITS */
+    lbu         v0, 0(t1)
+    addiu       t6, t6, 1
+    addiu       t1, t1, 2
+    addu        t3, v0, s4      /* y+cred */
+    addu        s3, v0, s5      /* y+cgreen */
+    addu        v1, v0, s6      /* y+cblue */
+    addu        t3, t9, t3      /* y+cred */
+    addu        s3, t9, s3      /* y+cgreen */
+    addu        v1, t9, v1      /* y+cblue */
+    lbu         AT, 0(t3)
+    lbu         s7, 0(s3)
+    lbu         ra, 0(v1)
+    lbu         v0, -1(t1)
+    addu        t3, v0, s4      /* y+cred */
+    addu        s3, v0, s5      /* y+cgreen */
+    addu        v1, v0, s6      /* y+cblue */
+    addu        t3, t9, t3      /* y+cred */
+    addu        s3, t9, s3      /* y+cgreen */
+    addu        v1, t9, v1      /* y+cblue */
+    lbu         t3, 0(t3)
+    lbu         s3, 0(s3)
+    lbu         v1, 0(v1)
+    lbu         v0, 0(t2)
+
+    STORE_H2V2_2_PIXELS AT, s7, ra, t3, s3, v1, t4
+
+    addu        t3, v0, s4      /* y+cred */
+    addu        s3, v0, s5      /* y+cgreen */
+    addu        v1, v0, s6      /* y+cblue */
+    addu        t3, t9, t3      /* y+cred */
+    addu        s3, t9, s3      /* y+cgreen */
+    addu        v1, t9, v1      /* y+cblue */
+    lbu         AT, 0(t3)
+    lbu         s7, 0(s3)
+    lbu         ra, 0(v1)
+    lbu         v0, 1(t2)
+    addiu       t2, t2, 2
+    addu        t3, v0, s4      /* y+cred */
+    addu        s3, v0, s5      /* y+cgreen */
+    addu        v1, v0, s6      /* y+cblue */
+    addu        t3, t9, t3      /* y+cred */
+    addu        s3, t9, s3      /* y+cgreen */
+    addu        v1, t9, v1      /* y+cblue */
+    lbu         t3, 0(t3)
+    lbu         s3, 0(s3)
+    lbu         v1, 0(v1)
+
+    STORE_H2V2_2_PIXELS AT, s7, ra, t3, s3, v1, t7
+
+    bne         t0, t5, 1b
+     nop
+2:
+    andi        t0, a0, 1
+    beqz        t0, 4f
+     lbu        t3, 0(t5)
+    lbu         s3, 0(t6)
+    addiu       t3, t3, -128    /* (cb - 128) */
+    addiu       s3, s3, -128    /* (cr - 128) */
+    mult        $ac1, s1, t3
+    madd        $ac1, s2, s3
+    sll         s3, s3, 15
+    sll         t3, t3, 15
+    lbu         v0, 0(t1)
+    extr_r.w    s5, $ac1, 16
+    mulq_rs.w   s4, t8, s3      /* s4 = (C1 * cr + ONE_HALF)>> SCALEBITS */
+    mulq_rs.w   s6, s0, t3      /* s6 = (C2 * cb + ONE_HALF)>> SCALEBITS */
+    addu        t3, v0, s4      /* y+cred */
+    addu        s3, v0, s5      /* y+cgreen */
+    addu        v1, v0, s6      /* y+cblue */
+    addu        t3, t9, t3      /* y+cred */
+    addu        s3, t9, s3      /* y+cgreen */
+    addu        v1, t9, v1      /* y+cblue */
+    lbu         t3, 0(t3)
+    lbu         s3, 0(s3)
+    lbu         v1, 0(v1)
+    lbu         v0, 0(t2)
+
+    STORE_H2V2_1_PIXEL t3, s3, v1, t4
+
+    addu        t3, v0, s4      /* y+cred */
+    addu        s3, v0, s5      /* y+cgreen */
+    addu        v1, v0, s6      /* y+cblue */
+    addu        t3, t9, t3      /* y+cred */
+    addu        s3, t9, s3      /* y+cgreen */
+    addu        v1, t9, v1      /* y+cblue */
+    lbu         t3, 0(t3)
+    lbu         s3, 0(s3)
+    lbu         v1, 0(v1)
+
+    STORE_H2V2_1_PIXEL t3, s3, v1, t7
+4:
+    RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra
+
+    j           ra
+     nop
+
+END(jsimd_h2v2_\colorid\()_merged_upsample_dspr2)
+
+.purgem STORE_H2V2_1_PIXEL
+.purgem STORE_H2V2_2_PIXELS
+.endm
+
+/*------------------------------------id -- pix R1 G1 B1 A1 R2 G2 B2 A2 */
+GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extrgb,  6, 0, 1, 2, 6, 3, 4, 5, 6
+GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extbgr,  6, 2, 1, 0, 3, 5, 4, 3, 6
+GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extrgbx, 8, 0, 1, 2, 3, 4, 5, 6, 7
+GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extbgrx, 8, 2, 1, 0, 3, 6, 5, 4, 7
+GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extxbgr, 8, 3, 2, 1, 0, 7, 6, 5, 4
+GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extxrgb, 8, 1, 2, 3, 0, 5, 6, 7, 4
+
+
+/*****************************************************************************/
+/*
+ * jsimd_h2v1_merged_upsample_dspr2
+ * jsimd_h2v1_extrgb_merged_upsample_dspr2
+ * jsimd_h2v1_extrgbx_merged_upsample_dspr2
+ * jsimd_h2v1_extbgr_merged_upsample_dspr2
+ * jsimd_h2v1_extbgrx_merged_upsample_dspr2
+ * jsimd_h2v1_extxbgr_merged_upsample_dspr2
+ * jsimd_h2v1_extxrgb_merged_upsample_dspr2
+ *
+ * Merged h2v1 upsample routines
+ */
+
+.macro GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2  colorid, pixel_size, \
+                                            r1_offs, g1_offs, \
+                                            b1_offs, a1_offs, \
+                                            r2_offs, g2_offs, \
+                                            b2_offs, a2_offs
+
+.macro STORE_H2V1_2_PIXELS  scratch0 scratch1 scratch2 scratch3 scratch4 \
+                            scratch5 outptr
+    sb          \scratch0, \r1_offs(\outptr)
+    sb          \scratch1, \g1_offs(\outptr)
+    sb          \scratch2, \b1_offs(\outptr)
+    sb          \scratch3, \r2_offs(\outptr)
+    sb          \scratch4, \g2_offs(\outptr)
+    sb          \scratch5, \b2_offs(\outptr)
+.if (\pixel_size == 8)
+    li          t0, 0xFF
+    sb          t0, \a1_offs(\outptr)
+    sb          t0, \a2_offs(\outptr)
+.endif
+    addiu       \outptr, \pixel_size
+.endm
+
+.macro STORE_H2V1_1_PIXEL  scratch0 scratch1 scratch2 outptr
+    sb          \scratch0, \r1_offs(\outptr)
+    sb          \scratch1, \g1_offs(\outptr)
+    sb          \scratch2, \b1_offs(\outptr)
+.if (\pixel_size == 8)
+    li          t0, 0xFF
+    sb          t0, \a1_offs(\outptr)
+.endif
+.endm
+
+LEAF_DSPR2(jsimd_h2v1_\colorid\()_merged_upsample_dspr2)
+/*
+ * a0     = cinfo->output_width
+ * a1     = input_buf
+ * a2     = in_row_group_ctr
+ * a3     = output_buf
+ * 16(sp) = range_limit
+ */
+    SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra
+
+    li          t0, 0xe6ea
+    lw          t1, 0(a1)         /* t1 = input_buf[0] */
+    lw          t2, 4(a1)         /* t2 = input_buf[1] */
+    lw          t3, 8(a1)         /* t3 = input_buf[2] */
+    lw          t8, 56(sp)        /* t8 = range_limit */
+    addiu       s1, t0, 0x7fff    /* s1 = 0x166e9 [FIX(1.40200)] */
+    addiu       s2, s1, 0x5eb9    /* s2 = 0x1c5a2 [FIX(1.77200)] */
+    addiu       s0, t0, 0x9916    /* s0 = 0x8000 */
+    addiu       s4, zero, 0xa7e6  /* s4 = 0xffffa7e6 [-FIX(0.34414)] */
+    xori        s3, s4, 0xeec8    /* s3 = 0xffff492e [-FIX(0.71414)] */
+    srl         t0, a0, 1
+    sll         t4, a2, 2
+    lwx         s5, t4(t1)      /* s5 = inptr0 */
+    lwx         s6, t4(t2)      /* s6 = inptr1 */
+    lwx         s7, t4(t3)      /* s7 = inptr2 */
+    lw          t7, 0(a3)       /* t7 = outptr */
+    blez        t0, 2f
+     addu       t9, s6, t0      /* t9 = end address */
+1:
+    lbu         t2, 0(s6)       /* t2 = cb */
+    lbu         t0, 0(s7)       /* t0 = cr */
+    lbu         t1, 0(s5)       /* t1 = y */
+    addiu       t2, t2, -128    /* t2 = cb - 128 */
+    addiu       t0, t0, -128    /* t0 = cr - 128 */
+    mult        $ac1, s4, t2
+    madd        $ac1, s3, t0
+    sll         t0, t0, 15
+    sll         t2, t2, 15
+    mulq_rs.w   t0, s1, t0      /* t0 = (C1*cr + ONE_HALF)>> SCALEBITS */
+    extr_r.w    t5, $ac1, 16
+    mulq_rs.w   t6, s2, t2      /* t6 = (C2*cb + ONE_HALF)>> SCALEBITS */
+    addiu       s7, s7, 1
+    addiu       s6, s6, 1
+    addu        t2, t1, t0      /* t2 = y + cred */
+    addu        t3, t1, t5      /* t3 = y + cgreen */
+    addu        t4, t1, t6      /* t4 = y + cblue */
+    addu        t2, t8, t2
+    addu        t3, t8, t3
+    addu        t4, t8, t4
+    lbu         t1, 1(s5)
+    lbu         v0, 0(t2)
+    lbu         v1, 0(t3)
+    lbu         ra, 0(t4)
+    addu        t2, t1, t0
+    addu        t3, t1, t5
+    addu        t4, t1, t6
+    addu        t2, t8, t2
+    addu        t3, t8, t3
+    addu        t4, t8, t4
+    lbu         t2, 0(t2)
+    lbu         t3, 0(t3)
+    lbu         t4, 0(t4)
+
+    STORE_H2V1_2_PIXELS v0, v1, ra, t2, t3, t4, t7
+
+    bne         t9, s6, 1b
+     addiu      s5, s5, 2
+2:
+    andi        t0, a0, 1
+    beqz        t0, 4f
+     nop
+3:
+    lbu         t2, 0(s6)
+    lbu         t0, 0(s7)
+    lbu         t1, 0(s5)
+    addiu       t2, t2, -128    /* (cb - 128) */
+    addiu       t0, t0, -128    /* (cr - 128) */
+    mul         t3, s4, t2
+    mul         t4, s3, t0
+    sll         t0, t0, 15
+    sll         t2, t2, 15
+    mulq_rs.w   t0, s1, t0      /* (C1*cr + ONE_HALF)>> SCALEBITS */
+    mulq_rs.w   t6, s2, t2      /* (C2*cb + ONE_HALF)>> SCALEBITS */
+    addu        t3, t3, s0
+    addu        t3, t4, t3
+    sra         t5, t3, 16      /* (C4*cb + ONE_HALF + C3*cr)>> SCALEBITS */
+    addu        t2, t1, t0      /* y + cred */
+    addu        t3, t1, t5      /* y + cgreen */
+    addu        t4, t1, t6      /* y + cblue */
+    addu        t2, t8, t2
+    addu        t3, t8, t3
+    addu        t4, t8, t4
+    lbu         t2, 0(t2)
+    lbu         t3, 0(t3)
+    lbu         t4, 0(t4)
+
+    STORE_H2V1_1_PIXEL t2, t3, t4, t7
+4:
+    RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra
+
+    j           ra
+     nop
+
+END(jsimd_h2v1_\colorid\()_merged_upsample_dspr2)
+
+.purgem STORE_H2V1_1_PIXEL
+.purgem STORE_H2V1_2_PIXELS
+.endm
+
+/*------------------------------------id -- pix R1 G1 B1 A1 R2 G2 B2 A2 */
+GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extrgb,  6, 0, 1, 2, 6, 3, 4, 5, 6
+GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extbgr,  6, 2, 1, 0, 3, 5, 4, 3, 6
+GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extrgbx, 8, 0, 1, 2, 3, 4, 5, 6, 7
+GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extbgrx, 8, 2, 1, 0, 3, 6, 5, 4, 7
+GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extxbgr, 8, 3, 2, 1, 0, 7, 6, 5, 4
+GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extxrgb, 8, 1, 2, 3, 0, 5, 6, 7, 4
+
+
+/*****************************************************************************/
+/*
+ * jsimd_h2v2_fancy_upsample_dspr2
+ *
+ * Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
+ */
+LEAF_DSPR2(jsimd_h2v2_fancy_upsample_dspr2)
+/*
+ * a0 = cinfo->max_v_samp_factor
+ * a1 = downsampled_width
+ * a2 = input_data
+ * a3 = output_data_ptr
+ */
+    SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4, s5
+
+    li            s4, 0
+    lw            s2, 0(a3)       /* s2 = *output_data_ptr */
+0:
+    li            t9, 2
+    lw            s1, -4(a2)      /* s1 = inptr1 */
+
+1:
+    lw            s0, 0(a2)       /* s0 = inptr0 */
+    lwx           s3, s4(s2)
+    addiu         s5, a1, -2      /* s5 = downsampled_width - 2 */
+    srl           t4, s5, 1
+    sll           t4, t4, 1
+    lbu           t0, 0(s0)
+    lbu           t1, 1(s0)
+    lbu           t2, 0(s1)
+    lbu           t3, 1(s1)
+    addiu         s0, 2
+    addiu         s1, 2
+    addu          t8, s0, t4      /* t8 = end address */
+    andi          s5, s5, 1       /* s5 = residual */
+    sll           t4, t0, 1
+    sll           t6, t1, 1
+    addu          t0, t0, t4      /* t0 = (*inptr0++) * 3 */
+    addu          t1, t1, t6      /* t1 = (*inptr0++) * 3 */
+    addu          t7, t0, t2      /* t7 = thiscolsum */
+    addu          t6, t1, t3      /* t5 = nextcolsum */
+    sll           t0, t7, 2       /* t0 = thiscolsum * 4 */
+    subu          t1, t0, t7      /* t1 = thiscolsum * 3 */
+    shra_r.w      t0, t0, 4
+    addiu         t1, 7
+    addu          t1, t1, t6
+    srl           t1, t1, 4
+    sb            t0, 0(s3)
+    sb            t1, 1(s3)
+    beq           t8, s0, 22f     /* skip to final iteration if width == 3 */
+     addiu        s3, 2
+2:
+    lh            t0, 0(s0)       /* t0 = A3|A2 */
+    lh            t2, 0(s1)       /* t2 = B3|B2 */
+    addiu         s0, 2
+    addiu         s1, 2
+    preceu.ph.qbr t0, t0          /* t0 = 0|A3|0|A2 */
+    preceu.ph.qbr t2, t2          /* t2 = 0|B3|0|B2 */
+    shll.ph       t1, t0, 1
+    sll           t3, t6, 1
+    addu.ph       t0, t1, t0      /* t0 = A3*3|A2*3 */
+    addu          t3, t3, t6      /* t3 = this * 3 */
+    addu.ph       t0, t0, t2      /* t0 = next2|next1 */
+    addu          t1, t3, t7
+    andi          t7, t0, 0xFFFF  /* t7 = next1 */
+    sll           t2, t7, 1
+    addu          t2, t7, t2      /* t2 = next1*3 */
+    addu          t4, t2, t6
+    srl           t6, t0, 16      /* t6 = next2 */
+    shra_r.w      t1, t1, 4       /* t1 = (this*3 + last + 8) >> 4 */
+    addu          t0, t3, t7
+    addiu         t0, 7
+    srl           t0, t0, 4       /* t0 = (this*3 + next1 + 7) >> 4 */
+    shra_r.w      t4, t4, 4       /* t3 = (next1*3 + this + 8) >> 4 */
+    addu          t2, t2, t6
+    addiu         t2, 7
+    srl           t2, t2, 4       /* t2 = (next1*3 + next2 + 7) >> 4 */
+    sb            t1, 0(s3)
+    sb            t0, 1(s3)
+    sb            t4, 2(s3)
+    sb            t2, 3(s3)
+    bne           t8, s0, 2b
+     addiu        s3, 4
+22:
+    beqz          s5, 4f
+     addu         t8, s0, s5
+3:
+    lbu           t0, 0(s0)
+    lbu           t2, 0(s1)
+    addiu         s0, 1
+    addiu         s1, 1
+    sll           t3, t6, 1
+    sll           t1, t0, 1
+    addu          t1, t0, t1      /* t1 = inptr0 * 3 */
+    addu          t3, t3, t6      /* t3 = thiscolsum * 3 */
+    addu          t5, t1, t2
+    addu          t1, t3, t7
+    shra_r.w      t1, t1, 4
+    addu          t0, t3, t5
+    addiu         t0, 7
+    srl           t0, t0, 4
+    sb            t1, 0(s3)
+    sb            t0, 1(s3)
+    addiu         s3, 2
+    move          t7, t6
+    bne           t8, s0, 3b
+     move         t6, t5
+4:
+    sll           t0, t6, 2       /* t0 = thiscolsum * 4 */
+    subu          t1, t0, t6      /* t1 = thiscolsum * 3 */
+    addu          t1, t1, t7
+    addiu         s4, 4
+    shra_r.w      t1, t1, 4
+    addiu         t0, 7
+    srl           t0, t0, 4
+    sb            t1, 0(s3)
+    sb            t0, 1(s3)
+    addiu         t9, -1
+    addiu         s3, 2
+    bnez          t9, 1b
+     lw           s1, 4(a2)
+    srl           t0, s4, 2
+    subu          t0, a0, t0
+    bgtz          t0, 0b
+     addiu        a2, 4
+
+    RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4, s5
+
+    j             ra
+     nop
+END(jsimd_h2v2_fancy_upsample_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_h2v1_fancy_upsample_dspr2)
+/*
+ * a0 = cinfo->max_v_samp_factor
+ * a1 = downsampled_width
+ * a2 = input_data
+ * a3 = output_data_ptr
+ */
+    SAVE_REGS_ON_STACK 16, s0, s1, s2, s3
+
+    .set at
+
+    beqz          a0, 3f
+     sll          t0, a0, 2
+    lw            s1, 0(a3)
+    li            s3, 0x10001
+    addu          s0, s1, t0
+0:
+    addiu         t8, a1, -2
+    srl           t9, t8, 2
+    lw            t7, 0(a2)
+    lw            s2, 0(s1)
+    lbu           t0, 0(t7)
+    lbu           t1, 1(t7)       /* t1 = inptr[1] */
+    sll           t2, t0, 1
+    addu          t2, t2, t0      /* t2 = invalue*3 */
+    addu          t2, t2, t1
+    shra_r.w      t2, t2, 2
+    sb            t0, 0(s2)
+    sb            t2, 1(s2)
+    beqz          t9, 11f
+     addiu        s2, 2
+1:
+    ulw           t0, 0(t7)       /* t0 = |P3|P2|P1|P0| */
+    ulw           t1, 1(t7)
+    ulh           t2, 4(t7)       /* t2 = |0|0|P5|P4| */
+    preceu.ph.qbl t3, t0          /* t3 = |0|P3|0|P2| */
+    preceu.ph.qbr t0, t0          /* t0 = |0|P1|0|P0| */
+    preceu.ph.qbr t2, t2          /* t2 = |0|P5|0|P4| */
+    preceu.ph.qbl t4, t1          /* t4 = |0|P4|0|P3| */
+    preceu.ph.qbr t1, t1          /* t1 = |0|P2|0|P1| */
+    shll.ph       t5, t4, 1
+    shll.ph       t6, t1, 1
+    addu.ph       t5, t5, t4      /* t5 = |P4*3|P3*3| */
+    addu.ph       t6, t6, t1      /* t6 = |P2*3|P1*3| */
+    addu.ph       t4, t3, s3
+    addu.ph       t0, t0, s3
+    addu.ph       t4, t4, t5
+    addu.ph       t0, t0, t6
+    shrl.ph       t4, t4, 2       /* t4 = |0|P3|0|P2| */
+    shrl.ph       t0, t0, 2       /* t0 = |0|P1|0|P0| */
+    addu.ph       t2, t2, t5
+    addu.ph       t3, t3, t6
+    shra_r.ph     t2, t2, 2       /* t2 = |0|P5|0|P4| */
+    shra_r.ph     t3, t3, 2       /* t3 = |0|P3|0|P2| */
+    shll.ph       t2, t2, 8
+    shll.ph       t3, t3, 8
+    or            t2, t4, t2
+    or            t3, t3, t0
+    addiu         t9, -1
+    usw           t3, 0(s2)
+    usw           t2, 4(s2)
+    addiu         s2, 8
+    bgtz          t9, 1b
+     addiu        t7, 4
+11:
+    andi          t8, 3
+    beqz          t8, 22f
+     addiu        t7, 1
+
+2:
+    lbu           t0, 0(t7)
+    addiu         t7, 1
+    sll           t1, t0, 1
+    addu          t2, t0, t1      /* t2 = invalue */
+    lbu           t3, -2(t7)
+    lbu           t4, 0(t7)
+    addiu         t3, 1
+    addiu         t4, 2
+    addu          t3, t3, t2
+    addu          t4, t4, t2
+    srl           t3, 2
+    srl           t4, 2
+    sb            t3, 0(s2)
+    sb            t4, 1(s2)
+    addiu         t8, -1
+    bgtz          t8, 2b
+     addiu        s2, 2
+
+22:
+    lbu           t0, 0(t7)
+    lbu           t2, -1(t7)
+    sll           t1, t0, 1
+    addu          t1, t1, t0      /* t1 = invalue * 3 */
+    addu          t1, t1, t2
+    addiu         t1, 1
+    srl           t1, t1, 2
+    sb            t1, 0(s2)
+    sb            t0, 1(s2)
+    addiu         s1, 4
+    bne           s1, s0, 0b
+     addiu        a2, 4
+3:
+    RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3
+
+    j             ra
+     nop
+END(jsimd_h2v1_fancy_upsample_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_h2v1_downsample_dspr2)
+/*
+ * a0     = cinfo->image_width
+ * a1     = cinfo->max_v_samp_factor
+ * a2     = compptr->v_samp_factor
+ * a3     = compptr->width_in_blocks
+ * 16(sp) = input_data
+ * 20(sp) = output_data
+ */
+    .set at
+
+    SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4
+
+    beqz        a2, 7f
+     lw         s1, 44(sp)      /* s1 = output_data */
+    lw          s0, 40(sp)      /* s0 = input_data */
+    srl         s2, a0, 2
+    andi        t9, a0, 2
+    srl         t7, t9, 1
+    addu        s2, t7, s2
+    sll         t0, a3, 3       /* t0 = width_in_blocks*DCT */
+    srl         t7, t0, 1
+    subu        s2, t7, s2
+0:
+    andi        t6, a0, 1       /* t6 = temp_index */
+    addiu       t6, -1
+    lw          t4, 0(s1)       /* t4 = outptr */
+    lw          t5, 0(s0)       /* t5 = inptr0 */
+    li          s3, 0           /* s3 = bias */
+    srl         t7, a0, 1       /* t7 = image_width1 */
+    srl         s4, t7, 2
+    andi        t8, t7, 3
+1:
+    ulhu        t0, 0(t5)
+    ulhu        t1, 2(t5)
+    ulhu        t2, 4(t5)
+    ulhu        t3, 6(t5)
+    raddu.w.qb  t0, t0
+    raddu.w.qb  t1, t1
+    raddu.w.qb  t2, t2
+    raddu.w.qb  t3, t3
+    shra.ph     t0, t0, 1
+    shra_r.ph   t1, t1, 1
+    shra.ph     t2, t2, 1
+    shra_r.ph   t3, t3, 1
+    sb          t0, 0(t4)
+    sb          t1, 1(t4)
+    sb          t2, 2(t4)
+    sb          t3, 3(t4)
+    addiu       s4, -1
+    addiu       t4, 4
+    bgtz        s4, 1b
+     addiu      t5, 8
+    beqz        t8, 3f
+     addu       s4, t4, t8
+2:
+    ulhu        t0, 0(t5)
+    raddu.w.qb  t0, t0
+    addqh.w     t0, t0, s3
+    xori        s3, s3, 1
+    sb          t0, 0(t4)
+    addiu       t4, 1
+    bne         t4, s4, 2b
+     addiu      t5, 2
+3:
+    lbux        t1, t6(t5)
+    sll         t1, 1
+    addqh.w     t2, t1, s3      /* t2 = pixval1 */
+    xori        s3, s3, 1
+    addqh.w     t3, t1, s3      /* t3 = pixval2 */
+    blez        s2, 5f
+     append     t3, t2,  8
+    addu        t5, t4, s2      /* t5 = loop_end2 */
+4:
+    ush         t3, 0(t4)
+    addiu       s2, -1
+    bgtz        s2, 4b
+     addiu      t4,  2
+5:
+    beqz        t9, 6f
+     nop
+    sb          t2, 0(t4)
+6:
+    addiu       s1, 4
+    addiu       a2, -1
+    bnez        a2, 0b
+     addiu      s0, 4
+7:
+    RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4
+
+    j           ra
+    nop
+END(jsimd_h2v1_downsample_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_h2v2_downsample_dspr2)
+/*
+ * a0     = cinfo->image_width
+ * a1     = cinfo->max_v_samp_factor
+ * a2     = compptr->v_samp_factor
+ * a3     = compptr->width_in_blocks
+ * 16(sp) = input_data
+ * 20(sp) = output_data
+ */
+    .set at
+
+    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    beqz        a2, 8f
+     lw         s1, 52(sp)      /* s1 = output_data */
+    lw          s0, 48(sp)      /* s0 = input_data */
+
+    andi        t6, a0, 1       /* t6 = temp_index */
+    addiu       t6, -1
+    srl         t7, a0, 1       /* t7 = image_width1 */
+    srl         s4, t7, 2
+    andi        t8, t7, 3
+    andi        t9, a0, 2
+    srl         s2, a0, 2
+    srl         t7, t9, 1
+    addu        s2, t7, s2
+    sll         t0, a3, 3       /* s2 = width_in_blocks*DCT */
+    srl         t7, t0, 1
+    subu        s2, t7, s2
+0:
+    lw          t4, 0(s1)       /* t4 = outptr */
+    lw          t5, 0(s0)       /* t5 = inptr0 */
+    lw          s7, 4(s0)       /* s7 = inptr1 */
+    li          s6, 1           /* s6 = bias */
+2:
+    ulw         t0, 0(t5)       /* t0 = |P3|P2|P1|P0| */
+    ulw         t1, 0(s7)       /* t1 = |Q3|Q2|Q1|Q0| */
+    ulw         t2, 4(t5)
+    ulw         t3, 4(s7)
+    precrq.ph.w t7, t0, t1      /* t2 = |P3|P2|Q3|Q2| */
+    ins         t0, t1, 16, 16  /* t0 = |Q1|Q0|P1|P0| */
+    raddu.w.qb  t1, t7
+    raddu.w.qb  t0, t0
+    shra_r.w    t1, t1, 2
+    addiu       t0, 1
+    srl         t0, 2
+    precrq.ph.w t7, t2, t3
+    ins         t2, t3, 16, 16
+    raddu.w.qb  t7, t7
+    raddu.w.qb  t2, t2
+    shra_r.w    t7, t7, 2
+    addiu       t2, 1
+    srl         t2, 2
+    sb          t0, 0(t4)
+    sb          t1, 1(t4)
+    sb          t2, 2(t4)
+    sb          t7, 3(t4)
+    addiu       t4, 4
+    addiu       t5, 8
+    addiu       s4, s4, -1
+    bgtz        s4, 2b
+     addiu      s7, 8
+    beqz        t8, 4f
+     addu       t8, t4, t8
+3:
+    ulhu        t0, 0(t5)
+    ulhu        t1, 0(s7)
+    ins         t0, t1, 16, 16
+    raddu.w.qb  t0, t0
+    addu        t0, t0, s6
+    srl         t0, 2
+    xori        s6, s6, 3
+    sb          t0, 0(t4)
+    addiu       t5, 2
+    addiu       t4, 1
+    bne         t8, t4, 3b
+     addiu      s7, 2
+4:
+    lbux        t1, t6(t5)
+    sll         t1, 1
+    lbux        t0, t6(s7)
+    sll         t0, 1
+    addu        t1, t1, t0
+    addu        t3, t1, s6
+    srl         t0, t3, 2       /* t2 = pixval1 */
+    xori        s6, s6, 3
+    addu        t2, t1, s6
+    srl         t1, t2, 2       /* t3 = pixval2 */
+    blez        s2, 6f
+     append     t1, t0, 8
+5:
+    ush         t1, 0(t4)
+    addiu       s2, -1
+    bgtz        s2, 5b
+     addiu      t4, 2
+6:
+    beqz        t9, 7f
+     nop
+    sb          t0, 0(t4)
+7:
+    addiu       s1, 4
+    addiu       a2, -1
+    bnez        a2, 0b
+     addiu      s0, 8
+8:
+    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    j           ra
+     nop
+END(jsimd_h2v2_downsample_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_h2v2_smooth_downsample_dspr2)
+/*
+ * a0     = input_data
+ * a1     = output_data
+ * a2     = compptr->v_samp_factor
+ * a3     = cinfo->max_v_samp_factor
+ * 16(sp) = cinfo->smoothing_factor
+ * 20(sp) = compptr->width_in_blocks
+ * 24(sp) = cinfo->image_width
+ */
+    .set at
+
+    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    lw          s7, 52(sp)      /* compptr->width_in_blocks */
+    lw          s0, 56(sp)      /* cinfo->image_width */
+    lw          s6, 48(sp)      /* cinfo->smoothing_factor */
+    sll         s7, 3           /* output_cols = width_in_blocks * DCTSIZE */
+    sll         v0, s7, 1
+    subu        v0, v0, s0
+    blez        v0, 2f
+    move        v1, zero
+    addiu       t0, a3, 2       /* t0 = cinfo->max_v_samp_factor + 2 */
+0:
+    addiu       t1, a0, -4
+    sll         t2, v1, 2
+    lwx         t1, t2(t1)
+    move        t3, v0
+    addu        t1, t1, s0
+    lbu         t2, -1(t1)
+1:
+    addiu       t3, t3, -1
+    sb          t2, 0(t1)
+    bgtz        t3, 1b
+    addiu       t1, t1, 1
+    addiu       v1, v1, 1
+    bne         v1, t0, 0b
+    nop
+2:
+    li          v0, 80
+    mul         v0, s6, v0
+    li          v1, 16384
+    move        t4, zero
+    move        t5, zero
+    subu        t6, v1, v0      /* t6 = 16384 - tmp_smoot_f * 80 */
+    sll         t7, s6, 4       /* t7 = tmp_smoot_f * 16 */
+3:
+/* Special case for first column: pretend column -1 is same as column 0 */
+    sll         v0, t4, 2
+    lwx         t8, v0(a1)      /*  outptr = output_data[outrow] */
+    sll         v1, t5, 2
+    addiu       t9, v1, 4
+    addiu       s0, v1, -4
+    addiu       s1, v1, 8
+    lwx         s2, v1(a0)      /* inptr0 = input_data[inrow] */
+    lwx         t9, t9(a0)      /* inptr1 = input_data[inrow+1] */
+    lwx         s0, s0(a0)      /* above_ptr = input_data[inrow-1] */
+    lwx         s1, s1(a0)      /* below_ptr = input_data[inrow+2] */
+    lh          v0, 0(s2)
+    lh          v1, 0(t9)
+    lh          t0, 0(s0)
+    lh          t1, 0(s1)
+    ins         v0, v1, 16, 16
+    ins         t0, t1, 16, 16
+    raddu.w.qb  t2, v0
+    raddu.w.qb  s3, t0
+    lbu         v0, 0(s2)
+    lbu         v1, 2(s2)
+    lbu         t0, 0(t9)
+    lbu         t1, 2(t9)
+    addu        v0, v0, v1
+    mult        $ac1, t2, t6
+    addu        t0, t0, t1
+    lbu         t2, 2(s0)
+    addu        t0, t0, v0
+    lbu         t3, 2(s1)
+    addu        s3, t0, s3
+    lbu         v0, 0(s0)
+    lbu         t0, 0(s1)
+    sll         s3, s3, 1
+    addu        v0, v0, t2
+    addu        t0, t0, t3
+    addu        t0, t0, v0
+    addu        s3, t0, s3
+    madd        $ac1, s3, t7
+    extr_r.w    v0, $ac1, 16
+    addiu       t8, t8, 1
+    addiu       s2, s2, 2
+    addiu       t9, t9, 2
+    addiu       s0, s0, 2
+    addiu       s1, s1, 2
+    sb          v0, -1(t8)
+    addiu       s4, s7, -2
+    and         s4, s4, 3
+    addu        s5, s4, t8      /* end address */
+4:
+    lh          v0, 0(s2)
+    lh          v1, 0(t9)
+    lh          t0, 0(s0)
+    lh          t1, 0(s1)
+    ins         v0, v1, 16, 16
+    ins         t0, t1, 16, 16
+    raddu.w.qb  t2, v0
+    raddu.w.qb  s3, t0
+    lbu         v0, -1(s2)
+    lbu         v1, 2(s2)
+    lbu         t0, -1(t9)
+    lbu         t1, 2(t9)
+    addu        v0, v0, v1
+    mult        $ac1, t2, t6
+    addu        t0, t0, t1
+    lbu         t2, 2(s0)
+    addu        t0, t0, v0
+    lbu         t3, 2(s1)
+    addu        s3, t0, s3
+    lbu         v0, -1(s0)
+    lbu         t0, -1(s1)
+    sll         s3, s3, 1
+    addu        v0, v0, t2
+    addu        t0, t0, t3
+    addu        t0, t0, v0
+    addu        s3, t0, s3
+    madd        $ac1, s3, t7
+    extr_r.w    t2, $ac1, 16
+    addiu       t8, t8, 1
+    addiu       s2, s2, 2
+    addiu       t9, t9, 2
+    addiu       s0, s0, 2
+    sb          t2, -1(t8)
+    bne         s5, t8, 4b
+    addiu       s1, s1, 2
+    addiu       s5, s7, -2
+    subu        s5, s5, s4
+    addu        s5, s5, t8      /* end address */
+5:
+    lh          v0, 0(s2)
+    lh          v1, 0(t9)
+    lh          t0, 0(s0)
+    lh          t1, 0(s1)
+    ins         v0, v1, 16, 16
+    ins         t0, t1, 16, 16
+    raddu.w.qb  t2, v0
+    raddu.w.qb  s3, t0
+    lbu         v0, -1(s2)
+    lbu         v1, 2(s2)
+    lbu         t0, -1(t9)
+    lbu         t1, 2(t9)
+    addu        v0, v0, v1
+    mult        $ac1, t2, t6
+    addu        t0, t0, t1
+    lbu         t2, 2(s0)
+    addu        t0, t0, v0
+    lbu         t3, 2(s1)
+    addu        s3, t0, s3
+    lbu         v0, -1(s0)
+    lbu         t0, -1(s1)
+    sll         s3, s3, 1
+    addu        v0, v0, t2
+    addu        t0, t0, t3
+    lh          v1, 2(t9)
+    addu        t0, t0, v0
+    lh          v0, 2(s2)
+    addu        s3, t0, s3
+    lh          t0, 2(s0)
+    lh          t1, 2(s1)
+    madd        $ac1, s3, t7
+    extr_r.w    t2, $ac1, 16
+    ins         t0, t1, 16, 16
+    ins         v0, v1, 16, 16
+    raddu.w.qb  s3, t0
+    lbu         v1, 4(s2)
+    lbu         t0, 1(t9)
+    lbu         t1, 4(t9)
+    sb          t2, 0(t8)
+    raddu.w.qb  t3, v0
+    lbu         v0, 1(s2)
+    addu        t0, t0, t1
+    mult        $ac1, t3, t6
+    addu        v0, v0, v1
+    lbu         t2, 4(s0)
+    addu        t0, t0, v0
+    lbu         v0, 1(s0)
+    addu        s3, t0, s3
+    lbu         t0, 1(s1)
+    lbu         t3, 4(s1)
+    addu        v0, v0, t2
+    sll         s3, s3, 1
+    addu        t0, t0, t3
+    lh          v1, 4(t9)
+    addu        t0, t0, v0
+    lh          v0, 4(s2)
+    addu        s3, t0, s3
+    lh          t0, 4(s0)
+    lh          t1, 4(s1)
+    madd        $ac1, s3, t7
+    extr_r.w    t2, $ac1, 16
+    ins         t0, t1, 16, 16
+    ins         v0, v1, 16, 16
+    raddu.w.qb  s3, t0
+    lbu         v1, 6(s2)
+    lbu         t0, 3(t9)
+    lbu         t1, 6(t9)
+    sb          t2, 1(t8)
+    raddu.w.qb  t3, v0
+    lbu         v0, 3(s2)
+    addu        t0, t0, t1
+    mult        $ac1, t3, t6
+    addu        v0, v0, v1
+    lbu         t2, 6(s0)
+    addu        t0, t0, v0
+    lbu         v0, 3(s0)
+    addu        s3, t0, s3
+    lbu         t0, 3(s1)
+    lbu         t3, 6(s1)
+    addu        v0, v0, t2
+    sll         s3, s3, 1
+    addu        t0, t0, t3
+    lh          v1, 6(t9)
+    addu        t0, t0, v0
+    lh          v0, 6(s2)
+    addu        s3, t0, s3
+    lh          t0, 6(s0)
+    lh          t1, 6(s1)
+    madd        $ac1, s3, t7
+    extr_r.w    t3, $ac1, 16
+    ins         t0, t1, 16, 16
+    ins         v0, v1, 16, 16
+    raddu.w.qb  s3, t0
+    lbu         v1, 8(s2)
+    lbu         t0, 5(t9)
+    lbu         t1, 8(t9)
+    sb          t3, 2(t8)
+    raddu.w.qb  t2, v0
+    lbu         v0, 5(s2)
+    addu        t0, t0, t1
+    mult        $ac1, t2, t6
+    addu        v0, v0, v1
+    lbu         t2, 8(s0)
+    addu        t0, t0, v0
+    lbu         v0, 5(s0)
+    addu        s3, t0, s3
+    lbu         t0, 5(s1)
+    lbu         t3, 8(s1)
+    addu        v0, v0, t2
+    sll         s3, s3, 1
+    addu        t0, t0, t3
+    addiu       t8, t8, 4
+    addu        t0, t0, v0
+    addiu       s2, s2, 8
+    addu        s3, t0, s3
+    addiu       t9, t9, 8
+    madd        $ac1, s3, t7
+    extr_r.w    t1, $ac1, 16
+    addiu       s0, s0, 8
+    addiu       s1, s1, 8
+    bne         s5, t8, 5b
+    sb          t1, -1(t8)
+/* Special case for last column */
+    lh          v0, 0(s2)
+    lh          v1, 0(t9)
+    lh          t0, 0(s0)
+    lh          t1, 0(s1)
+    ins         v0, v1, 16, 16
+    ins         t0, t1, 16, 16
+    raddu.w.qb  t2, v0
+    raddu.w.qb  s3, t0
+    lbu         v0, -1(s2)
+    lbu         v1, 1(s2)
+    lbu         t0, -1(t9)
+    lbu         t1, 1(t9)
+    addu        v0, v0, v1
+    mult        $ac1, t2, t6
+    addu        t0, t0, t1
+    lbu         t2, 1(s0)
+    addu        t0, t0, v0
+    lbu         t3, 1(s1)
+    addu        s3, t0, s3
+    lbu         v0, -1(s0)
+    lbu         t0, -1(s1)
+    sll         s3, s3, 1
+    addu        v0, v0, t2
+    addu        t0, t0, t3
+    addu        t0, t0, v0
+    addu        s3, t0, s3
+    madd        $ac1, s3, t7
+    extr_r.w    t0, $ac1, 16
+    addiu       t5, t5, 2
+    sb          t0, 0(t8)
+    addiu       t4, t4, 1
+    bne         t4, a2, 3b
+    addiu       t5, t5, 2
+
+    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    j           ra
+     nop
+
+END(jsimd_h2v2_smooth_downsample_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_int_upsample_dspr2)
+/*
+ * a0     = upsample->h_expand[compptr->component_index]
+ * a1     = upsample->v_expand[compptr->component_index]
+ * a2     = input_data
+ * a3     = output_data_ptr
+ * 16(sp) = cinfo->output_width
+ * 20(sp) = cinfo->max_v_samp_factor
+ */
+    .set at
+
+    SAVE_REGS_ON_STACK 16, s0, s1, s2, s3
+
+    lw          s0, 0(a3)       /* s0 = output_data */
+    lw          s1, 32(sp)      /* s1 = cinfo->output_width */
+    lw          s2, 36(sp)      /* s2 = cinfo->max_v_samp_factor */
+    li          t6, 0           /* t6 = inrow */
+    beqz        s2, 10f
+     li         s3, 0           /* s3 = outrow */
+0:
+    addu        t0, a2, t6
+    addu        t7, s0, s3
+    lw          t3, 0(t0)       /* t3 = inptr */
+    lw          t8, 0(t7)       /* t8 = outptr */
+    beqz        s1, 4f
+     addu       t5, t8, s1      /* t5 = outend */
+1:
+    lb          t2, 0(t3)       /* t2 = invalue = *inptr++ */
+    addiu       t3, 1
+    beqz        a0, 3f
+     move       t0, a0          /* t0 = h_expand */
+2:
+    sb          t2, 0(t8)
+    addiu       t0, -1
+    bgtz        t0, 2b
+     addiu      t8, 1
+3:
+    bgt         t5, t8, 1b
+     nop
+4:
+    addiu       t9, a1, -1      /* t9 = v_expand - 1 */
+    blez        t9, 9f
+     nop
+5:
+    lw          t3, 0(s0)
+    lw          t4, 4(s0)
+    subu        t0, s1, 0xF
+    blez        t0, 7f
+     addu       t5, t3, s1      /* t5 = end address */
+    andi        t7, s1, 0xF     /* t7 = residual */
+    subu        t8, t5, t7
+6:
+    ulw         t0, 0(t3)
+    ulw         t1, 4(t3)
+    ulw         t2, 8(t3)
+    usw         t0, 0(t4)
+    ulw         t0, 12(t3)
+    usw         t1, 4(t4)
+    usw         t2, 8(t4)
+    usw         t0, 12(t4)
+    addiu       t3, 16
+    bne         t3, t8, 6b
+     addiu      t4, 16
+    beqz        t7, 8f
+     nop
+7:
+    lbu         t0, 0(t3)
+    sb          t0, 0(t4)
+    addiu       t3, 1
+    bne         t3, t5, 7b
+     addiu      t4, 1
+8:
+    addiu       t9, -1
+    bgtz        t9, 5b
+     addiu      s0, 8
+9:
+    addu        s3, s3, a1
+    bne         s3, s2, 0b
+     addiu      t6, 1
+10:
+    RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3
+
+    j           ra
+     nop
+END(jsimd_int_upsample_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_h2v1_upsample_dspr2)
+/*
+ * a0 = cinfo->max_v_samp_factor
+ * a1 = cinfo->output_width
+ * a2 = input_data
+ * a3 = output_data_ptr
+ */
+    lw          t7, 0(a3)       /* t7 = output_data */
+    andi        t8, a1, 0xf     /* t8 = residual */
+    sll         t0, a0, 2
+    blez        a0, 4f
+     addu       t9, t7, t0      /* t9 = output_data end address */
+0:
+    lw          t5, 0(t7)       /* t5 = outptr */
+    lw          t6, 0(a2)       /* t6 = inptr */
+    addu        t3, t5, a1      /* t3 = outptr + output_width (end address) */
+    subu        t3, t8          /* t3 = end address - residual */
+    beq         t5, t3, 2f
+     move       t4, t8
+1:
+    ulw         t0, 0(t6)       /* t0 = |P3|P2|P1|P0| */
+    ulw         t2, 4(t6)       /* t2 = |P7|P6|P5|P4| */
+    srl         t1, t0, 16      /* t1 = |X|X|P3|P2| */
+    ins         t0, t0, 16, 16  /* t0 = |P1|P0|P1|P0| */
+    ins         t1, t1, 16, 16  /* t1 = |P3|P2|P3|P2| */
+    ins         t0, t0, 8, 16   /* t0 = |P1|P1|P0|P0| */
+    ins         t1, t1, 8, 16   /* t1 = |P3|P3|P2|P2| */
+    usw         t0, 0(t5)
+    usw         t1, 4(t5)
+    srl         t0, t2, 16      /* t0 = |X|X|P7|P6| */
+    ins         t2, t2, 16, 16  /* t2 = |P5|P4|P5|P4| */
+    ins         t0, t0, 16, 16  /* t0 = |P7|P6|P7|P6| */
+    ins         t2, t2, 8, 16   /* t2 = |P5|P5|P4|P4| */
+    ins         t0, t0, 8, 16   /* t0 = |P7|P7|P6|P6| */
+    usw         t2, 8(t5)
+    usw         t0, 12(t5)
+    addiu       t5, 16
+    bne         t5, t3, 1b
+     addiu      t6, 8
+    beqz        t8, 3f
+     move       t4, t8
+2:
+    lbu         t1, 0(t6)
+    sb          t1, 0(t5)
+    sb          t1, 1(t5)
+    addiu       t4, -2
+    addiu       t6, 1
+    bgtz        t4, 2b
+     addiu      t5, 2
+3:
+    addiu       t7, 4
+    bne         t9, t7, 0b
+     addiu      a2, 4
+4:
+    j           ra
+     nop
+END(jsimd_h2v1_upsample_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_h2v2_upsample_dspr2)
+/*
+ * a0 = cinfo->max_v_samp_factor
+ * a1 = cinfo->output_width
+ * a2 = input_data
+ * a3 = output_data_ptr
+ */
+    lw          t7, 0(a3)
+    blez        a0, 7f
+     andi       t9, a1, 0xf     /* t9 = residual */
+0:
+    lw          t6, 0(a2)       /* t6 = inptr */
+    lw          t5, 0(t7)       /* t5 = outptr */
+    addu        t8, t5, a1      /* t8 = outptr end address */
+    subu        t8, t9          /* t8 = end address - residual */
+    beq         t5, t8, 2f
+     move       t4, t9
+1:
+    ulw         t0, 0(t6)
+    srl         t1, t0, 16
+    ins         t0, t0, 16, 16
+    ins         t0, t0, 8, 16
+    ins         t1, t1, 16, 16
+    ins         t1, t1, 8, 16
+    ulw         t2, 4(t6)
+    usw         t0, 0(t5)
+    usw         t1, 4(t5)
+    srl         t3, t2, 16
+    ins         t2, t2, 16, 16
+    ins         t2, t2, 8, 16
+    ins         t3, t3, 16, 16
+    ins         t3, t3, 8, 16
+    usw         t2, 8(t5)
+    usw         t3, 12(t5)
+    addiu       t5, 16
+    bne         t5, t8, 1b
+     addiu      t6, 8
+    beqz        t9, 3f
+     move       t4, t9
+2:
+    lbu         t0, 0(t6)
+    sb          t0, 0(t5)
+    sb          t0, 1(t5)
+    addiu       t4, -2
+    addiu       t6, 1
+    bgtz        t4, 2b
+     addiu      t5, 2
+3:
+    lw          t6, 0(t7)       /* t6 = outptr[0] */
+    lw          t5, 4(t7)       /* t5 = outptr[1] */
+    addu        t4, t6, a1      /* t4 = new end address */
+    beq         a1, t9, 5f
+     subu       t8, t4, t9
+4:
+    ulw         t0, 0(t6)
+    ulw         t1, 4(t6)
+    ulw         t2, 8(t6)
+    usw         t0, 0(t5)
+    ulw         t0, 12(t6)
+    usw         t1, 4(t5)
+    usw         t2, 8(t5)
+    usw         t0, 12(t5)
+    addiu       t6, 16
+    bne         t6, t8, 4b
+     addiu      t5, 16
+    beqz        t9, 6f
+     nop
+5:
+    lbu         t0, 0(t6)
+    sb          t0, 0(t5)
+    addiu       t6, 1
+    bne         t6, t4, 5b
+     addiu      t5, 1
+6:
+    addiu       t7, 8
+    addiu       a0, -2
+    bgtz        a0, 0b
+     addiu      a2, 4
+7:
+    j           ra
+     nop
+END(jsimd_h2v2_upsample_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_idct_islow_dspr2)
+/*
+ * a0 = coef_block
+ * a1 = compptr->dcttable
+ * a2 = output
+ * a3 = range_limit
+ */
+    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    addiu       sp, sp, -256
+    move        v0, sp
+    addiu       v1, zero, 8     /* v1 = DCTSIZE = 8 */
+1:
+    lh          s4, 32(a0)      /* s4 = inptr[16] */
+    lh          s5, 64(a0)      /* s5 = inptr[32] */
+    lh          s6, 96(a0)      /* s6 = inptr[48] */
+    lh          t1, 112(a0)     /* t1 = inptr[56] */
+    lh          t7, 16(a0)      /* t7 = inptr[8] */
+    lh          t5, 80(a0)      /* t5 = inptr[40] */
+    lh          t3, 48(a0)      /* t3 = inptr[24] */
+    or          s4, s4, t1
+    or          s4, s4, t3
+    or          s4, s4, t5
+    or          s4, s4, t7
+    or          s4, s4, s5
+    or          s4, s4, s6
+    bnez        s4, 2f
+     addiu      v1, v1, -1
+    lh          s5, 0(a1)       /* quantptr[DCTSIZE*0] */
+    lh          s6, 0(a0)       /* inptr[DCTSIZE*0] */
+    mul         s5, s5, s6      /* DEQUANTIZE(inptr[0], quantptr[0]) */
+    sll         s5, s5, 2
+    sw          s5, 0(v0)
+    sw          s5, 32(v0)
+    sw          s5, 64(v0)
+    sw          s5, 96(v0)
+    sw          s5, 128(v0)
+    sw          s5, 160(v0)
+    sw          s5, 192(v0)
+    b           3f
+     sw         s5, 224(v0)
+2:
+    lh          t0, 112(a1)
+    lh          t2, 48(a1)
+    lh          t4, 80(a1)
+    lh          t6, 16(a1)
+    mul         t0, t0, t1      /* DEQUANTIZE(inptr[DCTSIZE*7],
+                                              quantptr[DCTSIZE*7]) */
+    mul         t1, t2, t3      /* DEQUANTIZE(inptr[DCTSIZE*3],
+                                              quantptr[DCTSIZE*3]) */
+    mul         t2, t4, t5      /* DEQUANTIZE(inptr[DCTSIZE*5],
+                                              quantptr[DCTSIZE*5]) */
+    mul         t3, t6, t7      /* DEQUANTIZE(inptr[DCTSIZE*1],
+                                              quantptr[DCTSIZE*1]) */
+    lh          t4, 32(a1)
+    lh          t5, 32(a0)
+    lh          t6, 96(a1)
+    lh          t7, 96(a0)
+    addu        s0, t0, t1       /* z3 = tmp0 + tmp2 */
+    addu        s1, t1, t2       /* z2 = tmp1 + tmp2 */
+    addu        s2, t2, t3       /* z4 = tmp1 + tmp3 */
+    addu        s3, s0, s2       /* z3 + z4 */
+    addiu       t9, zero, 9633   /* FIX_1_175875602 */
+    mul         s3, s3, t9       /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
+    addu        t8, t0, t3       /* z1 = tmp0 + tmp3 */
+    addiu       t9, zero, 2446   /* FIX_0_298631336 */
+    mul         t0, t0, t9       /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
+    addiu       t9, zero, 16819  /* FIX_2_053119869 */
+    mul         t2, t2, t9       /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
+    addiu       t9, zero, 25172  /* FIX_3_072711026 */
+    mul         t1, t1, t9       /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
+    addiu       t9, zero, 12299  /* FIX_1_501321110 */
+    mul         t3, t3, t9       /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
+    addiu       t9, zero, 16069  /* FIX_1_961570560 */
+    mul         s0, s0, t9       /* -z3 = MULTIPLY(z3, FIX_1_961570560) */
+    addiu       t9, zero, 3196   /* FIX_0_390180644 */
+    mul         s2, s2, t9       /* -z4 = MULTIPLY(z4, FIX_0_390180644) */
+    addiu       t9, zero, 7373   /* FIX_0_899976223 */
+    mul         t8, t8, t9       /* -z1 = MULTIPLY(z1, FIX_0_899976223) */
+    addiu       t9, zero, 20995  /* FIX_2_562915447 */
+    mul         s1, s1, t9       /* -z2 = MULTIPLY(z2, FIX_2_562915447) */
+    subu        s0, s3, s0       /* z3 += z5 */
+    addu        t0, t0, s0       /* tmp0 += z3 */
+    addu        t1, t1, s0       /* tmp2 += z3 */
+    subu        s2, s3, s2       /* z4 += z5 */
+    addu        t2, t2, s2       /* tmp1 += z4 */
+    addu        t3, t3, s2       /* tmp3 += z4 */
+    subu        t0, t0, t8       /* tmp0 += z1 */
+    subu        t1, t1, s1       /* tmp2 += z2 */
+    subu        t2, t2, s1       /* tmp1 += z2 */
+    subu        t3, t3, t8       /* tmp3 += z1 */
+    mul         s0, t4, t5       /* DEQUANTIZE(inptr[DCTSIZE*2],
+                                               quantptr[DCTSIZE*2]) */
+    addiu       t9, zero, 6270   /* FIX_0_765366865 */
+    mul         s1, t6, t7       /* DEQUANTIZE(inptr[DCTSIZE*6],
+                                               quantptr[DCTSIZE*6]) */
+    lh          t4, 0(a1)
+    lh          t5, 0(a0)
+    lh          t6, 64(a1)
+    lh          t7, 64(a0)
+    mul         s2, t9, s0       /* MULTIPLY(z2, FIX_0_765366865) */
+    mul         t5, t4, t5       /* DEQUANTIZE(inptr[DCTSIZE*0],
+                                               quantptr[DCTSIZE*0]) */
+    mul         t6, t6, t7       /* DEQUANTIZE(inptr[DCTSIZE*4],
+                                               quantptr[DCTSIZE*4]) */
+    addiu       t9, zero, 4433   /* FIX_0_541196100 */
+    addu        s3, s0, s1       /* z2 + z3 */
+    mul         s3, s3, t9       /* z1 = MULTIPLY(z2 + z3, FIX_0_541196100) */
+    addiu       t9, zero, 15137  /* FIX_1_847759065 */
+    mul         t8, s1, t9       /* MULTIPLY(z3, FIX_1_847759065) */
+    addu        t4, t5, t6
+    subu        t5, t5, t6
+    sll         t4, t4, 13      /* tmp0 = (z2 + z3) << CONST_BITS */
+    sll         t5, t5, 13      /* tmp1 = (z2 - z3) << CONST_BITS */
+    addu        t7, s3, s2      /* tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865) */
+    subu        t6, s3, t8      /* tmp2 =
+                                     z1 + MULTIPLY(z3, -FIX_1_847759065) */
+    addu        s0, t4, t7
+    subu        s1, t4, t7
+    addu        s2, t5, t6
+    subu        s3, t5, t6
+    addu        t4, s0, t3
+    subu        s0, s0, t3
+    addu        t3, s2, t1
+    subu        s2, s2, t1
+    addu        t1, s3, t2
+    subu        s3, s3, t2
+    addu        t2, s1, t0
+    subu        s1, s1, t0
+    shra_r.w    t4, t4, 11
+    shra_r.w    t3, t3, 11
+    shra_r.w    t1, t1, 11
+    shra_r.w    t2, t2, 11
+    shra_r.w    s1, s1, 11
+    shra_r.w    s3, s3, 11
+    shra_r.w    s2, s2, 11
+    shra_r.w    s0, s0, 11
+    sw          t4, 0(v0)
+    sw          t3, 32(v0)
+    sw          t1, 64(v0)
+    sw          t2, 96(v0)
+    sw          s1, 128(v0)
+    sw          s3, 160(v0)
+    sw          s2, 192(v0)
+    sw          s0, 224(v0)
+3:
+    addiu       a1, a1, 2
+    addiu       a0, a0, 2
+    bgtz        v1, 1b
+     addiu      v0, v0, 4
+    move        v0, sp
+    addiu       v1, zero, 8
+4:
+    lw          t0, 8(v0)       /* z2 = (JLONG)wsptr[2] */
+    lw          t1, 24(v0)      /* z3 = (JLONG)wsptr[6] */
+    lw          t2, 0(v0)       /* (JLONG)wsptr[0] */
+    lw          t3, 16(v0)      /* (JLONG)wsptr[4] */
+    lw          s4, 4(v0)       /* (JLONG)wsptr[1] */
+    lw          s5, 12(v0)      /* (JLONG)wsptr[3] */
+    lw          s6, 20(v0)      /* (JLONG)wsptr[5] */
+    lw          s7, 28(v0)      /* (JLONG)wsptr[7] */
+    or          s4, s4, t0
+    or          s4, s4, t1
+    or          s4, s4, t3
+    or          s4, s4, s7
+    or          s4, s4, s5
+    or          s4, s4, s6
+    bnez        s4, 5f
+     addiu      v1, v1, -1
+    shra_r.w    s5, t2, 5
+    andi        s5, s5, 0x3ff
+    lbux        s5, s5(a3)
+    lw          s1, 0(a2)
+    replv.qb    s5, s5
+    usw         s5, 0(s1)
+    usw         s5, 4(s1)
+    b           6f
+     nop
+5:
+    addu        t4, t0, t1       /* z2 + z3 */
+    addiu       t8, zero, 4433   /* FIX_0_541196100 */
+    mul         t5, t4, t8       /* z1 = MULTIPLY(z2 + z3, FIX_0_541196100) */
+    addiu       t8, zero, 15137  /* FIX_1_847759065 */
+    mul         t1, t1, t8       /* MULTIPLY(z3, FIX_1_847759065) */
+    addiu       t8, zero, 6270   /* FIX_0_765366865 */
+    mul         t0, t0, t8       /* MULTIPLY(z2, FIX_0_765366865) */
+    addu        t4, t2, t3       /* (JLONG)wsptr[0] + (JLONG)wsptr[4] */
+    subu        t2, t2, t3       /* (JLONG)wsptr[0] - (JLONG)wsptr[4] */
+    sll         t4, t4, 13       /* tmp0 =
+                                      (wsptr[0] + wsptr[4]) << CONST_BITS */
+    sll         t2, t2, 13       /* tmp1 =
+                                      (wsptr[0] - wsptr[4]) << CONST_BITS */
+    subu        t1, t5, t1       /* tmp2 =
+                                      z1 + MULTIPLY(z3, -FIX_1_847759065) */
+    subu        t3, t2, t1       /* tmp12 = tmp1 - tmp2 */
+    addu        t2, t2, t1       /* tmp11 = tmp1 + tmp2 */
+    addu        t5, t5, t0       /* tmp3 =
+                                      z1 + MULTIPLY(z2, FIX_0_765366865) */
+    subu        t1, t4, t5       /* tmp13 = tmp0 - tmp3 */
+    addu        t0, t4, t5       /* tmp10 = tmp0 + tmp3 */
+    lw          t4, 28(v0)       /* tmp0 = (JLONG)wsptr[7] */
+    lw          t6, 12(v0)       /* tmp2 = (JLONG)wsptr[3] */
+    lw          t5, 20(v0)       /* tmp1 = (JLONG)wsptr[5] */
+    lw          t7, 4(v0)        /* tmp3 = (JLONG)wsptr[1] */
+    addu        s0, t4, t6       /* z3 = tmp0 + tmp2 */
+    addiu       t8, zero, 9633   /* FIX_1_175875602 */
+    addu        s1, t5, t7       /* z4 = tmp1 + tmp3 */
+    addu        s2, s0, s1       /* z3 + z4 */
+    mul         s2, s2, t8       /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
+    addu        s3, t4, t7       /* z1 = tmp0 + tmp3 */
+    addu        t9, t5, t6       /* z2 = tmp1 + tmp2 */
+    addiu       t8, zero, 16069  /* FIX_1_961570560 */
+    mul         s0, s0, t8       /* -z3 = MULTIPLY(z3, FIX_1_961570560) */
+    addiu       t8, zero, 3196   /* FIX_0_390180644 */
+    mul         s1, s1, t8       /* -z4 = MULTIPLY(z4, FIX_0_390180644) */
+    addiu       t8, zero, 2446   /* FIX_0_298631336 */
+    mul         t4, t4, t8       /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
+    addiu       t8, zero, 7373   /* FIX_0_899976223 */
+    mul         s3, s3, t8       /* -z1 = MULTIPLY(z1, FIX_0_899976223) */
+    addiu       t8, zero, 16819  /* FIX_2_053119869 */
+    mul         t5, t5, t8       /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
+    addiu       t8, zero, 20995  /* FIX_2_562915447 */
+    mul         t9, t9, t8       /* -z2 = MULTIPLY(z2, FIX_2_562915447) */
+    addiu       t8, zero, 25172  /* FIX_3_072711026 */
+    mul         t6, t6, t8       /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
+    addiu       t8, zero, 12299  /* FIX_1_501321110 */
+    mul         t7, t7, t8       /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
+    subu        s0, s2, s0       /* z3 += z5 */
+    subu        s1, s2, s1       /* z4 += z5 */
+    addu        t4, t4, s0
+    subu        t4, t4, s3      /* tmp0 */
+    addu        t5, t5, s1
+    subu        t5, t5, t9      /* tmp1 */
+    addu        t6, t6, s0
+    subu        t6, t6, t9      /* tmp2 */
+    addu        t7, t7, s1
+    subu        t7, t7, s3      /* tmp3 */
+    addu        s0, t0, t7
+    subu        t0, t0, t7
+    addu        t7, t2, t6
+    subu        t2, t2, t6
+    addu        t6, t3, t5
+    subu        t3, t3, t5
+    addu        t5, t1, t4
+    subu        t1, t1, t4
+    shra_r.w    s0, s0, 18
+    shra_r.w    t7, t7, 18
+    shra_r.w    t6, t6, 18
+    shra_r.w    t5, t5, 18
+    shra_r.w    t1, t1, 18
+    shra_r.w    t3, t3, 18
+    shra_r.w    t2, t2, 18
+    shra_r.w    t0, t0, 18
+    andi        s0, s0, 0x3ff
+    andi        t7, t7, 0x3ff
+    andi        t6, t6, 0x3ff
+    andi        t5, t5, 0x3ff
+    andi        t1, t1, 0x3ff
+    andi        t3, t3, 0x3ff
+    andi        t2, t2, 0x3ff
+    andi        t0, t0, 0x3ff
+    lw          s1, 0(a2)
+    lbux        s0, s0(a3)
+    lbux        t7, t7(a3)
+    lbux        t6, t6(a3)
+    lbux        t5, t5(a3)
+    lbux        t1, t1(a3)
+    lbux        t3, t3(a3)
+    lbux        t2, t2(a3)
+    lbux        t0, t0(a3)
+    sb          s0, 0(s1)
+    sb          t7, 1(s1)
+    sb          t6, 2(s1)
+    sb          t5, 3(s1)
+    sb          t1, 4(s1)
+    sb          t3, 5(s1)
+    sb          t2, 6(s1)
+    sb          t0, 7(s1)
+6:
+    addiu       v0, v0, 32
+    bgtz        v1, 4b
+     addiu      a2, a2, 4
+    addiu       sp, sp, 256
+
+    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    j           ra
+     nop
+
+END(jsimd_idct_islow_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_idct_ifast_cols_dspr2)
+/*
+ * a0 = inptr
+ * a1 = quantptr
+ * a2 = wsptr
+ * a3 = mips_idct_ifast_coefs
+ */
+    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    addiu         t9, a0, 16      /* end address */
+    or            AT, a3, zero
+
+0:
+    lw            s0, 0(a1)       /* quantptr[DCTSIZE*0] */
+    lw            t0, 0(a0)       /* inptr[DCTSIZE*0] */
+    lw            t1, 16(a0)      /* inptr[DCTSIZE*1] */
+    muleq_s.w.phl v0, t0, s0      /* tmp0 ... */
+    lw            t2, 32(a0)      /* inptr[DCTSIZE*2] */
+    lw            t3, 48(a0)      /* inptr[DCTSIZE*3] */
+    lw            t4, 64(a0)      /* inptr[DCTSIZE*4] */
+    lw            t5, 80(a0)      /* inptr[DCTSIZE*5] */
+    muleq_s.w.phr t0, t0, s0      /* ... tmp0 ... */
+    lw            t6, 96(a0)      /* inptr[DCTSIZE*6] */
+    lw            t7, 112(a0)     /* inptr[DCTSIZE*7] */
+    or            s4, t1, t2
+    or            s5, t3, t4
+    bnez          s4, 1f
+     ins          t0, v0, 16, 16  /* ... tmp0 */
+    bnez          s5, 1f
+     or           s6, t5, t6
+    or            s6, s6, t7
+    bnez          s6, 1f
+     sw           t0, 0(a2)       /* wsptr[DCTSIZE*0] */
+    sw            t0, 16(a2)      /* wsptr[DCTSIZE*1] */
+    sw            t0, 32(a2)      /* wsptr[DCTSIZE*2] */
+    sw            t0, 48(a2)      /* wsptr[DCTSIZE*3] */
+    sw            t0, 64(a2)      /* wsptr[DCTSIZE*4] */
+    sw            t0, 80(a2)      /* wsptr[DCTSIZE*5] */
+    sw            t0, 96(a2)      /* wsptr[DCTSIZE*6] */
+    sw            t0, 112(a2)     /* wsptr[DCTSIZE*7] */
+    addiu         a0, a0, 4
+    b             2f
+     addiu        a1, a1, 4
+
+1:
+    lw            s1, 32(a1)      /* quantptr[DCTSIZE*2] */
+    lw            s2, 64(a1)      /* quantptr[DCTSIZE*4] */
+    muleq_s.w.phl v0, t2, s1      /* tmp1 ... */
+    muleq_s.w.phr t2, t2, s1      /* ... tmp1 ... */
+    lw            s0, 16(a1)      /* quantptr[DCTSIZE*1] */
+    lw            s1, 48(a1)      /* quantptr[DCTSIZE*3] */
+    lw            s3, 96(a1)      /* quantptr[DCTSIZE*6] */
+    muleq_s.w.phl v1, t4, s2      /* tmp2 ... */
+    muleq_s.w.phr t4, t4, s2      /* ... tmp2 ... */
+    lw            s2, 80(a1)      /* quantptr[DCTSIZE*5] */
+    lw            t8, 4(AT)       /* FIX(1.414213562) */
+    ins           t2, v0, 16, 16  /* ... tmp1 */
+    muleq_s.w.phl v0, t6, s3      /* tmp3 ... */
+    muleq_s.w.phr t6, t6, s3      /* ... tmp3 ... */
+    ins           t4, v1, 16, 16  /* ... tmp2 */
+    addq.ph       s4, t0, t4      /* tmp10 */
+    subq.ph       s5, t0, t4      /* tmp11 */
+    ins           t6, v0, 16, 16  /* ... tmp3 */
+    subq.ph       s6, t2, t6      /* tmp12 ... */
+    addq.ph       s7, t2, t6      /* tmp13 */
+    mulq_s.ph     s6, s6, t8      /* ... tmp12 ... */
+    addq.ph       t0, s4, s7      /* tmp0 */
+    subq.ph       t6, s4, s7      /* tmp3 */
+    muleq_s.w.phl v0, t1, s0      /* tmp4 ... */
+    muleq_s.w.phr t1, t1, s0      /* ... tmp4 ... */
+    shll_s.ph     s6, s6, 1       /* x2 */
+    lw            s3, 112(a1)     /* quantptr[DCTSIZE*7] */
+    subq.ph       s6, s6, s7      /* ... tmp12 */
+    muleq_s.w.phl v1, t7, s3      /* tmp7 ... */
+    muleq_s.w.phr t7, t7, s3      /* ... tmp7 ... */
+    ins           t1, v0, 16, 16  /* ... tmp4 */
+    addq.ph       t2, s5, s6      /* tmp1 */
+    subq.ph       t4, s5, s6      /* tmp2 */
+    muleq_s.w.phl v0, t5, s2      /* tmp6 ... */
+    muleq_s.w.phr t5, t5, s2      /* ... tmp6 ... */
+    ins           t7, v1, 16, 16  /* ... tmp7 */
+    addq.ph       s5, t1, t7      /* z11 */
+    subq.ph       s6, t1, t7      /* z12 */
+    muleq_s.w.phl v1, t3, s1      /* tmp5 ... */
+    muleq_s.w.phr t3, t3, s1      /* ... tmp5 ... */
+    ins           t5, v0, 16, 16  /* ... tmp6 */
+    ins           t3, v1, 16, 16  /* ... tmp5 */
+    addq.ph       s7, t5, t3      /* z13 */
+    subq.ph       v0, t5, t3      /* z10 */
+    addq.ph       t7, s5, s7      /* tmp7 */
+    subq.ph       s5, s5, s7      /* tmp11 ... */
+    addq.ph       v1, v0, s6      /* z5 ... */
+    mulq_s.ph     s5, s5, t8      /* ... tmp11 */
+    lw            t8, 8(AT)       /* FIX(1.847759065) */
+    lw            s4, 0(AT)       /* FIX(1.082392200) */
+    addq.ph       s0, t0, t7
+    subq.ph       s1, t0, t7
+    mulq_s.ph     v1, v1, t8      /* ... z5 */
+    shll_s.ph     s5, s5, 1       /* x2 */
+    lw            t8, 12(AT)      /* FIX(-2.613125930) */
+    sw            s0, 0(a2)       /* wsptr[DCTSIZE*0] */
+    shll_s.ph     v0, v0, 1       /* x4 */
+    mulq_s.ph     v0, v0, t8      /* tmp12 ... */
+    mulq_s.ph     s4, s6, s4      /* tmp10 ... */
+    shll_s.ph     v1, v1, 1       /* x2 */
+    addiu         a0, a0, 4
+    addiu         a1, a1, 4
+    sw            s1, 112(a2)     /* wsptr[DCTSIZE*7] */
+    shll_s.ph     s6, v0, 1       /* x4 */
+    shll_s.ph     s4, s4, 1       /* x2 */
+    addq.ph       s6, s6, v1      /* ... tmp12 */
+    subq.ph       t5, s6, t7      /* tmp6 */
+    subq.ph       s4, s4, v1      /* ... tmp10 */
+    subq.ph       t3, s5, t5      /* tmp5 */
+    addq.ph       s2, t2, t5
+    addq.ph       t1, s4, t3      /* tmp4 */
+    subq.ph       s3, t2, t5
+    sw            s2, 16(a2)      /* wsptr[DCTSIZE*1] */
+    sw            s3, 96(a2)      /* wsptr[DCTSIZE*6] */
+    addq.ph       v0, t4, t3
+    subq.ph       v1, t4, t3
+    sw            v0, 32(a2)      /* wsptr[DCTSIZE*2] */
+    sw            v1, 80(a2)      /* wsptr[DCTSIZE*5] */
+    addq.ph       v0, t6, t1
+    subq.ph       v1, t6, t1
+    sw            v0, 64(a2)      /* wsptr[DCTSIZE*4] */
+    sw            v1, 48(a2)      /* wsptr[DCTSIZE*3] */
+
+2:
+    bne           a0, t9, 0b
+     addiu        a2, a2, 4
+
+    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    j             ra
+     nop
+
+END(jsimd_idct_ifast_cols_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_idct_ifast_rows_dspr2)
+/*
+ * a0 = wsptr
+ * a1 = output_buf
+ * a2 = output_col
+ * a3 = mips_idct_ifast_coefs
+ */
+    SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8, a3
+
+    addiu         t9, a0, 128     /* end address */
+    lui           s8, 0x8080
+    ori           s8, s8, 0x8080
+
+0:
+    lw            AT, 36(sp)      /* restore $a3 (mips_idct_ifast_coefs) */
+    lw            t0, 0(a0)       /* wsptr[DCTSIZE*0+0/1]  b a */
+    lw            s0, 16(a0)      /* wsptr[DCTSIZE*1+0/1]  B A */
+    lw            t2, 4(a0)       /* wsptr[DCTSIZE*0+2/3]  d c */
+    lw            s2, 20(a0)      /* wsptr[DCTSIZE*1+2/3]  D C */
+    lw            t4, 8(a0)       /* wsptr[DCTSIZE*0+4/5]  f e */
+    lw            s4, 24(a0)      /* wsptr[DCTSIZE*1+4/5]  F E */
+    lw            t6, 12(a0)      /* wsptr[DCTSIZE*0+6/7]  h g */
+    lw            s6, 28(a0)      /* wsptr[DCTSIZE*1+6/7]  H G */
+    precrq.ph.w   t1, s0, t0      /* B b */
+    ins           t0, s0, 16, 16  /* A a */
+    bnez          t1, 1f
+     or           s0, t2, s2
+    bnez          s0, 1f
+     or           s0, t4, s4
+    bnez          s0, 1f
+     or           s0, t6, s6
+    bnez          s0, 1f
+     shll_s.ph    s0, t0, 2       /* A a */
+    lw            a3, 0(a1)
+    lw            AT, 4(a1)
+    precrq.ph.w   t0, s0, s0      /* A A */
+    ins           s0, s0, 16, 16  /* a a */
+    addu          a3, a3, a2
+    addu          AT, AT, a2
+    precrq.qb.ph  t0, t0, t0      /* A A A A */
+    precrq.qb.ph  s0, s0, s0      /* a a a a */
+    addu.qb       s0, s0, s8
+    addu.qb       t0, t0, s8
+    sw            s0, 0(a3)
+    sw            s0, 4(a3)
+    sw            t0, 0(AT)
+    sw            t0, 4(AT)
+    addiu         a0, a0, 32
+    bne           a0, t9, 0b
+     addiu        a1, a1, 8
+    b             2f
+     nop
+
+1:
+    precrq.ph.w   t3, s2, t2
+    ins           t2, s2, 16, 16
+    precrq.ph.w   t5, s4, t4
+    ins           t4, s4, 16, 16
+    precrq.ph.w   t7, s6, t6
+    ins           t6, s6, 16, 16
+    lw            t8, 4(AT)       /* FIX(1.414213562) */
+    addq.ph       s4, t0, t4      /* tmp10 */
+    subq.ph       s5, t0, t4      /* tmp11 */
+    subq.ph       s6, t2, t6      /* tmp12 ... */
+    addq.ph       s7, t2, t6      /* tmp13 */
+    mulq_s.ph     s6, s6, t8      /* ... tmp12 ... */
+    addq.ph       t0, s4, s7      /* tmp0 */
+    subq.ph       t6, s4, s7      /* tmp3 */
+    shll_s.ph     s6, s6, 1       /* x2 */
+    subq.ph       s6, s6, s7      /* ... tmp12 */
+    addq.ph       t2, s5, s6      /* tmp1 */
+    subq.ph       t4, s5, s6      /* tmp2 */
+    addq.ph       s5, t1, t7      /* z11 */
+    subq.ph       s6, t1, t7      /* z12 */
+    addq.ph       s7, t5, t3      /* z13 */
+    subq.ph       v0, t5, t3      /* z10 */
+    addq.ph       t7, s5, s7      /* tmp7 */
+    subq.ph       s5, s5, s7      /* tmp11 ... */
+    addq.ph       v1, v0, s6      /* z5 ... */
+    mulq_s.ph     s5, s5, t8      /* ... tmp11 */
+    lw            t8, 8(AT)       /* FIX(1.847759065) */
+    lw            s4, 0(AT)       /* FIX(1.082392200) */
+    addq.ph       s0, t0, t7      /* tmp0 + tmp7 */
+    subq.ph       s7, t0, t7      /* tmp0 - tmp7 */
+    mulq_s.ph     v1, v1, t8      /* ... z5 */
+    lw            a3, 0(a1)
+    lw            t8, 12(AT)      /* FIX(-2.613125930) */
+    shll_s.ph     s5, s5, 1       /* x2 */
+    addu          a3, a3, a2
+    shll_s.ph     v0, v0, 1       /* x4 */
+    mulq_s.ph     v0, v0, t8      /* tmp12 ... */
+    mulq_s.ph     s4, s6, s4      /* tmp10 ... */
+    shll_s.ph     v1, v1, 1       /* x2 */
+    addiu         a0, a0, 32
+    addiu         a1, a1, 8
+    shll_s.ph     s6, v0, 1       /* x4 */
+    shll_s.ph     s4, s4, 1       /* x2 */
+    addq.ph       s6, s6, v1      /* ... tmp12 */
+    shll_s.ph     s0, s0, 2
+    subq.ph       t5, s6, t7      /* tmp6 */
+    subq.ph       s4, s4, v1      /* ... tmp10 */
+    subq.ph       t3, s5, t5      /* tmp5 */
+    shll_s.ph     s7, s7, 2
+    addq.ph       t1, s4, t3      /* tmp4 */
+    addq.ph       s1, t2, t5      /* tmp1 + tmp6 */
+    subq.ph       s6, t2, t5      /* tmp1 - tmp6 */
+    addq.ph       s2, t4, t3      /* tmp2 + tmp5 */
+    subq.ph       s5, t4, t3      /* tmp2 - tmp5 */
+    addq.ph       s4, t6, t1      /* tmp3 + tmp4 */
+    subq.ph       s3, t6, t1      /* tmp3 - tmp4 */
+    shll_s.ph     s1, s1, 2
+    shll_s.ph     s2, s2, 2
+    shll_s.ph     s3, s3, 2
+    shll_s.ph     s4, s4, 2
+    shll_s.ph     s5, s5, 2
+    shll_s.ph     s6, s6, 2
+    precrq.ph.w   t0, s1, s0      /* B A */
+    ins           s0, s1, 16, 16  /* b a */
+    precrq.ph.w   t2, s3, s2      /* D C */
+    ins           s2, s3, 16, 16  /* d c */
+    precrq.ph.w   t4, s5, s4      /* F E */
+    ins           s4, s5, 16, 16  /* f e */
+    precrq.ph.w   t6, s7, s6      /* H G */
+    ins           s6, s7, 16, 16  /* h g */
+    precrq.qb.ph  t0, t2, t0      /* D C B A */
+    precrq.qb.ph  s0, s2, s0      /* d c b a */
+    precrq.qb.ph  t4, t6, t4      /* H G F E */
+    precrq.qb.ph  s4, s6, s4      /* h g f e */
+    addu.qb       s0, s0, s8
+    addu.qb       s4, s4, s8
+    sw            s0, 0(a3)       /* outptr[0/1/2/3]       d c b a */
+    sw            s4, 4(a3)       /* outptr[4/5/6/7]       h g f e */
+    lw            a3, -4(a1)
+    addu.qb       t0, t0, s8
+    addu          a3, a3, a2
+    addu.qb       t4, t4, s8
+    sw            t0, 0(a3)       /* outptr[0/1/2/3]       D C B A */
+    bne           a0, t9, 0b
+     sw           t4, 4(a3)       /* outptr[4/5/6/7]       H G F E */
+
+2:
+
+    RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8, a3
+
+    j             ra
+     nop
+
+END(jsimd_idct_ifast_rows_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_fdct_islow_dspr2)
+/*
+ * a0 = data
+ */
+    SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8
+
+    lui         t0, 6437
+    ori         t0, 2260
+    lui         t1, 9633
+    ori         t1, 11363
+    lui         t2, 0xd39e
+    ori         t2, 0xe6dc
+    lui         t3, 0xf72d
+    ori         t3, 9633
+    lui         t4, 2261
+    ori         t4, 9633
+    lui         t5, 0xd39e
+    ori         t5, 6437
+    lui         t6, 9633
+    ori         t6, 0xd39d
+    lui         t7, 0xe6dc
+    ori         t7, 2260
+    lui         t8, 4433
+    ori         t8, 10703
+    lui         t9, 0xd630
+    ori         t9, 4433
+    li          s8, 8
+    move        a1, a0
+1:
+    lw          s0, 0(a1)       /* tmp0 = 1|0 */
+    lw          s1, 4(a1)       /* tmp1 = 3|2 */
+    lw          s2, 8(a1)       /* tmp2 = 5|4 */
+    lw          s3, 12(a1)      /* tmp3 = 7|6 */
+    packrl.ph   s1, s1, s1      /* tmp1 = 2|3 */
+    packrl.ph   s3, s3, s3      /* tmp3 = 6|7 */
+    subq.ph     s7, s1, s2      /* tmp7 = 2-5|3-4 = t5|t4 */
+    subq.ph     s5, s0, s3      /* tmp5 = 1-6|0-7 = t6|t7 */
+    mult        $0, $0          /* ac0  = 0 */
+    dpa.w.ph    $ac0, s7, t0    /* ac0 += t5*  6437 + t4*  2260 */
+    dpa.w.ph    $ac0, s5, t1    /* ac0 += t6*  9633 + t7* 11363 */
+    mult        $ac1, $0, $0    /* ac1  = 0 */
+    dpa.w.ph    $ac1, s7, t2    /* ac1 += t5*-11362 + t4* -6436 */
+    dpa.w.ph    $ac1, s5, t3    /* ac1 += t6* -2259 + t7*  9633 */
+    mult        $ac2, $0, $0    /* ac2  = 0 */
+    dpa.w.ph    $ac2, s7, t4    /* ac2 += t5*  2261 + t4*  9633 */
+    dpa.w.ph    $ac2, s5, t5    /* ac2 += t6*-11362 + t7*  6437 */
+    mult        $ac3, $0, $0    /* ac3  = 0 */
+    dpa.w.ph    $ac3, s7, t6    /* ac3 += t5*  9633 + t4*-11363 */
+    dpa.w.ph    $ac3, s5, t7    /* ac3 += t6* -6436 + t7*  2260 */
+    addq.ph     s6, s1, s2      /* tmp6 = 2+5|3+4 = t2|t3 */
+    addq.ph     s4, s0, s3      /* tmp4 = 1+6|0+7 = t1|t0 */
+    extr_r.w    s0, $ac0, 11    /* tmp0 = (ac0 + 1024) >> 11 */
+    extr_r.w    s1, $ac1, 11    /* tmp1 = (ac1 + 1024) >> 11 */
+    extr_r.w    s2, $ac2, 11    /* tmp2 = (ac2 + 1024) >> 11 */
+    extr_r.w    s3, $ac3, 11    /* tmp3 = (ac3 + 1024) >> 11 */
+    addq.ph     s5, s4, s6      /* tmp5 = t1+t2|t0+t3 = t11|t10 */
+    subq.ph     s7, s4, s6      /* tmp7 = t1-t2|t0-t3 = t12|t13 */
+    sh          s0, 2(a1)
+    sh          s1, 6(a1)
+    sh          s2, 10(a1)
+    sh          s3, 14(a1)
+    mult        $0, $0          /* ac0  = 0 */
+    dpa.w.ph    $ac0, s7, t8    /* ac0 += t12*  4433 + t13* 10703 */
+    mult        $ac1, $0, $0    /* ac1  = 0 */
+    dpa.w.ph    $ac1, s7, t9    /* ac1 += t12*-10704 + t13*  4433 */
+    sra         s4, s5, 16      /* tmp4 = t11 */
+    addiu       a1, a1, 16
+    addiu       s8, s8, -1
+    extr_r.w    s0, $ac0, 11    /* tmp0 = (ac0 + 1024) >> 11 */
+    extr_r.w    s1, $ac1, 11    /* tmp1 = (ac1 + 1024) >> 11 */
+    addu        s2, s5, s4      /* tmp2 = t10 + t11 */
+    subu        s3, s5, s4      /* tmp3 = t10 - t11 */
+    sll         s2, s2, 2       /* tmp2 = (t10 + t11) << 2 */
+    sll         s3, s3, 2       /* tmp3 = (t10 - t11) << 2 */
+    sh          s2, -16(a1)
+    sh          s3, -8(a1)
+    sh          s0, -12(a1)
+    bgtz        s8, 1b
+     sh         s1, -4(a1)
+    li          t0, 2260
+    li          t1, 11363
+    li          t2, 9633
+    li          t3, 6436
+    li          t4, 6437
+    li          t5, 2261
+    li          t6, 11362
+    li          t7, 2259
+    li          t8, 4433
+    li          t9, 10703
+    li          a1, 10704
+    li          s8, 8
+
+2:
+    lh          a2, 0(a0)       /* 0 */
+    lh          a3, 16(a0)      /* 8 */
+    lh          v0, 32(a0)      /* 16 */
+    lh          v1, 48(a0)      /* 24 */
+    lh          s4, 64(a0)      /* 32 */
+    lh          s5, 80(a0)      /* 40 */
+    lh          s6, 96(a0)      /* 48 */
+    lh          s7, 112(a0)     /* 56 */
+    addu        s2, v0, s5      /* tmp2 = 16 + 40 */
+    subu        s5, v0, s5      /* tmp5 = 16 - 40 */
+    addu        s3, v1, s4      /* tmp3 = 24 + 32 */
+    subu        s4, v1, s4      /* tmp4 = 24 - 32 */
+    addu        s0, a2, s7      /* tmp0 =  0 + 56 */
+    subu        s7, a2, s7      /* tmp7 =  0 - 56 */
+    addu        s1, a3, s6      /* tmp1 =  8 + 48 */
+    subu        s6, a3, s6      /* tmp6 =  8 - 48 */
+    addu        a2, s0, s3      /* tmp10 = tmp0 + tmp3 */
+    subu        v1, s0, s3      /* tmp13 = tmp0 - tmp3 */
+    addu        a3, s1, s2      /* tmp11 = tmp1 + tmp2 */
+    subu        v0, s1, s2      /* tmp12 = tmp1 - tmp2 */
+    mult        s7, t1          /* ac0  = tmp7 * c1 */
+    madd        s4, t0          /* ac0 += tmp4 * c0 */
+    madd        s5, t4          /* ac0 += tmp5 * c4 */
+    madd        s6, t2          /* ac0 += tmp6 * c2 */
+    mult        $ac1, s7, t2    /* ac1  = tmp7 * c2 */
+    msub        $ac1, s4, t3    /* ac1 -= tmp4 * c3 */
+    msub        $ac1, s5, t6    /* ac1 -= tmp5 * c6 */
+    msub        $ac1, s6, t7    /* ac1 -= tmp6 * c7 */
+    mult        $ac2, s7, t4    /* ac2  = tmp7 * c4 */
+    madd        $ac2, s4, t2    /* ac2 += tmp4 * c2 */
+    madd        $ac2, s5, t5    /* ac2 += tmp5 * c5 */
+    msub        $ac2, s6, t6    /* ac2 -= tmp6 * c6 */
+    mult        $ac3, s7, t0    /* ac3  = tmp7 * c0 */
+    msub        $ac3, s4, t1    /* ac3 -= tmp4 * c1 */
+    madd        $ac3, s5, t2    /* ac3 += tmp5 * c2 */
+    msub        $ac3, s6, t3    /* ac3 -= tmp6 * c3 */
+    extr_r.w    s0, $ac0, 15    /* tmp0 = (ac0 + 16384) >> 15 */
+    extr_r.w    s1, $ac1, 15    /* tmp1 = (ac1 + 16384) >> 15 */
+    extr_r.w    s2, $ac2, 15    /* tmp2 = (ac2 + 16384) >> 15 */
+    extr_r.w    s3, $ac3, 15    /* tmp3 = (ac3 + 16384) >> 15 */
+    addiu       s8, s8, -1
+    addu        s4, a2, a3      /* tmp4 = tmp10 + tmp11 */
+    subu        s5, a2, a3      /* tmp5 = tmp10 - tmp11 */
+    sh          s0, 16(a0)
+    sh          s1, 48(a0)
+    sh          s2, 80(a0)
+    sh          s3, 112(a0)
+    mult        v0, t8          /* ac0  = tmp12 * c8 */
+    madd        v1, t9          /* ac0 += tmp13 * c9 */
+    mult        $ac1, v1, t8    /* ac1  = tmp13 * c8 */
+    msub        $ac1, v0, a1    /* ac1 -= tmp12 * c10 */
+    addiu       a0, a0, 2
+    extr_r.w    s6, $ac0, 15    /* tmp6 = (ac0 + 16384) >> 15 */
+    extr_r.w    s7, $ac1, 15    /* tmp7 = (ac1 + 16384) >> 15 */
+    shra_r.w    s4, s4, 2       /* tmp4 = (tmp4 + 2) >> 2 */
+    shra_r.w    s5, s5, 2       /* tmp5 = (tmp5 + 2) >> 2 */
+    sh          s4, -2(a0)
+    sh          s5, 62(a0)
+    sh          s6, 30(a0)
+    bgtz        s8, 2b
+     sh         s7, 94(a0)
+
+    RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8
+
+    jr          ra
+     nop
+
+END(jsimd_fdct_islow_dspr2)
+
+
+/**************************************************************************/
+LEAF_DSPR2(jsimd_fdct_ifast_dspr2)
+/*
+ * a0 = data
+ */
+    .set at
+
+    SAVE_REGS_ON_STACK 8, s0, s1
+
+    li          a1, 0x014e014e  /* FIX_1_306562965 (334 << 16) |
+                                                   (334 & 0xffff) */
+    li          a2, 0x008b008b  /* FIX_0_541196100 (139 << 16) |
+                                                   (139 & 0xffff) */
+    li          a3, 0x00620062  /* FIX_0_382683433 (98 << 16) |
+                                                   (98 & 0xffff) */
+    li          s1, 0x00b500b5  /* FIX_0_707106781 (181 << 16) |
+                                                   (181 & 0xffff) */
+
+    move        v0, a0
+    addiu       v1, v0, 128     /* end address */
+
+0:
+    lw          t0, 0(v0)       /* tmp0 = 1|0 */
+    lw          t1, 4(v0)       /* tmp1 = 3|2 */
+    lw          t2, 8(v0)       /* tmp2 = 5|4 */
+    lw          t3, 12(v0)      /* tmp3 = 7|6 */
+    packrl.ph   t1, t1, t1      /* tmp1 = 2|3 */
+    packrl.ph   t3, t3, t3      /* tmp3 = 6|7 */
+    subq.ph     t7, t1, t2      /* tmp7 = 2-5|3-4 = t5|t4 */
+    subq.ph     t5, t0, t3      /* tmp5 = 1-6|0-7 = t6|t7 */
+    addq.ph     t6, t1, t2      /* tmp6 = 2+5|3+4 = t2|t3 */
+    addq.ph     t4, t0, t3      /* tmp4 = 1+6|0+7 = t1|t0 */
+    addq.ph     t8, t4, t6      /* tmp5 = t1+t2|t0+t3 = t11|t10 */
+    subq.ph     t9, t4, t6      /* tmp7 = t1-t2|t0-t3 = t12|t13 */
+    sra         t4, t8, 16      /* tmp4 = t11 */
+    mult        $0, $0          /* ac0  = 0 */
+    dpa.w.ph    $ac0, t9, s1
+    mult        $ac1, $0, $0    /* ac1  = 0 */
+    dpa.w.ph    $ac1, t7, a3    /* ac1 += t4*98 + t5*98 */
+    dpsx.w.ph   $ac1, t5, a3    /* ac1 += t6*98 + t7*98 */
+    mult        $ac2, $0, $0    /* ac2  = 0 */
+    dpa.w.ph    $ac2, t7, a2    /* ac2 += t4*139 + t5*139 */
+    mult        $ac3, $0, $0    /* ac3  = 0 */
+    dpa.w.ph    $ac3, t5, a1    /* ac3 += t6*334 + t7*334 */
+    precrq.ph.w t0, t5, t7      /* t0 = t5|t6 */
+    addq.ph     t2, t8, t4      /* tmp2 = t10 + t11 */
+    subq.ph     t3, t8, t4      /* tmp3 = t10 - t11 */
+    extr.w      t4, $ac0, 8
+    mult        $0, $0          /* ac0  = 0 */
+    dpa.w.ph    $ac0, t0, s1    /* ac0 += t5*181 + t6*181 */
+    extr.w      t0, $ac1, 8     /* t0 = z5 */
+    extr.w      t1, $ac2, 8     /* t1 = MULTIPLY(tmp10, 139) */
+    extr.w      t7, $ac3, 8     /* t2 = MULTIPLY(tmp12, 334) */
+    extr.w      t8, $ac0, 8     /* t8 = z3 = MULTIPLY(tmp11, 181) */
+    add         t6, t1, t0      /* t6 = z2 */
+    add         t7, t7, t0      /* t7 = z4 */
+    subq.ph     t0, t5, t8      /* t0 = z13 = tmp7 - z3 */
+    addq.ph     t8, t5, t8      /* t9 = z11 = tmp7 + z3 */
+    addq.ph     t1, t0, t6      /* t1 = z13 + z2 */
+    subq.ph     t6, t0, t6      /* t6 = z13 - z2 */
+    addq.ph     t0, t8, t7      /* t0 = z11 + z4 */
+    subq.ph     t7, t8, t7      /* t7 = z11 - z4 */
+    addq.ph     t5, t4, t9
+    subq.ph     t4, t9, t4
+    sh          t2, 0(v0)
+    sh          t5, 4(v0)
+    sh          t3, 8(v0)
+    sh          t4, 12(v0)
+    sh          t1, 10(v0)
+    sh          t6, 6(v0)
+    sh          t0, 2(v0)
+    sh          t7, 14(v0)
+    addiu       v0, 16
+    bne         v1, v0, 0b
+     nop
+    move        v0, a0
+    addiu       v1, v0, 16
+
+1:
+    lh          t0, 0(v0)       /* 0 */
+    lh          t1, 16(v0)      /* 8 */
+    lh          t2, 32(v0)      /* 16 */
+    lh          t3, 48(v0)      /* 24 */
+    lh          t4, 64(v0)      /* 32 */
+    lh          t5, 80(v0)      /* 40 */
+    lh          t6, 96(v0)      /* 48 */
+    lh          t7, 112(v0)     /* 56 */
+    add         t8, t0, t7      /* t8 = tmp0 */
+    sub         t7, t0, t7      /* t7 = tmp7 */
+    add         t0, t1, t6      /* t0 = tmp1 */
+    sub         t1, t1, t6      /* t1 = tmp6 */
+    add         t6, t2, t5      /* t6 = tmp2 */
+    sub         t5, t2, t5      /* t5 = tmp5 */
+    add         t2, t3, t4      /* t2 = tmp3 */
+    sub         t3, t3, t4      /* t3 = tmp4 */
+    add         t4, t8, t2      /* t4 = tmp10 = tmp0 + tmp3 */
+    sub         t8, t8, t2      /* t8 = tmp13 = tmp0 - tmp3 */
+    sub         s0, t0, t6      /* s0 = tmp12 = tmp1 - tmp2 */
+    ins         t8, s0, 16, 16  /* t8 = tmp12|tmp13 */
+    add         t2, t0, t6      /* t2 = tmp11 = tmp1 + tmp2 */
+    mult        $0, $0          /* ac0  = 0 */
+    dpa.w.ph    $ac0, t8, s1    /* ac0 += t12*181 + t13*181 */
+    add         s0, t4, t2      /* t8 = tmp10+tmp11 */
+    sub         t4, t4, t2      /* t4 = tmp10-tmp11 */
+    sh          s0, 0(v0)
+    sh          t4, 64(v0)
+    extr.w      t2, $ac0, 8     /* z1 = MULTIPLY(tmp12+tmp13,
+                                                 FIX_0_707106781) */
+    addq.ph     t4, t8, t2      /* t9 = tmp13 + z1 */
+    subq.ph     t8, t8, t2      /* t2 = tmp13 - z1 */
+    sh          t4, 32(v0)
+    sh          t8, 96(v0)
+    add         t3, t3, t5      /* t3 = tmp10 = tmp4 + tmp5 */
+    add         t0, t5, t1      /* t0 = tmp11 = tmp5 + tmp6 */
+    add         t1, t1, t7      /* t1 = tmp12 = tmp6 + tmp7 */
+    andi        t4, a1, 0xffff
+    mul         s0, t1, t4
+    sra         s0, s0, 8       /* s0 = z4 =
+                                     MULTIPLY(tmp12, FIX_1_306562965) */
+    ins         t1, t3, 16, 16  /* t1 = tmp10|tmp12 */
+    mult        $0, $0          /* ac0  = 0 */
+    mulsa.w.ph  $ac0, t1, a3    /* ac0 += t10*98 - t12*98 */
+    extr.w      t8, $ac0, 8     /* z5 = MULTIPLY(tmp10-tmp12,
+                                                 FIX_0_382683433) */
+    add         t2, t7, t8      /* t2 = tmp7 + z5 */
+    sub         t7, t7, t8      /* t7 = tmp7 - z5 */
+    andi        t4, a2, 0xffff
+    mul         t8, t3, t4
+    sra         t8, t8, 8       /* t8 = z2 =
+                                     MULTIPLY(tmp10, FIX_0_541196100) */
+    andi        t4, s1, 0xffff
+    mul         t6, t0, t4
+    sra         t6, t6, 8       /* t6 = z3 =
+                                     MULTIPLY(tmp11, FIX_0_707106781) */
+    add         t0, t6, t8      /* t0 = z3 + z2 */
+    sub         t1, t6, t8      /* t1 = z3 - z2 */
+    add         t3, t6, s0      /* t3 = z3 + z4 */
+    sub         t4, t6, s0      /* t4 = z3 - z4 */
+    sub         t5, t2, t1      /* t5 = dataptr[5] */
+    sub         t6, t7, t0      /* t6 = dataptr[3] */
+    add         t3, t2, t3      /* t3 = dataptr[1] */
+    add         t4, t7, t4      /* t4 = dataptr[7] */
+    sh          t5, 80(v0)
+    sh          t6, 48(v0)
+    sh          t3, 16(v0)
+    sh          t4, 112(v0)
+    addiu       v0, 2
+    bne         v0, v1, 1b
+     nop
+
+    RESTORE_REGS_FROM_STACK 8, s0, s1
+
+    j           ra
+     nop
+END(jsimd_fdct_ifast_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_quantize_dspr2)
+/*
+ * a0 = coef_block
+ * a1 = divisors
+ * a2 = workspace
+ */
+    .set at
+
+    SAVE_REGS_ON_STACK 16, s0, s1, s2
+
+    addiu       v0, a2, 124     /* v0 = workspace_end */
+    lh          t0, 0(a2)
+    lh          t1, 0(a1)
+    lh          t2, 128(a1)
+    sra         t3, t0, 15
+    sll         t3, t3, 1
+    addiu       t3, t3, 1
+    mul         t0, t0, t3
+    lh          t4, 384(a1)
+    lh          t5, 130(a1)
+    lh          t6, 2(a2)
+    lh          t7, 2(a1)
+    lh          t8, 386(a1)
+
+1:
+    andi        t1, 0xffff
+    add         t9, t0, t2
+    andi        t9, 0xffff
+    mul         v1, t9, t1
+    sra         s0, t6, 15
+    sll         s0, s0, 1
+    addiu       s0, s0, 1
+    addiu       t9, t4, 16
+    srav        v1, v1, t9
+    mul         v1, v1, t3
+    mul         t6, t6, s0
+    andi        t7, 0xffff
+    addiu       a2, a2, 4
+    addiu       a1, a1, 4
+    add         s1, t6, t5
+    andi        s1, 0xffff
+    sh          v1, 0(a0)
+
+    mul         s2, s1, t7
+    addiu       s1, t8, 16
+    srav        s2, s2, s1
+    mul         s2, s2, s0
+    lh          t0, 0(a2)
+    lh          t1, 0(a1)
+    sra         t3, t0, 15
+    sll         t3, t3, 1
+    addiu       t3, t3, 1
+    mul         t0, t0, t3
+    lh          t2, 128(a1)
+    lh          t4, 384(a1)
+    lh          t5, 130(a1)
+    lh          t8, 386(a1)
+    lh          t6, 2(a2)
+    lh          t7, 2(a1)
+    sh          s2, 2(a0)
+    lh          t0, 0(a2)
+    sra         t3, t0, 15
+    sll         t3, t3, 1
+    addiu       t3, t3, 1
+    mul         t0, t0, t3
+    bne         a2, v0, 1b
+     addiu      a0, a0, 4
+
+    andi        t1, 0xffff
+    add         t9, t0, t2
+    andi        t9, 0xffff
+    mul         v1, t9, t1
+    sra         s0, t6, 15
+    sll         s0, s0, 1
+    addiu       s0, s0, 1
+    addiu       t9, t4, 16
+    srav        v1, v1, t9
+    mul         v1, v1, t3
+    mul         t6, t6, s0
+    andi        t7, 0xffff
+    sh          v1, 0(a0)
+    add         s1, t6, t5
+    andi        s1, 0xffff
+    mul         s2, s1, t7
+    addiu       s1, t8, 16
+    addiu       a2, a2, 4
+    addiu       a1, a1, 4
+    srav        s2, s2, s1
+    mul         s2, s2, s0
+    sh          s2, 2(a0)
+
+    RESTORE_REGS_FROM_STACK 16, s0, s1, s2
+
+    j           ra
+     nop
+
+END(jsimd_quantize_dspr2)
+
+
+#ifndef __mips_soft_float
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_quantize_float_dspr2)
+/*
+ * a0 = coef_block
+ * a1 = divisors
+ * a2 = workspace
+ */
+    .set at
+
+    li          t1, 0x46800100  /* integer representation 16384.5 */
+    mtc1        t1, f0
+    li          t0, 63
+0:
+    lwc1        f2, 0(a2)
+    lwc1        f10, 0(a1)
+    lwc1        f4, 4(a2)
+    lwc1        f12, 4(a1)
+    lwc1        f6, 8(a2)
+    lwc1        f14, 8(a1)
+    lwc1        f8, 12(a2)
+    lwc1        f16, 12(a1)
+    madd.s      f2, f0, f2, f10
+    madd.s      f4, f0, f4, f12
+    madd.s      f6, f0, f6, f14
+    madd.s      f8, f0, f8, f16
+    lwc1        f10, 16(a1)
+    lwc1        f12, 20(a1)
+    trunc.w.s   f2, f2
+    trunc.w.s   f4, f4
+    trunc.w.s   f6, f6
+    trunc.w.s   f8, f8
+    lwc1        f14, 24(a1)
+    lwc1        f16, 28(a1)
+    mfc1        t1, f2
+    mfc1        t2, f4
+    mfc1        t3, f6
+    mfc1        t4, f8
+    lwc1        f2, 16(a2)
+    lwc1        f4, 20(a2)
+    lwc1        f6, 24(a2)
+    lwc1        f8, 28(a2)
+    madd.s      f2, f0, f2, f10
+    madd.s      f4, f0, f4, f12
+    madd.s      f6, f0, f6, f14
+    madd.s      f8, f0, f8, f16
+    addiu       t1, t1, -16384
+    addiu       t2, t2, -16384
+    addiu       t3, t3, -16384
+    addiu       t4, t4, -16384
+    trunc.w.s   f2, f2
+    trunc.w.s   f4, f4
+    trunc.w.s   f6, f6
+    trunc.w.s   f8, f8
+    sh          t1, 0(a0)
+    sh          t2, 2(a0)
+    sh          t3, 4(a0)
+    sh          t4, 6(a0)
+    mfc1        t1, f2
+    mfc1        t2, f4
+    mfc1        t3, f6
+    mfc1        t4, f8
+    addiu       t0, t0, -8
+    addiu       a2, a2, 32
+    addiu       a1, a1, 32
+    addiu       t1, t1, -16384
+    addiu       t2, t2, -16384
+    addiu       t3, t3, -16384
+    addiu       t4, t4, -16384
+    sh          t1, 8(a0)
+    sh          t2, 10(a0)
+    sh          t3, 12(a0)
+    sh          t4, 14(a0)
+    bgez        t0, 0b
+     addiu      a0, a0, 16
+
+    j           ra
+     nop
+
+END(jsimd_quantize_float_dspr2)
+
+#endif
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_idct_2x2_dspr2)
+/*
+ * a0 = compptr->dct_table
+ * a1 = coef_block
+ * a2 = output_buf
+ * a3 = output_col
+ */
+    .set at
+
+    SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4, s5
+
+    addiu       sp, sp, -40
+    move        v0, sp
+    addiu       s2, zero, 29692
+    addiu       s3, zero, -10426
+    addiu       s4, zero, 6967
+    addiu       s5, zero, -5906
+    lh          t0, 0(a1)       /* t0 = inptr[DCTSIZE*0] */
+    lh          t5, 0(a0)       /* t5 = quantptr[DCTSIZE*0] */
+    lh          t1, 48(a1)      /* t1 = inptr[DCTSIZE*3] */
+    lh          t6, 48(a0)      /* t6 = quantptr[DCTSIZE*3] */
+    mul         t4, t5, t0
+    lh          t0, 16(a1)      /* t0 = inptr[DCTSIZE*1] */
+    lh          t5, 16(a0)      /* t5 = quantptr[DCTSIZE*1] */
+    mul         t6, t6, t1
+    mul         t5, t5, t0
+    lh          t2, 80(a1)      /* t2 = inptr[DCTSIZE*5] */
+    lh          t7, 80(a0)      /* t7 = quantptr[DCTSIZE*5] */
+    lh          t3, 112(a1)     /* t3 = inptr[DCTSIZE*7] */
+    lh          t8, 112(a0)     /* t8 = quantptr[DCTSIZE*7] */
+    mul         t7, t7, t2
+    mult        zero, zero
+    mul         t8, t8, t3
+    li          s0, 0x73FCD746  /* s0 = (29692 << 16) | (-10426 & 0xffff) */
+    li          s1, 0x1B37E8EE  /* s1 = (6967 << 16) | (-5906 & 0xffff) */
+    ins         t6, t5, 16, 16  /* t6 = t5|t6 */
+    sll         t4, t4, 15
+    dpa.w.ph    $ac0, t6, s0
+    lh          t1, 2(a1)
+    lh          t6, 2(a0)
+    ins         t8, t7, 16, 16  /* t8 = t7|t8 */
+    dpa.w.ph    $ac0, t8, s1
+    mflo        t0, $ac0
+    mul         t5, t6, t1
+    lh          t1, 18(a1)
+    lh          t6, 18(a0)
+    lh          t2, 50(a1)
+    lh          t7, 50(a0)
+    mul         t6, t6, t1
+    subu        t8, t4, t0
+    mul         t7, t7, t2
+    addu        t0, t4, t0
+    shra_r.w    t0, t0, 13
+    lh          t1, 82(a1)
+    lh          t2, 82(a0)
+    lh          t3, 114(a1)
+    lh          t4, 114(a0)
+    shra_r.w    t8, t8, 13
+    mul         t1, t1, t2
+    mul         t3, t3, t4
+    sw          t0, 0(v0)
+    sw          t8, 20(v0)
+    sll         t4, t5, 15
+    ins         t7, t6, 16, 16
+    mult        zero, zero
+    dpa.w.ph    $ac0, t7, s0
+    ins         t3, t1, 16, 16
+    lh          t1, 6(a1)
+    lh          t6, 6(a0)
+    dpa.w.ph    $ac0, t3, s1
+    mflo        t0, $ac0
+    mul         t5, t6, t1
+    lh          t1, 22(a1)
+    lh          t6, 22(a0)
+    lh          t2, 54(a1)
+    lh          t7, 54(a0)
+    mul         t6, t6, t1
+    subu        t8, t4, t0
+    mul         t7, t7, t2
+    addu        t0, t4, t0
+    shra_r.w    t0, t0, 13
+    lh          t1, 86(a1)
+    lh          t2, 86(a0)
+    lh          t3, 118(a1)
+    lh          t4, 118(a0)
+    shra_r.w    t8, t8, 13
+    mul         t1, t1, t2
+    mul         t3, t3, t4
+    sw          t0, 4(v0)
+    sw          t8, 24(v0)
+    sll         t4, t5, 15
+    ins         t7, t6, 16, 16
+    mult        zero, zero
+    dpa.w.ph    $ac0, t7, s0
+    ins         t3, t1, 16, 16
+    lh          t1, 10(a1)
+    lh          t6, 10(a0)
+    dpa.w.ph    $ac0, t3, s1
+    mflo        t0, $ac0
+    mul         t5, t6, t1
+    lh          t1, 26(a1)
+    lh          t6, 26(a0)
+    lh          t2, 58(a1)
+    lh          t7, 58(a0)
+    mul         t6, t6, t1
+    subu        t8, t4, t0
+    mul         t7, t7, t2
+    addu        t0, t4, t0
+    shra_r.w    t0, t0, 13
+    lh          t1, 90(a1)
+    lh          t2, 90(a0)
+    lh          t3, 122(a1)
+    lh          t4, 122(a0)
+    shra_r.w    t8, t8, 13
+    mul         t1, t1, t2
+    mul         t3, t3, t4
+    sw          t0, 8(v0)
+    sw          t8, 28(v0)
+    sll         t4, t5, 15
+    ins         t7, t6, 16, 16
+    mult        zero, zero
+    dpa.w.ph    $ac0, t7, s0
+    ins         t3, t1, 16, 16
+    lh          t1, 14(a1)
+    lh          t6, 14(a0)
+    dpa.w.ph    $ac0, t3, s1
+    mflo        t0, $ac0
+    mul         t5, t6, t1
+    lh          t1, 30(a1)
+    lh          t6, 30(a0)
+    lh          t2, 62(a1)
+    lh          t7, 62(a0)
+    mul         t6, t6, t1
+    subu        t8, t4, t0
+    mul         t7, t7, t2
+    addu        t0, t4, t0
+    shra_r.w    t0, t0, 13
+    lh          t1, 94(a1)
+    lh          t2, 94(a0)
+    lh          t3, 126(a1)
+    lh          t4, 126(a0)
+    shra_r.w    t8, t8, 13
+    mul         t1, t1, t2
+    mul         t3, t3, t4
+    sw          t0, 12(v0)
+    sw          t8, 32(v0)
+    sll         t4, t5, 15
+    ins         t7, t6, 16, 16
+    mult        zero, zero
+    dpa.w.ph    $ac0, t7, s0
+    ins         t3, t1, 16, 16
+    dpa.w.ph    $ac0, t3, s1
+    mflo        t0, $ac0
+    lw          t9, 0(a2)
+    lw          t3, 0(v0)
+    lw          t7, 4(v0)
+    lw          t1, 8(v0)
+    addu        t9, t9, a3
+    sll         t3, t3, 15
+    subu        t8, t4, t0
+    addu        t0, t4, t0
+    shra_r.w    t0, t0, 13
+    shra_r.w    t8, t8, 13
+    sw          t0, 16(v0)
+    sw          t8, 36(v0)
+    lw          t5, 12(v0)
+    lw          t6, 16(v0)
+    mult        t7, s2
+    madd        t1, s3
+    madd        t5, s4
+    madd        t6, s5
+    lw          t5, 24(v0)
+    lw          t7, 28(v0)
+    mflo        t0, $ac0
+    lw          t8, 32(v0)
+    lw          t2, 36(v0)
+    mult        $ac1, t5, s2
+    madd        $ac1, t7, s3
+    madd        $ac1, t8, s4
+    madd        $ac1, t2, s5
+    addu        t1, t3, t0
+    subu        t6, t3, t0
+    shra_r.w    t1, t1, 20
+    shra_r.w    t6, t6, 20
+    mflo        t4, $ac1
+    shll_s.w    t1, t1, 24
+    shll_s.w    t6, t6, 24
+    sra         t1, t1, 24
+    sra         t6, t6, 24
+    addiu       t1, t1, 128
+    addiu       t6, t6, 128
+    lw          t0, 20(v0)
+    sb          t1, 0(t9)
+    sb          t6, 1(t9)
+    sll         t0, t0, 15
+    lw          t9, 4(a2)
+    addu        t1, t0, t4
+    subu        t6, t0, t4
+    addu        t9, t9, a3
+    shra_r.w    t1, t1, 20
+    shra_r.w    t6, t6, 20
+    shll_s.w    t1, t1, 24
+    shll_s.w    t6, t6, 24
+    sra         t1, t1, 24
+    sra         t6, t6, 24
+    addiu       t1, t1, 128
+    addiu       t6, t6, 128
+    sb          t1, 0(t9)
+    sb          t6, 1(t9)
+    addiu       sp, sp, 40
+
+    RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4, s5
+
+    j           ra
+     nop
+
+END(jsimd_idct_2x2_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_idct_4x4_dspr2)
+/*
+ * a0     = compptr->dct_table
+ * a1     = coef_block
+ * a2     = output_buf
+ * a3     = output_col
+ * 16(sp) = workspace[DCTSIZE*4]  (buffers data between passes)
+ */
+    .set at
+
+    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    lw          v1, 48(sp)
+    move        t0, a1
+    move        t1, v1
+    li          t9, 4
+    li          s0, 0x2e75f93e
+    li          s1, 0x21f9ba79
+    li          s2, 0xecc2efb0
+    li          s3, 0x52031ccd
+
+0:
+    lh          s6, 32(t0)      /* inptr[DCTSIZE*2] */
+    lh          t6, 32(a0)      /* quantptr[DCTSIZE*2] */
+    lh          s7, 96(t0)      /* inptr[DCTSIZE*6] */
+    lh          t7, 96(a0)      /* quantptr[DCTSIZE*6] */
+    mul         t6, s6, t6      /* z2 = (inptr[DCTSIZE*2] *
+                                         quantptr[DCTSIZE*2]) */
+    lh          s4, 0(t0)       /* inptr[DCTSIZE*0] */
+    mul         t7, s7, t7      /* z3 = (inptr[DCTSIZE*6] *
+                                         quantptr[DCTSIZE*6]) */
+    lh          s5, 0(a0)       /* quantptr[0] */
+    li          s6, 15137
+    li          s7, 6270
+    mul         t2, s4, s5      /* tmp0 = (inptr[0] * quantptr[0]) */
+    mul         t6, s6, t6      /* z2 = (inptr[DCTSIZE*2] *
+                                         quantptr[DCTSIZE*2]) */
+    lh          t5, 112(t0)     /* inptr[DCTSIZE*7] */
+    mul         t7, s7, t7      /* z3 = (inptr[DCTSIZE*6] *
+                                         quantptr[DCTSIZE*6]) */
+    lh          s4, 112(a0)     /* quantptr[DCTSIZE*7] */
+    lh          v0, 80(t0)      /* inptr[DCTSIZE*5] */
+    lh          s5, 80(a0)      /* quantptr[DCTSIZE*5] */
+    lh          s6, 48(a0)      /* quantptr[DCTSIZE*3] */
+    sll         t2, t2, 14      /* tmp0 <<= (CONST_BITS+1) */
+    lh          s7, 16(a0)      /* quantptr[DCTSIZE*1] */
+    lh          t8, 16(t0)      /* inptr[DCTSIZE*1] */
+    subu        t6, t6, t7      /* tmp2 =
+                                     MULTIPLY(z2, t5) - MULTIPLY(z3, t6) */
+    lh          t7, 48(t0)      /* inptr[DCTSIZE*3] */
+    mul         t5, s4, t5      /* z1 = (inptr[DCTSIZE*7] *
+                                         quantptr[DCTSIZE*7]) */
+    mul         v0, s5, v0      /* z2 = (inptr[DCTSIZE*5] *
+                                         quantptr[DCTSIZE*5]) */
+    mul         t7, s6, t7      /* z3 = (inptr[DCTSIZE*3] *
+                                         quantptr[DCTSIZE*3]) */
+    mul         t8, s7, t8      /* z4 = (inptr[DCTSIZE*1] *
+                                         quantptr[DCTSIZE*1]) */
+    addu        t3, t2, t6      /* tmp10 = tmp0 + z2 */
+    subu        t4, t2, t6      /* tmp10 = tmp0 - z2 */
+    mult        $ac0, zero, zero
+    mult        $ac1, zero, zero
+    ins         t5, v0, 16, 16
+    ins         t7, t8, 16, 16
+    addiu       t9, t9, -1
+    dpa.w.ph    $ac0, t5, s0
+    dpa.w.ph    $ac0, t7, s1
+    dpa.w.ph    $ac1, t5, s2
+    dpa.w.ph    $ac1, t7, s3
+    mflo        s4, $ac0
+    mflo        s5, $ac1
+    addiu       a0, a0, 2
+    addiu       t1, t1, 4
+    addiu       t0, t0, 2
+    addu        t6, t4, s4
+    subu        t5, t4, s4
+    addu        s6, t3, s5
+    subu        s7, t3, s5
+    shra_r.w    t6, t6, 12      /* DESCALE(tmp12 + temp1, 12) */
+    shra_r.w    t5, t5, 12      /* DESCALE(tmp12 - temp1, 12) */
+    shra_r.w    s6, s6, 12      /* DESCALE(tmp10 + temp2, 12) */
+    shra_r.w    s7, s7, 12      /* DESCALE(tmp10 - temp2, 12) */
+    sw          t6, 28(t1)
+    sw          t5, 60(t1)
+    sw          s6, -4(t1)
+    bgtz        t9, 0b
+     sw         s7, 92(t1)
+    /* second loop three pass */
+    li          t9, 3
+1:
+    lh          s6, 34(t0)      /* inptr[DCTSIZE*2] */
+    lh          t6, 34(a0)      /* quantptr[DCTSIZE*2] */
+    lh          s7, 98(t0)      /* inptr[DCTSIZE*6] */
+    lh          t7, 98(a0)      /* quantptr[DCTSIZE*6] */
+    mul         t6, s6, t6      /* z2 = (inptr[DCTSIZE*2] *
+                                         quantptr[DCTSIZE*2]) */
+    lh          s4, 2(t0)       /* inptr[DCTSIZE*0] */
+    mul         t7, s7, t7      /* z3 = (inptr[DCTSIZE*6] *
+                                         quantptr[DCTSIZE*6]) */
+    lh          s5, 2(a0)       /* quantptr[DCTSIZE*0] */
+    li          s6, 15137
+    li          s7, 6270
+    mul         t2, s4, s5      /* tmp0 = (inptr[0] * quantptr[0]) */
+    mul         v0, s6, t6      /* z2 = (inptr[DCTSIZE*2] *
+                                         quantptr[DCTSIZE*2]) */
+    lh          t5, 114(t0)     /* inptr[DCTSIZE*7] */
+    mul         t7, s7, t7      /* z3 = (inptr[DCTSIZE*6] *
+                                         quantptr[DCTSIZE*6]) */
+    lh          s4, 114(a0)     /* quantptr[DCTSIZE*7] */
+    lh          s5, 82(a0)      /* quantptr[DCTSIZE*5] */
+    lh          t6, 82(t0)      /* inptr[DCTSIZE*5] */
+    sll         t2, t2, 14      /* tmp0 <<= (CONST_BITS+1) */
+    lh          s6, 50(a0)      /* quantptr[DCTSIZE*3] */
+    lh          t8, 18(t0)      /* inptr[DCTSIZE*1] */
+    subu        v0, v0, t7      /* tmp2 =
+                                     MULTIPLY(z2, t5) - MULTIPLY(z3, t6) */
+    lh          t7, 50(t0)      /* inptr[DCTSIZE*3] */
+    lh          s7, 18(a0)      /* quantptr[DCTSIZE*1] */
+    mul         t5, s4, t5      /* z1 = (inptr[DCTSIZE*7] *
+                                         quantptr[DCTSIZE*7]) */
+    mul         t6, s5, t6      /* z2 = (inptr[DCTSIZE*5] *
+                                         quantptr[DCTSIZE*5]) */
+    mul         t7, s6, t7      /* z3 = (inptr[DCTSIZE*3] *
+                                         quantptr[DCTSIZE*3]) */
+    mul         t8, s7, t8      /* z4 = (inptr[DCTSIZE*1] *
+                                         quantptr[DCTSIZE*1]) */
+    addu        t3, t2, v0      /* tmp10 = tmp0 + z2 */
+    subu        t4, t2, v0      /* tmp10 = tmp0 - z2 */
+    mult        $ac0, zero, zero
+    mult        $ac1, zero, zero
+    ins         t5, t6, 16, 16
+    ins         t7, t8, 16, 16
+    dpa.w.ph    $ac0, t5, s0
+    dpa.w.ph    $ac0, t7, s1
+    dpa.w.ph    $ac1, t5, s2
+    dpa.w.ph    $ac1, t7, s3
+    mflo        t5, $ac0
+    mflo        t6, $ac1
+    addiu       t9, t9, -1
+    addiu       t0, t0, 2
+    addiu       a0, a0, 2
+    addiu       t1, t1, 4
+    addu        s5, t4, t5
+    subu        s4, t4, t5
+    addu        s6, t3, t6
+    subu        s7, t3, t6
+    shra_r.w    s5, s5, 12      /* DESCALE(tmp12 + temp1, 12) */
+    shra_r.w    s4, s4, 12      /* DESCALE(tmp12 - temp1, 12) */
+    shra_r.w    s6, s6, 12      /* DESCALE(tmp10 + temp2, 12) */
+    shra_r.w    s7, s7, 12      /* DESCALE(tmp10 - temp2, 12) */
+    sw          s5, 32(t1)
+    sw          s4, 64(t1)
+    sw          s6, 0(t1)
+    bgtz        t9, 1b
+     sw         s7, 96(t1)
+    move        t1, v1
+    li          s4, 15137
+    lw          s6, 8(t1)       /* wsptr[2] */
+    li          s5, 6270
+    lw          s7, 24(t1)      /* wsptr[6] */
+    mul         s4, s4, s6      /* MULTIPLY((JLONG)wsptr[2],
+                                            FIX_1_847759065) */
+    lw          t2, 0(t1)       /* wsptr[0] */
+    mul         s5, s5, s7      /* MULTIPLY((JLONG)wsptr[6],
+                                            -FIX_0_765366865) */
+    lh          t5, 28(t1)      /* wsptr[7] */
+    lh          t6, 20(t1)      /* wsptr[5] */
+    lh          t7, 12(t1)      /* wsptr[3] */
+    lh          t8, 4(t1)       /* wsptr[1] */
+    ins         t5, t6, 16, 16
+    ins         t7, t8, 16, 16
+    mult        $ac0, zero, zero
+    dpa.w.ph    $ac0, t5, s0
+    dpa.w.ph    $ac0, t7, s1
+    mult        $ac1, zero, zero
+    dpa.w.ph    $ac1, t5, s2
+    dpa.w.ph    $ac1, t7, s3
+    sll         t2, t2, 14      /* tmp0 =
+                                     ((JLONG)wsptr[0]) << (CONST_BITS+1) */
+    mflo        s6, $ac0
+    /* MULTIPLY(wsptr[2], FIX_1_847759065) +
+       MULTIPLY(wsptr[6], -FIX_0_765366865) */
+    subu        s4, s4, s5
+    addu        t3, t2, s4      /* tmp10 = tmp0 + z2 */
+    mflo        s7, $ac1
+    subu        t4, t2, s4      /* tmp10 = tmp0 - z2 */
+    addu        t7, t4, s6
+    subu        t8, t4, s6
+    addu        t5, t3, s7
+    subu        t6, t3, s7
+    shra_r.w    t5, t5, 19      /* DESCALE(tmp10 + temp2, 19) */
+    shra_r.w    t6, t6, 19      /* DESCALE(tmp10 - temp2, 19) */
+    shra_r.w    t7, t7, 19      /* DESCALE(tmp12 + temp1, 19) */
+    shra_r.w    t8, t8, 19      /* DESCALE(tmp12 - temp1, 19) */
+    sll         s4, t9, 2
+    lw          v0, 0(a2)       /* output_buf[ctr] */
+    shll_s.w    t5, t5, 24
+    shll_s.w    t6, t6, 24
+    shll_s.w    t7, t7, 24
+    shll_s.w    t8, t8, 24
+    sra         t5, t5, 24
+    sra         t6, t6, 24
+    sra         t7, t7, 24
+    sra         t8, t8, 24
+    addu        v0, v0, a3      /* outptr = output_buf[ctr] + output_col */
+    addiu       t5, t5, 128
+    addiu       t6, t6, 128
+    addiu       t7, t7, 128
+    addiu       t8, t8, 128
+    sb          t5, 0(v0)
+    sb          t7, 1(v0)
+    sb          t8, 2(v0)
+    sb          t6, 3(v0)
+    /* 2 */
+    li          s4, 15137
+    lw          s6, 40(t1)      /* wsptr[2] */
+    li          s5, 6270
+    lw          s7, 56(t1)      /* wsptr[6] */
+    mul         s4, s4, s6      /* MULTIPLY((JLONG)wsptr[2],
+                                            FIX_1_847759065) */
+    lw          t2, 32(t1)      /* wsptr[0] */
+    mul         s5, s5, s7      /* MULTIPLY((JLONG)wsptr[6],
+                                            -FIX_0_765366865) */
+    lh          t5, 60(t1)      /* wsptr[7] */
+    lh          t6, 52(t1)      /* wsptr[5] */
+    lh          t7, 44(t1)      /* wsptr[3] */
+    lh          t8, 36(t1)      /* wsptr[1] */
+    ins         t5, t6, 16, 16
+    ins         t7, t8, 16, 16
+    mult        $ac0, zero, zero
+    dpa.w.ph    $ac0, t5, s0
+    dpa.w.ph    $ac0, t7, s1
+    mult        $ac1, zero, zero
+    dpa.w.ph    $ac1, t5, s2
+    dpa.w.ph    $ac1, t7, s3
+    sll         t2, t2, 14      /* tmp0 =
+                                     ((JLONG)wsptr[0]) << (CONST_BITS+1) */
+    mflo        s6, $ac0
+    /* MULTIPLY(wsptr[2], FIX_1_847759065) +
+       MULTIPLY(wsptr[6], -FIX_0_765366865) */
+    subu        s4, s4, s5
+    addu        t3, t2, s4      /* tmp10 = tmp0 + z2 */
+    mflo        s7, $ac1
+    subu        t4, t2, s4      /* tmp10 = tmp0 - z2 */
+    addu        t7, t4, s6
+    subu        t8, t4, s6
+    addu        t5, t3, s7
+    subu        t6, t3, s7
+    shra_r.w    t5, t5, 19      /* DESCALE(tmp10 + temp2,
+                                           CONST_BITS-PASS1_BITS+1) */
+    shra_r.w    t6, t6, 19      /* DESCALE(tmp10 - temp2,
+                                           CONST_BITS-PASS1_BITS+1) */
+    shra_r.w    t7, t7, 19      /* DESCALE(tmp12 + temp1,
+                                           CONST_BITS-PASS1_BITS+1) */
+    shra_r.w    t8, t8, 19      /* DESCALE(tmp12 - temp1,
+                                           CONST_BITS-PASS1_BITS+1) */
+    sll         s4, t9, 2
+    lw          v0, 4(a2)       /* output_buf[ctr] */
+    shll_s.w    t5, t5, 24
+    shll_s.w    t6, t6, 24
+    shll_s.w    t7, t7, 24
+    shll_s.w    t8, t8, 24
+    sra         t5, t5, 24
+    sra         t6, t6, 24
+    sra         t7, t7, 24
+    sra         t8, t8, 24
+    addu        v0, v0, a3      /* outptr = output_buf[ctr] + output_col */
+    addiu       t5, t5, 128
+    addiu       t6, t6, 128
+    addiu       t7, t7, 128
+    addiu       t8, t8, 128
+    sb          t5, 0(v0)
+    sb          t7, 1(v0)
+    sb          t8, 2(v0)
+    sb          t6, 3(v0)
+    /* 3 */
+    li          s4, 15137
+    lw          s6, 72(t1)      /* wsptr[2] */
+    li          s5, 6270
+    lw          s7, 88(t1)      /* wsptr[6] */
+    mul         s4, s4, s6      /* MULTIPLY((JLONG)wsptr[2],
+                                            FIX_1_847759065) */
+    lw          t2, 64(t1)      /* wsptr[0] */
+    mul         s5, s5, s7      /* MULTIPLY((JLONG)wsptr[6],
+                                            -FIX_0_765366865) */
+    lh          t5, 92(t1)      /* wsptr[7] */
+    lh          t6, 84(t1)      /* wsptr[5] */
+    lh          t7, 76(t1)      /* wsptr[3] */
+    lh          t8, 68(t1)      /* wsptr[1] */
+    ins         t5, t6, 16, 16
+    ins         t7, t8, 16, 16
+    mult        $ac0, zero, zero
+    dpa.w.ph    $ac0, t5, s0
+    dpa.w.ph    $ac0, t7, s1
+    mult        $ac1, zero, zero
+    dpa.w.ph    $ac1, t5, s2
+    dpa.w.ph    $ac1, t7, s3
+    sll         t2, t2, 14      /* tmp0 =
+                                     ((JLONG)wsptr[0]) << (CONST_BITS+1) */
+    mflo        s6, $ac0
+    /* MULTIPLY(wsptr[2], FIX_1_847759065) +
+       MULTIPLY(wsptr[6], -FIX_0_765366865) */
+    subu        s4, s4, s5
+    addu        t3, t2, s4      /* tmp10 = tmp0 + z2 */
+    mflo        s7, $ac1
+    subu        t4, t2, s4      /* tmp10 = tmp0 - z2 */
+    addu        t7, t4, s6
+    subu        t8, t4, s6
+    addu        t5, t3, s7
+    subu        t6, t3, s7
+    shra_r.w    t5, t5, 19      /* DESCALE(tmp10 + temp2, 19) */
+    shra_r.w    t6, t6, 19      /* DESCALE(tmp10 - temp2, 19) */
+    shra_r.w    t7, t7, 19      /* DESCALE(tmp12 + temp1, 19) */
+    shra_r.w    t8, t8, 19      /* DESCALE(tmp12 - temp1, 19) */
+    sll         s4, t9, 2
+    lw          v0, 8(a2)       /* output_buf[ctr] */
+    shll_s.w    t5, t5, 24
+    shll_s.w    t6, t6, 24
+    shll_s.w    t7, t7, 24
+    shll_s.w    t8, t8, 24
+    sra         t5, t5, 24
+    sra         t6, t6, 24
+    sra         t7, t7, 24
+    sra         t8, t8, 24
+    addu        v0, v0, a3      /* outptr = output_buf[ctr] + output_col */
+    addiu       t5, t5, 128
+    addiu       t6, t6, 128
+    addiu       t7, t7, 128
+    addiu       t8, t8, 128
+    sb          t5, 0(v0)
+    sb          t7, 1(v0)
+    sb          t8, 2(v0)
+    sb          t6, 3(v0)
+    li          s4, 15137
+    lw          s6, 104(t1)     /* wsptr[2] */
+    li          s5, 6270
+    lw          s7, 120(t1)     /* wsptr[6] */
+    mul         s4, s4, s6      /* MULTIPLY((JLONG)wsptr[2],
+                                            FIX_1_847759065) */
+    lw          t2, 96(t1)      /* wsptr[0] */
+    mul         s5, s5, s7      /* MULTIPLY((JLONG)wsptr[6],
+                                            -FIX_0_765366865) */
+    lh          t5, 124(t1)     /* wsptr[7] */
+    lh          t6, 116(t1)     /* wsptr[5] */
+    lh          t7, 108(t1)     /* wsptr[3] */
+    lh          t8, 100(t1)     /* wsptr[1] */
+    ins         t5, t6, 16, 16
+    ins         t7, t8, 16, 16
+    mult        $ac0, zero, zero
+    dpa.w.ph    $ac0, t5, s0
+    dpa.w.ph    $ac0, t7, s1
+    mult        $ac1, zero, zero
+    dpa.w.ph    $ac1, t5, s2
+    dpa.w.ph    $ac1, t7, s3
+    sll         t2, t2, 14      /* tmp0 =
+                                     ((JLONG)wsptr[0]) << (CONST_BITS+1) */
+    mflo        s6, $ac0
+    /* MULTIPLY(wsptr[2], FIX_1_847759065) +
+       MULTIPLY(wsptr[6], -FIX_0_765366865) */
+    subu        s4, s4, s5
+    addu        t3, t2, s4      /* tmp10 = tmp0 + z2; */
+    mflo        s7, $ac1
+    subu        t4, t2, s4      /* tmp10 = tmp0 - z2; */
+    addu        t7, t4, s6
+    subu        t8, t4, s6
+    addu        t5, t3, s7
+    subu        t6, t3, s7
+    shra_r.w    t5, t5, 19      /* DESCALE(tmp10 + temp2, 19) */
+    shra_r.w    t6, t6, 19      /* DESCALE(tmp10 - temp2, 19) */
+    shra_r.w    t7, t7, 19      /* DESCALE(tmp12 + temp1, 19) */
+    shra_r.w    t8, t8, 19      /* DESCALE(tmp12 - temp1, 19) */
+    sll         s4, t9, 2
+    lw          v0, 12(a2)      /* output_buf[ctr] */
+    shll_s.w    t5, t5, 24
+    shll_s.w    t6, t6, 24
+    shll_s.w    t7, t7, 24
+    shll_s.w    t8, t8, 24
+    sra         t5, t5, 24
+    sra         t6, t6, 24
+    sra         t7, t7, 24
+    sra         t8, t8, 24
+    addu        v0, v0, a3      /* outptr = output_buf[ctr] + output_col */
+    addiu       t5, t5, 128
+    addiu       t6, t6, 128
+    addiu       t7, t7, 128
+    addiu       t8, t8, 128
+    sb          t5, 0(v0)
+    sb          t7, 1(v0)
+    sb          t8, 2(v0)
+    sb          t6, 3(v0)
+
+    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    j           ra
+     nop
+END(jsimd_idct_4x4_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_idct_6x6_dspr2)
+/*
+ * a0 = compptr->dct_table
+ * a1 = coef_block
+ * a2 = output_buf
+ * a3 = output_col
+ */
+    .set at
+
+    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    addiu       sp, sp, -144
+    move        v0, sp
+    addiu       v1, v0, 24
+    addiu       t9, zero, 5793
+    addiu       s0, zero, 10033
+    addiu       s1, zero, 2998
+
+1:
+    lh          s2, 0(a0)       /* q0 = quantptr[ 0] */
+    lh          s3, 32(a0)      /* q1 = quantptr[16] */
+    lh          s4, 64(a0)      /* q2 = quantptr[32] */
+    lh          t2, 64(a1)      /* tmp2 = inptr[32] */
+    lh          t1, 32(a1)      /* tmp1 = inptr[16] */
+    lh          t0, 0(a1)       /* tmp0 = inptr[ 0] */
+    mul         t2, t2, s4      /* tmp2 = tmp2 * q2 */
+    mul         t1, t1, s3      /* tmp1 = tmp1 * q1 */
+    mul         t0, t0, s2      /* tmp0 = tmp0 * q0 */
+    lh          t6, 16(a1)      /* z1 = inptr[ 8] */
+    lh          t8, 80(a1)      /* z3 = inptr[40] */
+    lh          t7, 48(a1)      /* z2 = inptr[24] */
+    lh          s2, 16(a0)      /* q0 = quantptr[ 8] */
+    lh          s4, 80(a0)      /* q2 = quantptr[40] */
+    lh          s3, 48(a0)      /* q1 = quantptr[24] */
+    mul         t2, t2, t9      /* tmp2 = tmp2 * 5793 */
+    mul         t1, t1, s0      /* tmp1 = tmp1 * 10033 */
+    sll         t0, t0, 13      /* tmp0 = tmp0 << 13 */
+    mul         t6, t6, s2      /* z1 = z1 * q0 */
+    mul         t8, t8, s4      /* z3 = z3 * q2 */
+    mul         t7, t7, s3      /* z2 = z2 * q1 */
+    addu        t3, t0, t2      /* tmp10 = tmp0 + tmp2 */
+    sll         t2, t2, 1       /* tmp2 = tmp2 << 2 */
+    subu        t4, t0, t2      /* tmp11 = tmp0 - tmp2; */
+    subu        t5, t3, t1      /* tmp12 = tmp10 - tmp1 */
+    addu        t3, t3, t1      /* tmp10 = tmp10 + tmp1 */
+    addu        t1, t6, t8      /* tmp1 = z1 + z3 */
+    mul         t1, t1, s1      /* tmp1 = tmp1 * 2998 */
+    shra_r.w    t4, t4, 11      /* tmp11 = (tmp11 + 1024) >> 11 */
+    subu        t2, t6, t8      /* tmp2 = z1 - z3 */
+    subu        t2, t2, t7      /* tmp2 = tmp2 - z2 */
+    sll         t2, t2, 2       /* tmp2 = tmp2 << 2 */
+    addu        t0, t6, t7      /* tmp0 = z1 + z2 */
+    sll         t0, t0, 13      /* tmp0 = tmp0 << 13 */
+    subu        s2, t8, t7      /* q0 = z3 - z2 */
+    sll         s2, s2, 13      /* q0 = q0 << 13 */
+    addu        t0, t0, t1      /* tmp0 = tmp0 + tmp1 */
+    addu        t1, s2, t1      /* tmp1 = q0 + tmp1 */
+    addu        s2, t4, t2      /* q0 = tmp11 + tmp2 */
+    subu        s3, t4, t2      /* q1 = tmp11 - tmp2 */
+    addu        t6, t3, t0      /* z1 = tmp10 + tmp0 */
+    subu        t7, t3, t0      /* z2 = tmp10 - tmp0 */
+    addu        t4, t5, t1      /* tmp11 = tmp12 + tmp1 */
+    subu        t5, t5, t1      /* tmp12 = tmp12 - tmp1 */
+    shra_r.w    t6, t6, 11      /* z1 = (z1 + 1024) >> 11 */
+    shra_r.w    t7, t7, 11      /* z2 = (z2 + 1024) >> 11 */
+    shra_r.w    t4, t4, 11      /* tmp11 = (tmp11 + 1024) >> 11 */
+    shra_r.w    t5, t5, 11      /* tmp12 = (tmp12 + 1024) >> 11 */
+    sw          s2, 24(v0)
+    sw          s3, 96(v0)
+    sw          t6, 0(v0)
+    sw          t7, 120(v0)
+    sw          t4, 48(v0)
+    sw          t5, 72(v0)
+    addiu       v0, v0, 4
+    addiu       a1, a1, 2
+    bne         v0, v1, 1b
+     addiu      a0, a0, 2
+
+    /* Pass 2: process 6 rows from work array, store into output array. */
+    move        v0, sp
+    addiu       v1, v0, 144
+
+2:
+    lw          t0, 0(v0)
+    lw          t2, 16(v0)
+    lw          s5, 0(a2)
+    addiu       t0, t0, 16
+    sll         t0, t0, 13
+    mul         t3, t2, t9
+    lw          t6, 4(v0)
+    lw          t8, 20(v0)
+    lw          t7, 12(v0)
+    addu        s5, s5, a3
+    addu        s6, t6, t8
+    mul         s6, s6, s1
+    addu        t1, t0, t3
+    subu        t4, t0, t3
+    subu        t4, t4, t3
+    lw          t3, 8(v0)
+    mul         t0, t3, s0
+    addu        s7, t6, t7
+    sll         s7, s7, 13
+    addu        s7, s6, s7
+    subu        t2, t8, t7
+    sll         t2, t2, 13
+    addu        t2, s6, t2
+    subu        s6, t6, t7
+    subu        s6, s6, t8
+    sll         s6, s6, 13
+    addu        t3, t1, t0
+    subu        t5, t1, t0
+    addu        t6, t3, s7
+    subu        t3, t3, s7
+    addu        t7, t4, s6
+    subu        t4, t4, s6
+    addu        t8, t5, t2
+    subu        t5, t5, t2
+    shll_s.w    t6, t6, 6
+    shll_s.w    t3, t3, 6
+    shll_s.w    t7, t7, 6
+    shll_s.w    t4, t4, 6
+    shll_s.w    t8, t8, 6
+    shll_s.w    t5, t5, 6
+    sra         t6, t6, 24
+    addiu       t6, t6, 128
+    sra         t3, t3, 24
+    addiu       t3, t3, 128
+    sb          t6, 0(s5)
+    sra         t7, t7, 24
+    addiu       t7, t7, 128
+    sb          t3, 5(s5)
+    sra         t4, t4, 24
+    addiu       t4, t4, 128
+    sb          t7, 1(s5)
+    sra         t8, t8, 24
+    addiu       t8, t8, 128
+    sb          t4, 4(s5)
+    addiu       v0, v0, 24
+    sra         t5, t5, 24
+    addiu       t5, t5, 128
+    sb          t8, 2(s5)
+    addiu       a2, a2,  4
+    bne         v0, v1, 2b
+     sb         t5, 3(s5)
+
+    addiu       sp, sp, 144
+
+    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    j           ra
+     nop
+
+END(jsimd_idct_6x6_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_idct_12x12_pass1_dspr2)
+/*
+ * a0 = compptr->dct_table
+ * a1 = coef_block
+ * a2 = workspace
+ */
+    SAVE_REGS_ON_STACK 16, s0, s1, s2, s3
+
+    li          a3, 8
+
+1:
+    /* odd part */
+    lh          t0, 48(a1)
+    lh          t1, 48(a0)
+    lh          t2, 16(a1)
+    lh          t3, 16(a0)
+    lh          t4, 80(a1)
+    lh          t5, 80(a0)
+    lh          t6, 112(a1)
+    lh          t7, 112(a0)
+    mul         t0, t0, t1      /* z2 */
+    mul         t1, t2, t3      /* z1 */
+    mul         t2, t4, t5      /* z3 */
+    mul         t3, t6, t7      /* z4 */
+    li          t4, 10703       /* FIX(1.306562965) */
+    li          t5, 4433        /* FIX_0_541196100 */
+    li          t6, 7053        /* FIX(0.860918669) */
+    mul         t4, t0, t4      /* tmp11 */
+    mul         t5, t0, t5      /* -tmp14 */
+    addu        t7, t1, t2      /* tmp10 */
+    addu        t8, t7, t3      /* tmp10 + z4 */
+    mul         t6, t6, t8      /* tmp15 */
+    li          t8, 2139        /* FIX(0.261052384) */
+    mul         t8, t7, t8      /* MULTIPLY(tmp10, FIX(0.261052384)) */
+    li          t7, 2295        /* FIX(0.280143716) */
+    mul         t7, t1, t7      /* MULTIPLY(z1, FIX(0.280143716)) */
+    addu        t9, t2, t3      /* z3 + z4 */
+    li          s0, 8565        /* FIX(1.045510580) */
+    mul         t9, t9, s0      /* -tmp13 */
+    li          s0, 12112       /* FIX(1.478575242) */
+    mul         s0, t2, s0      /* MULTIPLY(z3, FIX(1.478575242) */
+    li          s1, 12998       /* FIX(1.586706681) */
+    mul         s1, t3, s1      /* MULTIPLY(z4, FIX(1.586706681)) */
+    li          s2, 5540        /* FIX(0.676326758) */
+    mul         s2, t1, s2      /* MULTIPLY(z1, FIX(0.676326758)) */
+    li          s3, 16244       /* FIX(1.982889723) */
+    mul         s3, t3, s3      /* MULTIPLY(z4, FIX(1.982889723)) */
+    subu        t1, t1, t3      /* z1-=z4 */
+    subu        t0, t0, t2      /* z2-=z3 */
+    addu        t2, t0, t1      /* z1+z2 */
+    li          t3, 4433        /* FIX_0_541196100 */
+    mul         t2, t2, t3      /* z3 */
+    li          t3, 6270        /* FIX_0_765366865 */
+    mul         t1, t1, t3      /* MULTIPLY(z1, FIX_0_765366865) */
+    li          t3, 15137       /* FIX_0_765366865 */
+    mul         t0, t0, t3      /* MULTIPLY(z2, FIX_1_847759065) */
+    addu        t8, t6, t8      /* tmp12 */
+    addu        t3, t8, t4      /* tmp12 + tmp11 */
+    addu        t3, t3, t7      /* tmp10 */
+    subu        t8, t8, t9      /* tmp12 + tmp13 */
+    addu        s0, t5, s0
+    subu        t8, t8, s0      /* tmp12 */
+    subu        t9, t6, t9
+    subu        s1, s1, t4
+    addu        t9, t9, s1      /* tmp13 */
+    subu        t6, t6, t5
+    subu        t6, t6, s2
+    subu        t6, t6, s3      /* tmp15 */
+    /* even part start */
+    lh          t4, 64(a1)
+    lh          t5, 64(a0)
+    lh          t7, 32(a1)
+    lh          s0, 32(a0)
+    lh          s1, 0(a1)
+    lh          s2, 0(a0)
+    lh          s3, 96(a1)
+    lh          v0, 96(a0)
+    mul         t4, t4, t5      /* DEQUANTIZE(inptr[DCTSIZE*4],
+                                              quantptr[DCTSIZE*4]) */
+    mul         t5, t7, s0      /* DEQUANTIZE(inptr[DCTSIZE*2],
+                                              quantptr[DCTSIZE*2]) */
+    mul         t7, s1, s2      /* DEQUANTIZE(inptr[DCTSIZE*0],
+                                              quantptr[DCTSIZE*0]) */
+    mul         s0, s3, v0      /* DEQUANTIZE(inptr[DCTSIZE*6],
+                                              quantptr[DCTSIZE*6]) */
+    /* odd part end */
+    addu        t1, t2, t1      /* tmp11 */
+    subu        t0, t2, t0      /* tmp14 */
+    /* update counter and pointers */
+    addiu       a3, a3, -1
+    addiu       a0, a0, 2
+    addiu       a1, a1, 2
+    /* even part rest */
+    li          s1, 10033
+    li          s2, 11190
+    mul         t4, t4, s1      /* z4 */
+    mul         s1, t5, s2      /* z4 */
+    sll         t5, t5, 13      /* z1 */
+    sll         t7, t7, 13
+    addiu       t7, t7, 1024    /* z3 */
+    sll         s0, s0, 13      /* z2 */
+    addu        s2, t7, t4      /* tmp10 */
+    subu        t4, t7, t4      /* tmp11 */
+    subu        s3, t5, s0      /* tmp12 */
+    addu        t2, t7, s3      /* tmp21 */
+    subu        s3, t7, s3      /* tmp24 */
+    addu        t7, s1, s0      /* tmp12 */
+    addu        v0, s2, t7      /* tmp20 */
+    subu        s2, s2, t7      /* tmp25 */
+    subu        s1, s1, t5      /* z4 - z1 */
+    subu        s1, s1, s0      /* tmp12 */
+    addu        s0, t4, s1      /* tmp22 */
+    subu        t4, t4, s1      /* tmp23 */
+    /* final output stage */
+    addu        t5, v0, t3
+    subu        v0, v0, t3
+    addu        t3, t2, t1
+    subu        t2, t2, t1
+    addu        t1, s0, t8
+    subu        s0, s0, t8
+    addu        t8, t4, t9
+    subu        t4, t4, t9
+    addu        t9, s3, t0
+    subu        s3, s3, t0
+    addu        t0, s2, t6
+    subu        s2, s2, t6
+    sra         t5, t5, 11
+    sra         t3, t3, 11
+    sra         t1, t1, 11
+    sra         t8, t8, 11
+    sra         t9, t9, 11
+    sra         t0, t0, 11
+    sra         s2, s2, 11
+    sra         s3, s3, 11
+    sra         t4, t4, 11
+    sra         s0, s0, 11
+    sra         t2, t2, 11
+    sra         v0, v0, 11
+    sw          t5, 0(a2)
+    sw          t3, 32(a2)
+    sw          t1, 64(a2)
+    sw          t8, 96(a2)
+    sw          t9, 128(a2)
+    sw          t0, 160(a2)
+    sw          s2, 192(a2)
+    sw          s3, 224(a2)
+    sw          t4, 256(a2)
+    sw          s0, 288(a2)
+    sw          t2, 320(a2)
+    sw          v0, 352(a2)
+    bgtz        a3, 1b
+     addiu      a2, a2, 4
+
+    RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3
+
+    j           ra
+     nop
+
+END(jsimd_idct_12x12_pass1_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_idct_12x12_pass2_dspr2)
+/*
+ * a0 = workspace
+ * a1 = output
+ */
+    SAVE_REGS_ON_STACK 16, s0, s1, s2, s3
+
+    li          a3, 12
+
+1:
+    /* Odd part */
+    lw          t0, 12(a0)
+    lw          t1, 4(a0)
+    lw          t2, 20(a0)
+    lw          t3, 28(a0)
+    li          t4, 10703       /* FIX(1.306562965) */
+    li          t5, 4433        /* FIX_0_541196100 */
+    mul         t4, t0, t4      /* tmp11 */
+    mul         t5, t0, t5      /* -tmp14 */
+    addu        t6, t1, t2      /* tmp10 */
+    li          t7, 2139        /* FIX(0.261052384) */
+    mul         t7, t6, t7      /* MULTIPLY(tmp10, FIX(0.261052384)) */
+    addu        t6, t6, t3      /* tmp10 + z4 */
+    li          t8, 7053        /* FIX(0.860918669) */
+    mul         t6, t6, t8      /* tmp15 */
+    li          t8, 2295        /* FIX(0.280143716) */
+    mul         t8, t1, t8      /* MULTIPLY(z1, FIX(0.280143716)) */
+    addu        t9, t2, t3      /* z3 + z4 */
+    li          s0, 8565        /* FIX(1.045510580) */
+    mul         t9, t9, s0      /* -tmp13 */
+    li          s0, 12112       /* FIX(1.478575242) */
+    mul         s0, t2, s0      /* MULTIPLY(z3, FIX(1.478575242)) */
+    li          s1, 12998       /* FIX(1.586706681) */
+    mul         s1, t3, s1      /* MULTIPLY(z4, FIX(1.586706681)) */
+    li          s2, 5540        /* FIX(0.676326758) */
+    mul         s2, t1, s2      /* MULTIPLY(z1, FIX(0.676326758)) */
+    li          s3, 16244       /* FIX(1.982889723) */
+    mul         s3, t3, s3      /* MULTIPLY(z4, FIX(1.982889723)) */
+    subu        t1, t1, t3      /* z1 -= z4 */
+    subu        t0, t0, t2      /* z2 -= z3 */
+    addu        t2, t1, t0      /* z1 + z2 */
+    li          t3, 4433        /* FIX_0_541196100 */
+    mul         t2, t2, t3      /* z3 */
+    li          t3, 6270        /* FIX_0_765366865 */
+    mul         t1, t1, t3      /* MULTIPLY(z1, FIX_0_765366865) */
+    li          t3, 15137       /* FIX_1_847759065 */
+    mul         t0, t0, t3      /* MULTIPLY(z2, FIX_1_847759065) */
+    addu        t3, t6, t7      /* tmp12 */
+    addu        t7, t3, t4
+    addu        t7, t7, t8      /* tmp10 */
+    subu        t3, t3, t9
+    subu        t3, t3, t5
+    subu        t3, t3, s0      /* tmp12 */
+    subu        t9, t6, t9
+    subu        t9, t9, t4
+    addu        t9, t9, s1      /* tmp13 */
+    subu        t6, t6, t5
+    subu        t6, t6, s2
+    subu        t6, t6, s3      /* tmp15 */
+    addu        t1, t2, t1      /* tmp11 */
+    subu        t0, t2, t0      /* tmp14 */
+    /* even part */
+    lw          t2, 16(a0)      /* z4 */
+    lw          t4, 8(a0)       /* z1 */
+    lw          t5, 0(a0)       /* z3 */
+    lw          t8, 24(a0)      /* z2 */
+    li          s0, 10033       /* FIX(1.224744871) */
+    li          s1, 11190       /* FIX(1.366025404) */
+    mul         t2, t2, s0      /* z4 */
+    mul         s0, t4, s1      /* z4 */
+    addiu       t5, t5, 0x10
+    sll         t5, t5, 13      /* z3 */
+    sll         t4, t4, 13      /* z1 */
+    sll         t8, t8, 13      /* z2 */
+    subu        s1, t4, t8      /* tmp12 */
+    addu        s2, t5, t2      /* tmp10 */
+    subu        t2, t5, t2      /* tmp11 */
+    addu        s3, t5, s1      /* tmp21 */
+    subu        s1, t5, s1      /* tmp24 */
+    addu        t5, s0, t8      /* tmp12 */
+    addu        v0, s2, t5      /* tmp20 */
+    subu        t5, s2, t5      /* tmp25 */
+    subu        t4, s0, t4
+    subu        t4, t4, t8      /* tmp12 */
+    addu        t8, t2, t4      /* tmp22 */
+    subu        t2, t2, t4      /* tmp23 */
+    /* increment counter and pointers */
+    addiu       a3, a3, -1
+    addiu       a0, a0, 32
+    /* Final stage */
+    addu        t4, v0, t7
+    subu        v0, v0, t7
+    addu        t7, s3, t1
+    subu        s3, s3, t1
+    addu        t1, t8, t3
+    subu        t8, t8, t3
+    addu        t3, t2, t9
+    subu        t2, t2, t9
+    addu        t9, s1, t0
+    subu        s1, s1, t0
+    addu        t0, t5, t6
+    subu        t5, t5, t6
+    sll         t4, t4, 4
+    sll         t7, t7, 4
+    sll         t1, t1, 4
+    sll         t3, t3, 4
+    sll         t9, t9, 4
+    sll         t0, t0, 4
+    sll         t5, t5, 4
+    sll         s1, s1, 4
+    sll         t2, t2, 4
+    sll         t8, t8, 4
+    sll         s3, s3, 4
+    sll         v0, v0, 4
+    shll_s.w    t4, t4, 2
+    shll_s.w    t7, t7, 2
+    shll_s.w    t1, t1, 2
+    shll_s.w    t3, t3, 2
+    shll_s.w    t9, t9, 2
+    shll_s.w    t0, t0, 2
+    shll_s.w    t5, t5, 2
+    shll_s.w    s1, s1, 2
+    shll_s.w    t2, t2, 2
+    shll_s.w    t8, t8, 2
+    shll_s.w    s3, s3, 2
+    shll_s.w    v0, v0, 2
+    srl         t4, t4, 24
+    srl         t7, t7, 24
+    srl         t1, t1, 24
+    srl         t3, t3, 24
+    srl         t9, t9, 24
+    srl         t0, t0, 24
+    srl         t5, t5, 24
+    srl         s1, s1, 24
+    srl         t2, t2, 24
+    srl         t8, t8, 24
+    srl         s3, s3, 24
+    srl         v0, v0, 24
+    lw          t6, 0(a1)
+    addiu       t4, t4, 0x80
+    addiu       t7, t7, 0x80
+    addiu       t1, t1, 0x80
+    addiu       t3, t3, 0x80
+    addiu       t9, t9, 0x80
+    addiu       t0, t0, 0x80
+    addiu       t5, t5, 0x80
+    addiu       s1, s1, 0x80
+    addiu       t2, t2, 0x80
+    addiu       t8, t8, 0x80
+    addiu       s3, s3, 0x80
+    addiu       v0, v0, 0x80
+    sb          t4, 0(t6)
+    sb          t7, 1(t6)
+    sb          t1, 2(t6)
+    sb          t3, 3(t6)
+    sb          t9, 4(t6)
+    sb          t0, 5(t6)
+    sb          t5, 6(t6)
+    sb          s1, 7(t6)
+    sb          t2, 8(t6)
+    sb          t8, 9(t6)
+    sb          s3, 10(t6)
+    sb          v0, 11(t6)
+    bgtz        a3, 1b
+     addiu      a1, a1, 4
+
+    RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3
+
+    jr          ra
+     nop
+
+END(jsimd_idct_12x12_pass2_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_convsamp_dspr2)
+/*
+ * a0 = sample_data
+ * a1 = start_col
+ * a2 = workspace
+ */
+    lw            t0, 0(a0)
+    li            t7, 0xff80ff80
+    addu          t0, t0, a1
+    ulw           t1, 0(t0)
+    ulw           t2, 4(t0)
+    preceu.ph.qbr t3, t1
+    preceu.ph.qbl t4, t1
+    lw            t0, 4(a0)
+    preceu.ph.qbr t5, t2
+    preceu.ph.qbl t6, t2
+    addu          t0, t0, a1
+    addu.ph       t3, t3, t7
+    addu.ph       t4, t4, t7
+    ulw           t1, 0(t0)
+    ulw           t2, 4(t0)
+    addu.ph       t5, t5, t7
+    addu.ph       t6, t6, t7
+    usw           t3, 0(a2)
+    usw           t4, 4(a2)
+    preceu.ph.qbr t3, t1
+    preceu.ph.qbl t4, t1
+    usw           t5, 8(a2)
+    usw           t6, 12(a2)
+
+    lw            t0, 8(a0)
+    preceu.ph.qbr t5, t2
+    preceu.ph.qbl t6, t2
+    addu          t0, t0, a1
+    addu.ph       t3, t3, t7
+    addu.ph       t4, t4, t7
+    ulw           t1, 0(t0)
+    ulw           t2, 4(t0)
+    addu.ph       t5, t5, t7
+    addu.ph       t6, t6, t7
+    usw           t3, 16(a2)
+    usw           t4, 20(a2)
+    preceu.ph.qbr t3, t1
+    preceu.ph.qbl t4, t1
+    usw           t5, 24(a2)
+    usw           t6, 28(a2)
+
+    lw            t0, 12(a0)
+    preceu.ph.qbr t5, t2
+    preceu.ph.qbl t6, t2
+    addu          t0, t0, a1
+    addu.ph       t3, t3, t7
+    addu.ph       t4, t4, t7
+    ulw           t1, 0(t0)
+    ulw           t2, 4(t0)
+    addu.ph       t5, t5, t7
+    addu.ph       t6, t6, t7
+    usw           t3, 32(a2)
+    usw           t4, 36(a2)
+    preceu.ph.qbr t3, t1
+    preceu.ph.qbl t4, t1
+    usw           t5, 40(a2)
+    usw           t6, 44(a2)
+
+    lw            t0, 16(a0)
+    preceu.ph.qbr t5, t2
+    preceu.ph.qbl t6, t2
+    addu          t0, t0, a1
+    addu.ph       t3, t3, t7
+    addu.ph       t4, t4, t7
+    ulw           t1, 0(t0)
+    ulw           t2, 4(t0)
+    addu.ph       t5, t5, t7
+    addu.ph       t6, t6, t7
+    usw           t3, 48(a2)
+    usw           t4, 52(a2)
+    preceu.ph.qbr t3, t1
+    preceu.ph.qbl t4, t1
+    usw           t5, 56(a2)
+    usw           t6, 60(a2)
+
+    lw            t0, 20(a0)
+    preceu.ph.qbr t5, t2
+    preceu.ph.qbl t6, t2
+    addu          t0, t0, a1
+    addu.ph       t3, t3, t7
+    addu.ph       t4, t4, t7
+    ulw           t1, 0(t0)
+    ulw           t2, 4(t0)
+    addu.ph       t5, t5, t7
+    addu.ph       t6, t6, t7
+    usw           t3, 64(a2)
+    usw           t4, 68(a2)
+    preceu.ph.qbr t3, t1
+    preceu.ph.qbl t4, t1
+    usw           t5, 72(a2)
+    usw           t6, 76(a2)
+
+    lw            t0, 24(a0)
+    preceu.ph.qbr t5, t2
+    preceu.ph.qbl t6, t2
+    addu          t0, t0, a1
+    addu.ph       t3, t3, t7
+    addu.ph       t4, t4, t7
+    ulw           t1, 0(t0)
+    ulw           t2, 4(t0)
+    addu.ph       t5, t5, t7
+    addu.ph       t6, t6, t7
+    usw           t3, 80(a2)
+    usw           t4, 84(a2)
+    preceu.ph.qbr t3, t1
+    preceu.ph.qbl t4, t1
+    usw           t5, 88(a2)
+    usw           t6, 92(a2)
+
+    lw            t0, 28(a0)
+    preceu.ph.qbr t5, t2
+    preceu.ph.qbl t6, t2
+    addu          t0, t0, a1
+    addu.ph       t3, t3, t7
+    addu.ph       t4, t4, t7
+    ulw           t1, 0(t0)
+    ulw           t2, 4(t0)
+    addu.ph       t5, t5, t7
+    addu.ph       t6, t6, t7
+    usw           t3, 96(a2)
+    usw           t4, 100(a2)
+    preceu.ph.qbr t3, t1
+    preceu.ph.qbl t4, t1
+    usw           t5, 104(a2)
+    usw           t6, 108(a2)
+    preceu.ph.qbr t5, t2
+    preceu.ph.qbl t6, t2
+    addu.ph       t3, t3, t7
+    addu.ph       t4, t4, t7
+    addu.ph       t5, t5, t7
+    addu.ph       t6, t6, t7
+    usw           t3, 112(a2)
+    usw           t4, 116(a2)
+    usw           t5, 120(a2)
+    usw           t6, 124(a2)
+
+    j             ra
+     nop
+
+END(jsimd_convsamp_dspr2)
+
+
+#ifndef __mips_soft_float
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_convsamp_float_dspr2)
+/*
+ * a0 = sample_data
+ * a1 = start_col
+ * a2 = workspace
+ */
+    .set at
+
+    lw          t0, 0(a0)
+    addu        t0, t0, a1
+    lbu         t1, 0(t0)
+    lbu         t2, 1(t0)
+    lbu         t3, 2(t0)
+    lbu         t4, 3(t0)
+    lbu         t5, 4(t0)
+    lbu         t6, 5(t0)
+    lbu         t7, 6(t0)
+    lbu         t8, 7(t0)
+    addiu       t1, t1, -128
+    addiu       t2, t2, -128
+    addiu       t3, t3, -128
+    addiu       t4, t4, -128
+    addiu       t5, t5, -128
+    addiu       t6, t6, -128
+    addiu       t7, t7, -128
+    addiu       t8, t8, -128
+    mtc1        t1, f2
+    mtc1        t2, f4
+    mtc1        t3, f6
+    mtc1        t4, f8
+    mtc1        t5, f10
+    mtc1        t6, f12
+    mtc1        t7, f14
+    mtc1        t8, f16
+    cvt.s.w     f2, f2
+    cvt.s.w     f4, f4
+    cvt.s.w     f6, f6
+    cvt.s.w     f8, f8
+    cvt.s.w     f10, f10
+    cvt.s.w     f12, f12
+    cvt.s.w     f14, f14
+    cvt.s.w     f16, f16
+    lw          t0, 4(a0)
+    swc1        f2, 0(a2)
+    swc1        f4, 4(a2)
+    swc1        f6, 8(a2)
+    addu        t0, t0, a1
+    swc1        f8, 12(a2)
+    swc1        f10, 16(a2)
+    swc1        f12, 20(a2)
+    swc1        f14, 24(a2)
+    swc1        f16, 28(a2)
+    /* elemr 1 */
+    lbu         t1, 0(t0)
+    lbu         t2, 1(t0)
+    lbu         t3, 2(t0)
+    lbu         t4, 3(t0)
+    lbu         t5, 4(t0)
+    lbu         t6, 5(t0)
+    lbu         t7, 6(t0)
+    lbu         t8, 7(t0)
+    addiu       t1, t1, -128
+    addiu       t2, t2, -128
+    addiu       t3, t3, -128
+    addiu       t4, t4, -128
+    addiu       t5, t5, -128
+    addiu       t6, t6, -128
+    addiu       t7, t7, -128
+    addiu       t8, t8, -128
+    mtc1        t1, f2
+    mtc1        t2, f4
+    mtc1        t3, f6
+    mtc1        t4, f8
+    mtc1        t5, f10
+    mtc1        t6, f12
+    mtc1        t7, f14
+    mtc1        t8, f16
+    cvt.s.w     f2, f2
+    cvt.s.w     f4, f4
+    cvt.s.w     f6, f6
+    cvt.s.w     f8, f8
+    cvt.s.w     f10, f10
+    cvt.s.w     f12, f12
+    cvt.s.w     f14, f14
+    cvt.s.w     f16, f16
+    lw          t0, 8(a0)
+    swc1        f2, 32(a2)
+    swc1        f4, 36(a2)
+    swc1        f6, 40(a2)
+    addu        t0, t0, a1
+    swc1        f8, 44(a2)
+    swc1        f10, 48(a2)
+    swc1        f12, 52(a2)
+    swc1        f14, 56(a2)
+    swc1        f16, 60(a2)
+    /* elemr 2 */
+    lbu         t1, 0(t0)
+    lbu         t2, 1(t0)
+    lbu         t3, 2(t0)
+    lbu         t4, 3(t0)
+    lbu         t5, 4(t0)
+    lbu         t6, 5(t0)
+    lbu         t7, 6(t0)
+    lbu         t8, 7(t0)
+    addiu       t1, t1, -128
+    addiu       t2, t2, -128
+    addiu       t3, t3, -128
+    addiu       t4, t4, -128
+    addiu       t5, t5, -128
+    addiu       t6, t6, -128
+    addiu       t7, t7, -128
+    addiu       t8, t8, -128
+    mtc1        t1, f2
+    mtc1        t2, f4
+    mtc1        t3, f6
+    mtc1        t4, f8
+    mtc1        t5, f10
+    mtc1        t6, f12
+    mtc1        t7, f14
+    mtc1        t8, f16
+    cvt.s.w     f2, f2
+    cvt.s.w     f4, f4
+    cvt.s.w     f6, f6
+    cvt.s.w     f8, f8
+    cvt.s.w     f10, f10
+    cvt.s.w     f12, f12
+    cvt.s.w     f14, f14
+    cvt.s.w     f16, f16
+    lw          t0, 12(a0)
+    swc1        f2, 64(a2)
+    swc1        f4, 68(a2)
+    swc1        f6, 72(a2)
+    addu        t0, t0, a1
+    swc1        f8, 76(a2)
+    swc1        f10, 80(a2)
+    swc1        f12, 84(a2)
+    swc1        f14, 88(a2)
+    swc1        f16, 92(a2)
+    /*  elemr 3 */
+    lbu         t1, 0(t0)
+    lbu         t2, 1(t0)
+    lbu         t3, 2(t0)
+    lbu         t4, 3(t0)
+    lbu         t5, 4(t0)
+    lbu         t6, 5(t0)
+    lbu         t7, 6(t0)
+    lbu         t8, 7(t0)
+    addiu       t1, t1, -128
+    addiu       t2, t2, -128
+    addiu       t3, t3, -128
+    addiu       t4, t4, -128
+    addiu       t5, t5, -128
+    addiu       t6, t6, -128
+    addiu       t7, t7, -128
+    addiu       t8, t8, -128
+    mtc1        t1, f2
+    mtc1        t2, f4
+    mtc1        t3, f6
+    mtc1        t4, f8
+    mtc1        t5, f10
+    mtc1        t6, f12
+    mtc1        t7, f14
+    mtc1        t8, f16
+    cvt.s.w     f2, f2
+    cvt.s.w     f4, f4
+    cvt.s.w     f6, f6
+    cvt.s.w     f8, f8
+    cvt.s.w     f10, f10
+    cvt.s.w     f12, f12
+    cvt.s.w     f14, f14
+    cvt.s.w     f16, f16
+    lw          t0, 16(a0)
+    swc1        f2, 96(a2)
+    swc1        f4, 100(a2)
+    swc1        f6, 104(a2)
+    addu        t0, t0, a1
+    swc1        f8, 108(a2)
+    swc1        f10, 112(a2)
+    swc1        f12, 116(a2)
+    swc1        f14, 120(a2)
+    swc1        f16, 124(a2)
+    /* elemr 4 */
+    lbu         t1, 0(t0)
+    lbu         t2, 1(t0)
+    lbu         t3, 2(t0)
+    lbu         t4, 3(t0)
+    lbu         t5, 4(t0)
+    lbu         t6, 5(t0)
+    lbu         t7, 6(t0)
+    lbu         t8, 7(t0)
+    addiu       t1, t1, -128
+    addiu       t2, t2, -128
+    addiu       t3, t3, -128
+    addiu       t4, t4, -128
+    addiu       t5, t5, -128
+    addiu       t6, t6, -128
+    addiu       t7, t7, -128
+    addiu       t8, t8, -128
+    mtc1        t1, f2
+    mtc1        t2, f4
+    mtc1        t3, f6
+    mtc1        t4, f8
+    mtc1        t5, f10
+    mtc1        t6, f12
+    mtc1        t7, f14
+    mtc1        t8, f16
+    cvt.s.w     f2, f2
+    cvt.s.w     f4, f4
+    cvt.s.w     f6, f6
+    cvt.s.w     f8, f8
+    cvt.s.w     f10, f10
+    cvt.s.w     f12, f12
+    cvt.s.w     f14, f14
+    cvt.s.w     f16, f16
+    lw          t0, 20(a0)
+    swc1        f2, 128(a2)
+    swc1        f4, 132(a2)
+    swc1        f6, 136(a2)
+    addu        t0, t0, a1
+    swc1        f8, 140(a2)
+    swc1        f10, 144(a2)
+    swc1        f12, 148(a2)
+    swc1        f14, 152(a2)
+    swc1        f16, 156(a2)
+    /* elemr 5 */
+    lbu         t1, 0(t0)
+    lbu         t2, 1(t0)
+    lbu         t3, 2(t0)
+    lbu         t4, 3(t0)
+    lbu         t5, 4(t0)
+    lbu         t6, 5(t0)
+    lbu         t7, 6(t0)
+    lbu         t8, 7(t0)
+    addiu       t1, t1, -128
+    addiu       t2, t2, -128
+    addiu       t3, t3, -128
+    addiu       t4, t4, -128
+    addiu       t5, t5, -128
+    addiu       t6, t6, -128
+    addiu       t7, t7, -128
+    addiu       t8, t8, -128
+    mtc1        t1, f2
+    mtc1        t2, f4
+    mtc1        t3, f6
+    mtc1        t4, f8
+    mtc1        t5, f10
+    mtc1        t6, f12
+    mtc1        t7, f14
+    mtc1        t8, f16
+    cvt.s.w     f2, f2
+    cvt.s.w     f4, f4
+    cvt.s.w     f6, f6
+    cvt.s.w     f8, f8
+    cvt.s.w     f10, f10
+    cvt.s.w     f12, f12
+    cvt.s.w     f14, f14
+    cvt.s.w     f16, f16
+    lw          t0, 24(a0)
+    swc1        f2, 160(a2)
+    swc1        f4, 164(a2)
+    swc1        f6, 168(a2)
+    addu        t0, t0, a1
+    swc1        f8, 172(a2)
+    swc1        f10, 176(a2)
+    swc1        f12, 180(a2)
+    swc1        f14, 184(a2)
+    swc1        f16, 188(a2)
+    /* elemr 6 */
+    lbu         t1, 0(t0)
+    lbu         t2, 1(t0)
+    lbu         t3, 2(t0)
+    lbu         t4, 3(t0)
+    lbu         t5, 4(t0)
+    lbu         t6, 5(t0)
+    lbu         t7, 6(t0)
+    lbu         t8, 7(t0)
+    addiu       t1, t1, -128
+    addiu       t2, t2, -128
+    addiu       t3, t3, -128
+    addiu       t4, t4, -128
+    addiu       t5, t5, -128
+    addiu       t6, t6, -128
+    addiu       t7, t7, -128
+    addiu       t8, t8, -128
+    mtc1        t1, f2
+    mtc1        t2, f4
+    mtc1        t3, f6
+    mtc1        t4, f8
+    mtc1        t5, f10
+    mtc1        t6, f12
+    mtc1        t7, f14
+    mtc1        t8, f16
+    cvt.s.w     f2, f2
+    cvt.s.w     f4, f4
+    cvt.s.w     f6, f6
+    cvt.s.w     f8, f8
+    cvt.s.w     f10, f10
+    cvt.s.w     f12, f12
+    cvt.s.w     f14, f14
+    cvt.s.w     f16, f16
+    lw          t0, 28(a0)
+    swc1        f2, 192(a2)
+    swc1        f4, 196(a2)
+    swc1        f6, 200(a2)
+    addu        t0, t0, a1
+    swc1        f8, 204(a2)
+    swc1        f10, 208(a2)
+    swc1        f12, 212(a2)
+    swc1        f14, 216(a2)
+    swc1        f16, 220(a2)
+    /* elemr 7 */
+    lbu         t1, 0(t0)
+    lbu         t2, 1(t0)
+    lbu         t3, 2(t0)
+    lbu         t4, 3(t0)
+    lbu         t5, 4(t0)
+    lbu         t6, 5(t0)
+    lbu         t7, 6(t0)
+    lbu         t8, 7(t0)
+    addiu       t1, t1, -128
+    addiu       t2, t2, -128
+    addiu       t3, t3, -128
+    addiu       t4, t4, -128
+    addiu       t5, t5, -128
+    addiu       t6, t6, -128
+    addiu       t7, t7, -128
+    addiu       t8, t8, -128
+    mtc1        t1, f2
+    mtc1        t2, f4
+    mtc1        t3, f6
+    mtc1        t4, f8
+    mtc1        t5, f10
+    mtc1        t6, f12
+    mtc1        t7, f14
+    mtc1        t8, f16
+    cvt.s.w     f2, f2
+    cvt.s.w     f4, f4
+    cvt.s.w     f6, f6
+    cvt.s.w     f8, f8
+    cvt.s.w     f10, f10
+    cvt.s.w     f12, f12
+    cvt.s.w     f14, f14
+    cvt.s.w     f16, f16
+    swc1        f2, 224(a2)
+    swc1        f4, 228(a2)
+    swc1        f6, 232(a2)
+    swc1        f8, 236(a2)
+    swc1        f10, 240(a2)
+    swc1        f12, 244(a2)
+    swc1        f14, 248(a2)
+    swc1        f16, 252(a2)
+
+    j           ra
+     nop
+
+END(jsimd_convsamp_float_dspr2)
+
+#endif
+
+/*****************************************************************************/
diff --git a/media/libjpeg/simd/mips/jsimd_dspr2_asm.h b/media/libjpeg/simd/mips/jsimd_dspr2_asm.h
new file mode 100644
index 0000000000..12cfda486c
--- /dev/null
+++ b/media/libjpeg/simd/mips/jsimd_dspr2_asm.h
@@ -0,0 +1,292 @@
+/*
+ * MIPS DSPr2 optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2013, MIPS Technologies, Inc., California.
+ * Copyright (C) 2018, Matthieu Darbois.
+ * All Rights Reserved.
+ * Authors:  Teodora Novkovic (teodora.novkovic@imgtec.com)
+ *           Darko Laus       (darko.laus@imgtec.com)
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define zero  $0
+#define AT    $1
+#define v0    $2
+#define v1    $3
+#define a0    $4
+#define a1    $5
+#define a2    $6
+#define a3    $7
+#define t0    $8
+#define t1    $9
+#define t2    $10
+#define t3    $11
+#define t4    $12
+#define t5    $13
+#define t6    $14
+#define t7    $15
+#define s0    $16
+#define s1    $17
+#define s2    $18
+#define s3    $19
+#define s4    $20
+#define s5    $21
+#define s6    $22
+#define s7    $23
+#define t8    $24
+#define t9    $25
+#define k0    $26
+#define k1    $27
+#define gp    $28
+#define sp    $29
+#define fp    $30
+#define s8    $30
+#define ra    $31
+
+#define f0    $f0
+#define f1    $f1
+#define f2    $f2
+#define f3    $f3
+#define f4    $f4
+#define f5    $f5
+#define f6    $f6
+#define f7    $f7
+#define f8    $f8
+#define f9    $f9
+#define f10   $f10
+#define f11   $f11
+#define f12   $f12
+#define f13   $f13
+#define f14   $f14
+#define f15   $f15
+#define f16   $f16
+#define f17   $f17
+#define f18   $f18
+#define f19   $f19
+#define f20   $f20
+#define f21   $f21
+#define f22   $f22
+#define f23   $f23
+#define f24   $f24
+#define f25   $f25
+#define f26   $f26
+#define f27   $f27
+#define f28   $f28
+#define f29   $f29
+#define f30   $f30
+#define f31   $f31
+
+#ifdef __ELF__
+#define HIDDEN_SYMBOL(symbol)  .hidden symbol;
+#else
+#define HIDDEN_SYMBOL(symbol)
+#endif
+
+/*
+ * LEAF_MIPS32R2 - declare leaf routine for MIPS32r2
+ */
+#define LEAF_MIPS32R2(symbol) \
+    .globl      symbol; \
+    HIDDEN_SYMBOL(symbol) \
+    .align      2; \
+    .type       symbol, @function; \
+    .ent        symbol, 0; \
+symbol: \
+    .frame      sp, 0, ra; \
+    .set        push; \
+    .set        arch = mips32r2; \
+    .set        noreorder; \
+    .set        noat;
+
+/*
+ * LEAF_DSPR2 - declare leaf routine for MIPS DSPr2
+ */
+#define LEAF_DSPR2(symbol) \
+LEAF_MIPS32R2(symbol) \
+    .set        dspr2;
+
+/*
+ * END - mark end of function
+ */
+#define END(function) \
+    .set        pop; \
+    .end        function; \
+    .size       function, .-function
+
+/*
+ * Checks if stack offset is big enough for storing/restoring regs_num
+ * number of register to/from stack. Stack offset must be greater than
+ * or equal to the number of bytes needed for storing registers (regs_num*4).
+ * Since MIPS ABI allows usage of first 16 bytes of stack frame (this is
+ * preserved for input arguments of the functions, already stored in a0-a3),
+ * stack size can be further optimized by utilizing this space.
+ */
+.macro CHECK_STACK_OFFSET regs_num, stack_offset
+.if \stack_offset < \regs_num * 4 - 16
+.error "Stack offset too small."
+.endif
+.endm
+
+/*
+ * Saves set of registers on stack. Maximum number of registers that
+ * can be saved on stack is limitted to 14 (a0-a3, v0-v1 and s0-s7).
+ * Stack offset is number of bytes that are added to stack pointer (sp)
+ * before registers are pushed in order to provide enough space on stack
+ * (offset must be multiple of 4, and must be big enough, as described by
+ * CHECK_STACK_OFFSET macro). This macro is intended to be used in
+ * combination with RESTORE_REGS_FROM_STACK macro. Example:
+ *  SAVE_REGS_ON_STACK      4, v0, v1, s0, s1
+ *  RESTORE_REGS_FROM_STACK 4, v0, v1, s0, s1
+ */
+.macro SAVE_REGS_ON_STACK  stack_offset = 0, r1, \
+                           r2  = 0, r3  = 0, r4  = 0, \
+                           r5  = 0, r6  = 0, r7  = 0, \
+                           r8  = 0, r9  = 0, r10 = 0, \
+                           r11 = 0, r12 = 0, r13 = 0, \
+                           r14 = 0
+.if (\stack_offset < 0) || (\stack_offset - (\stack_offset / 4) * 4)
+    .error "Stack offset must be pozitive and multiple of 4."
+.endif
+.if \stack_offset != 0
+    addiu       sp, sp, -\stack_offset
+.endif
+    sw          \r1, 0(sp)
+.if \r2 != 0
+    sw          \r2, 4(sp)
+.endif
+.if \r3 != 0
+    sw          \r3, 8(sp)
+.endif
+.if \r4 != 0
+    sw          \r4, 12(sp)
+.endif
+.if \r5 != 0
+    CHECK_STACK_OFFSET 5, \stack_offset
+    sw          \r5, 16(sp)
+.endif
+.if \r6 != 0
+    CHECK_STACK_OFFSET 6, \stack_offset
+    sw          \r6, 20(sp)
+.endif
+.if \r7 != 0
+    CHECK_STACK_OFFSET 7, \stack_offset
+    sw          \r7, 24(sp)
+.endif
+.if \r8 != 0
+    CHECK_STACK_OFFSET 8, \stack_offset
+    sw          \r8, 28(sp)
+.endif
+.if \r9 != 0
+    CHECK_STACK_OFFSET 9, \stack_offset
+    sw          \r9, 32(sp)
+.endif
+.if \r10 != 0
+    CHECK_STACK_OFFSET 10, \stack_offset
+    sw          \r10, 36(sp)
+.endif
+.if \r11 != 0
+    CHECK_STACK_OFFSET 11, \stack_offset
+    sw          \r11, 40(sp)
+.endif
+.if \r12 != 0
+    CHECK_STACK_OFFSET 12, \stack_offset
+    sw          \r12, 44(sp)
+.endif
+.if \r13 != 0
+    CHECK_STACK_OFFSET 13, \stack_offset
+    sw          \r13, 48(sp)
+.endif
+.if \r14 != 0
+    CHECK_STACK_OFFSET 14, \stack_offset
+    sw          \r14, 52(sp)
+.endif
+.endm
+
+/*
+ * Restores set of registers from stack. Maximum number of registers that
+ * can be restored from stack is limitted to 14 (a0-a3, v0-v1 and s0-s7).
+ * Stack offset is number of bytes that are added to stack pointer (sp)
+ * after registers are restored (offset must be multiple of 4, and must
+ * be big enough, as described by CHECK_STACK_OFFSET macro). This macro is
+ * intended to be used in combination with RESTORE_REGS_FROM_STACK macro.
+ * Example:
+ *  SAVE_REGS_ON_STACK      4, v0, v1, s0, s1
+ *  RESTORE_REGS_FROM_STACK 4, v0, v1, s0, s1
+ */
+.macro RESTORE_REGS_FROM_STACK  stack_offset = 0, r1, \
+                                r2  = 0, r3  = 0, r4  = 0, \
+                                r5  = 0, r6  = 0, r7  = 0, \
+                                r8  = 0, r9  = 0, r10 = 0, \
+                                r11 = 0, r12 = 0, r13 = 0, \
+                                r14 = 0
+.if (\stack_offset < 0) || (\stack_offset - (\stack_offset / 4) * 4)
+    .error "Stack offset must be pozitive and multiple of 4."
+.endif
+    lw          \r1, 0(sp)
+.if \r2 != 0
+    lw          \r2, 4(sp)
+.endif
+.if \r3 != 0
+    lw          \r3, 8(sp)
+.endif
+.if \r4 != 0
+    lw          \r4, 12(sp)
+.endif
+.if \r5 != 0
+    CHECK_STACK_OFFSET 5, \stack_offset
+    lw          \r5, 16(sp)
+.endif
+.if \r6 != 0
+    CHECK_STACK_OFFSET 6, \stack_offset
+    lw          \r6, 20(sp)
+.endif
+.if \r7 != 0
+    CHECK_STACK_OFFSET 7, \stack_offset
+    lw          \r7, 24(sp)
+.endif
+.if \r8 != 0
+    CHECK_STACK_OFFSET 8, \stack_offset
+    lw          \r8, 28(sp)
+.endif
+.if \r9 != 0
+    CHECK_STACK_OFFSET 9, \stack_offset
+    lw          \r9, 32(sp)
+.endif
+.if \r10 != 0
+    CHECK_STACK_OFFSET 10, \stack_offset
+    lw          \r10, 36(sp)
+.endif
+.if \r11 != 0
+    CHECK_STACK_OFFSET 11, \stack_offset
+    lw          \r11, 40(sp)
+.endif
+.if \r12 != 0
+    CHECK_STACK_OFFSET 12, \stack_offset
+    lw          \r12, 44(sp)
+.endif
+.if \r13 != 0
+    CHECK_STACK_OFFSET 13, \stack_offset
+    lw          \r13, 48(sp)
+.endif
+.if \r14 != 0
+    CHECK_STACK_OFFSET 14, \stack_offset
+    lw          \r14, 52(sp)
+.endif
+.if \stack_offset != 0
+    addiu       sp, sp, \stack_offset
+.endif
+.endm
diff --git a/media/libjpeg/simd/mips64/jccolext-mmi.c b/media/libjpeg/simd/mips64/jccolext-mmi.c
new file mode 100644
index 0000000000..558eb2ab10
--- /dev/null
+++ b/media/libjpeg/simd/mips64/jccolext-mmi.c
@@ -0,0 +1,455 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2014-2015, 2019, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
+ *                          All Rights Reserved.
+ * Authors:  ZhuChen     <zhuchen@loongson.cn>
+ *           SunZhangzhi <sunzhangzhi-cq@loongson.cn>
+ *           CaiWanwei   <caiwanwei@loongson.cn>
+ *           ZhangLixia  <zhanglixia-hf@loongson.cn>
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jccolor-mmi.c */
+
+
+#if RGB_RED == 0
+#define mmA  re
+#define mmB  ro
+#elif RGB_GREEN == 0
+#define mmA  ge
+#define mmB  go
+#elif RGB_BLUE == 0
+#define mmA  be
+#define mmB  bo
+#else
+#define mmA  xe
+#define mmB  xo
+#endif
+
+#if RGB_RED == 1
+#define mmC  re
+#define mmD  ro
+#elif RGB_GREEN == 1
+#define mmC  ge
+#define mmD  go
+#elif RGB_BLUE == 1
+#define mmC  be
+#define mmD  bo
+#else
+#define mmC  xe
+#define mmD  xo
+#endif
+
+#if RGB_RED == 2
+#define mmE  re
+#define mmF  ro
+#elif RGB_GREEN == 2
+#define mmE  ge
+#define mmF  go
+#elif RGB_BLUE == 2
+#define mmE  be
+#define mmF  bo
+#else
+#define mmE  xe
+#define mmF  xo
+#endif
+
+#if RGB_RED == 3
+#define mmG  re
+#define mmH  ro
+#elif RGB_GREEN == 3
+#define mmG  ge
+#define mmH  go
+#elif RGB_BLUE == 3
+#define mmG  be
+#define mmH  bo
+#else
+#define mmG  xe
+#define mmH  xo
+#endif
+
+
+void jsimd_rgb_ycc_convert_mmi(JDIMENSION image_width, JSAMPARRAY input_buf,
+                               JSAMPIMAGE output_buf, JDIMENSION output_row,
+                               int num_rows)
+{
+  JSAMPROW inptr, outptr0, outptr1, outptr2;
+  int num_cols, col;
+  __m64 re, ro, ge, go, be, bo, xe;
+#if RGB_PIXELSIZE == 4
+  __m64 xo;
+#endif
+  __m64 rgle, rghe, rglo, rgho, bgle, bghe, bglo, bgho;
+  __m64 ble, halfble, bhe, halfbhe, blo, halfblo, bho, halfbho;
+  __m64 rle, halfrle, rhe, halfrhe, rlo, halfrlo, rho, halfrho;
+  __m64 yle_rg, yhe_rg, yle_bg, yhe_bg, yle, yhe, ye;
+  __m64 ylo_rg, yho_rg, ylo_bg, yho_bg, ylo, yho, yo, y;
+  __m64 cble, cbhe, cbe, cblo, cbho, cbo, cb;
+  __m64 crle, crhe, cre, crlo, crho, cro, cr;
+
+  while (--num_rows >= 0) {
+    inptr = *input_buf++;
+    outptr0 = output_buf[0][output_row];
+    outptr1 = output_buf[1][output_row];
+    outptr2 = output_buf[2][output_row];
+    output_row++;
+
+    for (num_cols = image_width; num_cols > 0; num_cols -= 8,
+         outptr0 += 8, outptr1 += 8, outptr2 += 8) {
+
+#if RGB_PIXELSIZE == 3
+
+      if (num_cols < 8) {
+        col = num_cols * 3;
+        asm(".set noreorder\r\n"
+
+            "li       $8, 1\r\n"
+            "move     $9, %3\r\n"
+            "and      $10, $9, $8\r\n"
+            "beqz     $10, 1f\r\n"
+            "nop      \r\n"
+            "subu     $9, $9, 1\r\n"
+            "xor      $12, $12, $12\r\n"
+            "move     $13, %5\r\n"
+            PTR_ADDU  "$13, $13, $9\r\n"
+            "lbu      $12, 0($13)\r\n"
+
+            "1:       \r\n"
+            "li       $8, 2\r\n"
+            "and      $10, $9, $8\r\n"
+            "beqz     $10, 2f\r\n"
+            "nop      \r\n"
+            "subu     $9, $9, 2\r\n"
+            "xor      $11, $11, $11\r\n"
+            "move     $13, %5\r\n"
+            PTR_ADDU  "$13, $13, $9\r\n"
+            "lhu      $11, 0($13)\r\n"
+            "sll      $12, $12, 16\r\n"
+            "or       $12, $12, $11\r\n"
+
+            "2:       \r\n"
+            "dmtc1    $12, %0\r\n"
+            "li       $8, 4\r\n"
+            "and      $10, $9, $8\r\n"
+            "beqz     $10, 3f\r\n"
+            "nop      \r\n"
+            "subu     $9, $9, 4\r\n"
+            "move     $13, %5\r\n"
+            PTR_ADDU  "$13, $13, $9\r\n"
+            "lwu      $14, 0($13)\r\n"
+            "dmtc1    $14, %1\r\n"
+            "dsll32   $12, $12, 0\r\n"
+            "or       $12, $12, $14\r\n"
+            "dmtc1    $12, %0\r\n"
+
+            "3:       \r\n"
+            "li       $8, 8\r\n"
+            "and      $10, $9, $8\r\n"
+            "beqz     $10, 4f\r\n"
+            "nop      \r\n"
+            "mov.s    %1, %0\r\n"
+            "ldc1     %0, 0(%5)\r\n"
+            "li       $9, 8\r\n"
+            "j        5f\r\n"
+            "nop      \r\n"
+
+            "4:       \r\n"
+            "li       $8, 16\r\n"
+            "and      $10, $9, $8\r\n"
+            "beqz     $10, 5f\r\n"
+            "nop      \r\n"
+            "mov.s    %2, %0\r\n"
+            "ldc1     %0, 0(%5)\r\n"
+            "ldc1     %1, 8(%5)\r\n"
+
+            "5:       \r\n"
+            "nop      \r\n"
+            ".set reorder\r\n"
+
+            : "=f" (mmA), "=f" (mmG), "=f" (mmF)
+            : "r" (col), "r" (num_rows), "r" (inptr)
+            : "$f0", "$f2", "$f4", "$8", "$9", "$10", "$11", "$12", "$13",
+              "$14", "memory"
+           );
+      } else {
+        if (!(((long)inptr) & 7)) {
+          mmA = _mm_load_si64((__m64 *)&inptr[0]);
+          mmG = _mm_load_si64((__m64 *)&inptr[8]);
+          mmF = _mm_load_si64((__m64 *)&inptr[16]);
+        } else {
+          mmA = _mm_loadu_si64((__m64 *)&inptr[0]);
+          mmG = _mm_loadu_si64((__m64 *)&inptr[8]);
+          mmF = _mm_loadu_si64((__m64 *)&inptr[16]);
+        }
+        inptr += RGB_PIXELSIZE * 8;
+      }
+      mmD = _mm_srli_si64(mmA, 4 * BYTE_BIT);
+      mmA = _mm_slli_si64(mmA, 4 * BYTE_BIT);
+
+      mmA = _mm_unpackhi_pi8(mmA, mmG);
+      mmG = _mm_slli_si64(mmG, 4 * BYTE_BIT);
+
+      mmD = _mm_unpacklo_pi8(mmD, mmF);
+      mmG = _mm_unpackhi_pi8(mmG, mmF);
+
+      mmE = _mm_srli_si64(mmA, 4 * BYTE_BIT);
+      mmA = _mm_slli_si64(mmA, 4 * BYTE_BIT);
+
+      mmA = _mm_unpackhi_pi8(mmA, mmD);
+      mmD = _mm_slli_si64(mmD, 4 * BYTE_BIT);
+
+      mmE = _mm_unpacklo_pi8(mmE, mmG);
+      mmD = _mm_unpackhi_pi8(mmD, mmG);
+      mmC = _mm_loadhi_pi8_f(mmA);
+      mmA = _mm_loadlo_pi8_f(mmA);
+
+      mmB = _mm_loadhi_pi8_f(mmE);
+      mmE = _mm_loadlo_pi8_f(mmE);
+
+      mmF = _mm_loadhi_pi8_f(mmD);
+      mmD = _mm_loadlo_pi8_f(mmD);
+
+#else  /* RGB_PIXELSIZE == 4 */
+
+      if (num_cols < 8) {
+        col = num_cols;
+        asm(".set noreorder\r\n"
+
+            "li       $8, 1\r\n"
+            "move     $9, %4\r\n"
+            "and      $10, $9, $8\r\n"
+            "beqz     $10, 1f\r\n"
+            "nop      \r\n"
+            "subu     $9, $9, 1\r\n"
+            PTR_SLL   "$11, $9, 2\r\n"
+            "move     $13, %5\r\n"
+            PTR_ADDU  "$13, $13, $11\r\n"
+            "lwc1     %0, 0($13)\r\n"
+
+            "1:       \r\n"
+            "li       $8, 2\r\n"
+            "and      $10, $9, $8\r\n"
+            "beqz     $10, 2f\r\n"
+            "nop      \r\n"
+            "subu     $9, $9, 2\r\n"
+            PTR_SLL   "$11, $9, 2\r\n"
+            "move     $13, %5\r\n"
+            PTR_ADDU  "$13, $13, $11\r\n"
+            "mov.s    %1, %0\r\n"
+            "ldc1     %0, 0($13)\r\n"
+
+            "2:       \r\n"
+            "li       $8, 4\r\n"
+            "and      $10, $9, $8\r\n"
+            "beqz     $10, 3f\r\n"
+            "nop      \r\n"
+            "mov.s    %2, %0\r\n"
+            "mov.s    %3, %1\r\n"
+            "ldc1     %0, 0(%5)\r\n"
+            "ldc1     %1, 8(%5)\r\n"
+
+            "3:       \r\n"
+            "nop      \r\n"
+            ".set reorder\r\n"
+
+            : "=f" (mmA), "=f" (mmF), "=f" (mmD), "=f" (mmC)
+            : "r" (col), "r" (inptr)
+            : "$f0", "$f2", "$8", "$9", "$10", "$11", "$13", "memory"
+           );
+      } else {
+        if (!(((long)inptr) & 7)) {
+          mmA = _mm_load_si64((__m64 *)&inptr[0]);
+          mmF = _mm_load_si64((__m64 *)&inptr[8]);
+          mmD = _mm_load_si64((__m64 *)&inptr[16]);
+          mmC = _mm_load_si64((__m64 *)&inptr[24]);
+        } else {
+          mmA = _mm_loadu_si64((__m64 *)&inptr[0]);
+          mmF = _mm_loadu_si64((__m64 *)&inptr[8]);
+          mmD = _mm_loadu_si64((__m64 *)&inptr[16]);
+          mmC = _mm_loadu_si64((__m64 *)&inptr[24]);
+        }
+        inptr += RGB_PIXELSIZE * 8;
+      }
+      mmB = _mm_unpackhi_pi8(mmA, mmF);
+      mmA = _mm_unpacklo_pi8(mmA, mmF);
+
+      mmG = _mm_unpackhi_pi8(mmD, mmC);
+      mmD = _mm_unpacklo_pi8(mmD, mmC);
+
+      mmE = _mm_unpackhi_pi16(mmA, mmD);
+      mmA = _mm_unpacklo_pi16(mmA, mmD);
+
+      mmH = _mm_unpackhi_pi16(mmB, mmG);
+      mmB = _mm_unpacklo_pi16(mmB, mmG);
+
+      mmC = _mm_loadhi_pi8_f(mmA);
+      mmA = _mm_loadlo_pi8_f(mmA);
+
+      mmD = _mm_loadhi_pi8_f(mmB);
+      mmB = _mm_loadlo_pi8_f(mmB);
+
+      mmG = _mm_loadhi_pi8_f(mmE);
+      mmE = _mm_loadlo_pi8_f(mmE);
+
+      mmF = _mm_unpacklo_pi8(mmH, mmH);
+      mmH = _mm_unpackhi_pi8(mmH, mmH);
+      mmF = _mm_srli_pi16(mmF, BYTE_BIT);
+      mmH = _mm_srli_pi16(mmH, BYTE_BIT);
+
+#endif
+
+      /* re=(R0 R2 R4 R6), ge=(G0 G2 G4 G6), be=(B0 B2 B4 B6)
+       * ro=(R1 R3 R5 R7), go=(G1 G3 G5 G7), bo=(B1 B3 B5 B7)
+       *
+       * (Original)
+       * Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
+       * Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+       * Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+       *
+       * (This implementation)
+       * Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+       * Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+       * Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+       */
+
+      rglo = _mm_unpacklo_pi16(ro, go);
+      rgho = _mm_unpackhi_pi16(ro, go);
+      ylo_rg = _mm_madd_pi16(rglo, PW_F0299_F0337);
+      yho_rg = _mm_madd_pi16(rgho, PW_F0299_F0337);
+      cblo = _mm_madd_pi16(rglo, PW_MF016_MF033);
+      cbho = _mm_madd_pi16(rgho, PW_MF016_MF033);
+
+      blo = _mm_loadlo_pi16_f(bo);
+      bho = _mm_loadhi_pi16_f(bo);
+      halfblo = _mm_srli_pi32(blo, 1);
+      halfbho = _mm_srli_pi32(bho, 1);
+
+      cblo = _mm_add_pi32(cblo, halfblo);
+      cbho = _mm_add_pi32(cbho, halfbho);
+      cblo = _mm_add_pi32(cblo, PD_ONEHALFM1_CJ);
+      cbho = _mm_add_pi32(cbho, PD_ONEHALFM1_CJ);
+      cblo = _mm_srli_pi32(cblo, SCALEBITS);
+      cbho = _mm_srli_pi32(cbho, SCALEBITS);
+      cbo = _mm_packs_pi32(cblo, cbho);
+
+      rgle = _mm_unpacklo_pi16(re, ge);
+      rghe = _mm_unpackhi_pi16(re, ge);
+      yle_rg = _mm_madd_pi16(rgle, PW_F0299_F0337);
+      yhe_rg = _mm_madd_pi16(rghe, PW_F0299_F0337);
+      cble = _mm_madd_pi16(rgle, PW_MF016_MF033);
+      cbhe = _mm_madd_pi16(rghe, PW_MF016_MF033);
+
+      ble = _mm_loadlo_pi16_f(be);
+      bhe = _mm_loadhi_pi16_f(be);
+      halfble = _mm_srli_pi32(ble, 1);
+      halfbhe = _mm_srli_pi32(bhe, 1);
+
+      cble = _mm_add_pi32(cble, halfble);
+      cbhe = _mm_add_pi32(cbhe, halfbhe);
+      cble = _mm_add_pi32(cble, PD_ONEHALFM1_CJ);
+      cbhe = _mm_add_pi32(cbhe, PD_ONEHALFM1_CJ);
+      cble = _mm_srli_pi32(cble, SCALEBITS);
+      cbhe = _mm_srli_pi32(cbhe, SCALEBITS);
+      cbe = _mm_packs_pi32(cble, cbhe);
+
+      cbo = _mm_slli_pi16(cbo, BYTE_BIT);
+      cb = _mm_or_si64(cbe, cbo);
+
+      bglo = _mm_unpacklo_pi16(bo, go);
+      bgho = _mm_unpackhi_pi16(bo, go);
+      ylo_bg = _mm_madd_pi16(bglo, PW_F0114_F0250);
+      yho_bg = _mm_madd_pi16(bgho, PW_F0114_F0250);
+      crlo = _mm_madd_pi16(bglo, PW_MF008_MF041);
+      crho = _mm_madd_pi16(bgho, PW_MF008_MF041);
+
+      ylo = _mm_add_pi32(ylo_bg, ylo_rg);
+      yho = _mm_add_pi32(yho_bg, yho_rg);
+      ylo = _mm_add_pi32(ylo, PD_ONEHALF);
+      yho = _mm_add_pi32(yho, PD_ONEHALF);
+      ylo = _mm_srli_pi32(ylo, SCALEBITS);
+      yho = _mm_srli_pi32(yho, SCALEBITS);
+      yo = _mm_packs_pi32(ylo, yho);
+
+      rlo = _mm_loadlo_pi16_f(ro);
+      rho = _mm_loadhi_pi16_f(ro);
+      halfrlo = _mm_srli_pi32(rlo, 1);
+      halfrho = _mm_srli_pi32(rho, 1);
+
+      crlo = _mm_add_pi32(crlo, halfrlo);
+      crho = _mm_add_pi32(crho, halfrho);
+      crlo = _mm_add_pi32(crlo, PD_ONEHALFM1_CJ);
+      crho = _mm_add_pi32(crho, PD_ONEHALFM1_CJ);
+      crlo = _mm_srli_pi32(crlo, SCALEBITS);
+      crho = _mm_srli_pi32(crho, SCALEBITS);
+      cro = _mm_packs_pi32(crlo, crho);
+
+      bgle = _mm_unpacklo_pi16(be, ge);
+      bghe = _mm_unpackhi_pi16(be, ge);
+      yle_bg = _mm_madd_pi16(bgle, PW_F0114_F0250);
+      yhe_bg = _mm_madd_pi16(bghe, PW_F0114_F0250);
+      crle = _mm_madd_pi16(bgle, PW_MF008_MF041);
+      crhe = _mm_madd_pi16(bghe, PW_MF008_MF041);
+
+      yle = _mm_add_pi32(yle_bg, yle_rg);
+      yhe = _mm_add_pi32(yhe_bg, yhe_rg);
+      yle = _mm_add_pi32(yle, PD_ONEHALF);
+      yhe = _mm_add_pi32(yhe, PD_ONEHALF);
+      yle = _mm_srli_pi32(yle, SCALEBITS);
+      yhe = _mm_srli_pi32(yhe, SCALEBITS);
+      ye = _mm_packs_pi32(yle, yhe);
+
+      yo = _mm_slli_pi16(yo, BYTE_BIT);
+      y = _mm_or_si64(ye, yo);
+
+      rle = _mm_loadlo_pi16_f(re);
+      rhe = _mm_loadhi_pi16_f(re);
+      halfrle = _mm_srli_pi32(rle, 1);
+      halfrhe = _mm_srli_pi32(rhe, 1);
+
+      crle = _mm_add_pi32(crle, halfrle);
+      crhe = _mm_add_pi32(crhe, halfrhe);
+      crle = _mm_add_pi32(crle, PD_ONEHALFM1_CJ);
+      crhe = _mm_add_pi32(crhe, PD_ONEHALFM1_CJ);
+      crle = _mm_srli_pi32(crle, SCALEBITS);
+      crhe = _mm_srli_pi32(crhe, SCALEBITS);
+      cre = _mm_packs_pi32(crle, crhe);
+
+      cro = _mm_slli_pi16(cro, BYTE_BIT);
+      cr = _mm_or_si64(cre, cro);
+
+      _mm_store_si64((__m64 *)&outptr0[0], y);
+      _mm_store_si64((__m64 *)&outptr1[0], cb);
+      _mm_store_si64((__m64 *)&outptr2[0], cr);
+    }
+  }
+}
+
+#undef mmA
+#undef mmB
+#undef mmC
+#undef mmD
+#undef mmE
+#undef mmF
+#undef mmG
+#undef mmH
diff --git a/media/libjpeg/simd/mips64/jccolor-mmi.c b/media/libjpeg/simd/mips64/jccolor-mmi.c
new file mode 100644
index 0000000000..93ef5c79f7
--- /dev/null
+++ b/media/libjpeg/simd/mips64/jccolor-mmi.c
@@ -0,0 +1,148 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2011, 2014, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
+ *                          All Rights Reserved.
+ * Authors:  ZhuChen     <zhuchen@loongson.cn>
+ *           CaiWanwei   <caiwanwei@loongson.cn>
+ *           SunZhangzhi <sunzhangzhi-cq@loongson.cn>
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* RGB --> YCC CONVERSION */
+
+#include "jsimd_mmi.h"
+
+
+#define F_0_081  ((short)5329)                /* FIX(0.08131) */
+#define F_0_114  ((short)7471)                /* FIX(0.11400) */
+#define F_0_168  ((short)11059)               /* FIX(0.16874) */
+#define F_0_250  ((short)16384)               /* FIX(0.25000) */
+#define F_0_299  ((short)19595)               /* FIX(0.29900) */
+#define F_0_331  ((short)21709)               /* FIX(0.33126) */
+#define F_0_418  ((short)27439)               /* FIX(0.41869) */
+#define F_0_587  ((short)38470)               /* FIX(0.58700) */
+#define F_0_337  ((short)(F_0_587 - F_0_250)) /* FIX(0.58700) - FIX(0.25000) */
+
+enum const_index {
+  index_PD_ONEHALF,
+  index_PW_F0299_F0337,
+  index_PW_F0114_F0250,
+  index_PW_MF016_MF033,
+  index_PW_MF008_MF041,
+  index_PD_ONEHALFM1_CJ
+};
+
+static uint64_t const_value[] = {
+  _uint64_set_pi32((int)(1 << (SCALEBITS - 1)), (int)(1 << (SCALEBITS - 1))),
+  _uint64_set_pi16(F_0_337, F_0_299, F_0_337, F_0_299),
+  _uint64_set_pi16(F_0_250, F_0_114, F_0_250, F_0_114),
+  _uint64_set_pi16(-F_0_331, -F_0_168, -F_0_331, -F_0_168),
+  _uint64_set_pi16(-F_0_418, -F_0_081, -F_0_418, -F_0_081),
+  _uint64_set_pi32(((1 << (SCALEBITS - 1)) - 1 + (CENTERJSAMPLE << SCALEBITS)),
+                   ((1 << (SCALEBITS - 1)) - 1 + (CENTERJSAMPLE << SCALEBITS)))
+};
+
+#define get_const_value(index)  (*(__m64 *)&const_value[index])
+
+#define PD_ONEHALF       get_const_value(index_PD_ONEHALF)
+#define PW_F0299_F0337   get_const_value(index_PW_F0299_F0337)
+#define PW_F0114_F0250   get_const_value(index_PW_F0114_F0250)
+#define PW_MF016_MF033   get_const_value(index_PW_MF016_MF033)
+#define PW_MF008_MF041   get_const_value(index_PW_MF008_MF041)
+#define PD_ONEHALFM1_CJ  get_const_value(index_PD_ONEHALFM1_CJ)
+
+
+#include "jccolext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+
+#define RGB_RED  EXT_RGB_RED
+#define RGB_GREEN  EXT_RGB_GREEN
+#define RGB_BLUE  EXT_RGB_BLUE
+#define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+#define jsimd_rgb_ycc_convert_mmi  jsimd_extrgb_ycc_convert_mmi
+#include "jccolext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_mmi
+
+#define RGB_RED  EXT_RGBX_RED
+#define RGB_GREEN  EXT_RGBX_GREEN
+#define RGB_BLUE  EXT_RGBX_BLUE
+#define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+#define jsimd_rgb_ycc_convert_mmi  jsimd_extrgbx_ycc_convert_mmi
+#include "jccolext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_mmi
+
+#define RGB_RED  EXT_BGR_RED
+#define RGB_GREEN  EXT_BGR_GREEN
+#define RGB_BLUE  EXT_BGR_BLUE
+#define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+#define jsimd_rgb_ycc_convert_mmi  jsimd_extbgr_ycc_convert_mmi
+#include "jccolext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_mmi
+
+#define RGB_RED  EXT_BGRX_RED
+#define RGB_GREEN  EXT_BGRX_GREEN
+#define RGB_BLUE  EXT_BGRX_BLUE
+#define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+#define jsimd_rgb_ycc_convert_mmi  jsimd_extbgrx_ycc_convert_mmi
+#include "jccolext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_mmi
+
+#define RGB_RED  EXT_XBGR_RED
+#define RGB_GREEN  EXT_XBGR_GREEN
+#define RGB_BLUE  EXT_XBGR_BLUE
+#define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+#define jsimd_rgb_ycc_convert_mmi  jsimd_extxbgr_ycc_convert_mmi
+#include "jccolext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_mmi
+
+#define RGB_RED  EXT_XRGB_RED
+#define RGB_GREEN  EXT_XRGB_GREEN
+#define RGB_BLUE  EXT_XRGB_BLUE
+#define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+#define jsimd_rgb_ycc_convert_mmi  jsimd_extxrgb_ycc_convert_mmi
+#include "jccolext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_mmi
diff --git a/media/libjpeg/simd/mips64/jcgray-mmi.c b/media/libjpeg/simd/mips64/jcgray-mmi.c
new file mode 100644
index 0000000000..9c7b833f2e
--- /dev/null
+++ b/media/libjpeg/simd/mips64/jcgray-mmi.c
@@ -0,0 +1,132 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2011, 2014, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
+ *                          All Rights Reserved.
+ * Authors:  ZhangLixia <zhanglixia-hf@loongson.cn>
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* RGB --> GRAYSCALE CONVERSION */
+
+#include "jsimd_mmi.h"
+
+
+#define F_0_114  ((short)7471)                /* FIX(0.11400) */
+#define F_0_250  ((short)16384)               /* FIX(0.25000) */
+#define F_0_299  ((short)19595)               /* FIX(0.29900) */
+#define F_0_587  ((short)38470)               /* FIX(0.58700) */
+#define F_0_337  ((short)(F_0_587 - F_0_250)) /* FIX(0.58700) - FIX(0.25000) */
+
+enum const_index {
+  index_PD_ONEHALF,
+  index_PW_F0299_F0337,
+  index_PW_F0114_F0250
+};
+
+static uint64_t const_value[] = {
+  _uint64_set_pi32((int)(1 << (SCALEBITS - 1)), (int)(1 << (SCALEBITS - 1))),
+  _uint64_set_pi16(F_0_337, F_0_299, F_0_337, F_0_299),
+  _uint64_set_pi16(F_0_250, F_0_114, F_0_250, F_0_114)
+};
+
+#define get_const_value(index)  (*(__m64 *)&const_value[index])
+
+#define PD_ONEHALF       get_const_value(index_PD_ONEHALF)
+#define PW_F0299_F0337   get_const_value(index_PW_F0299_F0337)
+#define PW_F0114_F0250   get_const_value(index_PW_F0114_F0250)
+
+
+#include "jcgryext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+
+#define RGB_RED  EXT_RGB_RED
+#define RGB_GREEN  EXT_RGB_GREEN
+#define RGB_BLUE  EXT_RGB_BLUE
+#define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+#define jsimd_rgb_gray_convert_mmi  jsimd_extrgb_gray_convert_mmi
+#include "jcgryext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_mmi
+
+#define RGB_RED  EXT_RGBX_RED
+#define RGB_GREEN  EXT_RGBX_GREEN
+#define RGB_BLUE  EXT_RGBX_BLUE
+#define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+#define jsimd_rgb_gray_convert_mmi  jsimd_extrgbx_gray_convert_mmi
+#include "jcgryext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_mmi
+
+#define RGB_RED  EXT_BGR_RED
+#define RGB_GREEN  EXT_BGR_GREEN
+#define RGB_BLUE  EXT_BGR_BLUE
+#define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+#define jsimd_rgb_gray_convert_mmi  jsimd_extbgr_gray_convert_mmi
+#include "jcgryext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_mmi
+
+#define RGB_RED  EXT_BGRX_RED
+#define RGB_GREEN  EXT_BGRX_GREEN
+#define RGB_BLUE  EXT_BGRX_BLUE
+#define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+#define jsimd_rgb_gray_convert_mmi  jsimd_extbgrx_gray_convert_mmi
+#include "jcgryext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_mmi
+
+#define RGB_RED  EXT_XBGR_RED
+#define RGB_GREEN  EXT_XBGR_GREEN
+#define RGB_BLUE  EXT_XBGR_BLUE
+#define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+#define jsimd_rgb_gray_convert_mmi  jsimd_extxbgr_gray_convert_mmi
+#include "jcgryext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_mmi
+
+#define RGB_RED  EXT_XRGB_RED
+#define RGB_GREEN  EXT_XRGB_GREEN
+#define RGB_BLUE  EXT_XRGB_BLUE
+#define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+#define jsimd_rgb_gray_convert_mmi  jsimd_extxrgb_gray_convert_mmi
+#include "jcgryext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_mmi
diff --git a/media/libjpeg/simd/mips64/jcgryext-mmi.c b/media/libjpeg/simd/mips64/jcgryext-mmi.c
new file mode 100644
index 0000000000..08a83d6699
--- /dev/null
+++ b/media/libjpeg/simd/mips64/jcgryext-mmi.c
@@ -0,0 +1,374 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2014-2015, 2019, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
+ *                          All Rights Reserved.
+ * Authors:  ZhangLixia <zhanglixia-hf@loongson.cn>
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jcgray-mmi.c */
+
+
+#if RGB_RED == 0
+#define mmA  re
+#define mmB  ro
+#elif RGB_GREEN == 0
+#define mmA  ge
+#define mmB  go
+#elif RGB_BLUE == 0
+#define mmA  be
+#define mmB  bo
+#else
+#define mmA  xe
+#define mmB  xo
+#endif
+
+#if RGB_RED == 1
+#define mmC  re
+#define mmD  ro
+#elif RGB_GREEN == 1
+#define mmC  ge
+#define mmD  go
+#elif RGB_BLUE == 1
+#define mmC  be
+#define mmD  bo
+#else
+#define mmC  xe
+#define mmD  xo
+#endif
+
+#if RGB_RED == 2
+#define mmE  re
+#define mmF  ro
+#elif RGB_GREEN == 2
+#define mmE  ge
+#define mmF  go
+#elif RGB_BLUE == 2
+#define mmE  be
+#define mmF  bo
+#else
+#define mmE  xe
+#define mmF  xo
+#endif
+
+#if RGB_RED == 3
+#define mmG  re
+#define mmH  ro
+#elif RGB_GREEN == 3
+#define mmG  ge
+#define mmH  go
+#elif RGB_BLUE == 3
+#define mmG  be
+#define mmH  bo
+#else
+#define mmG  xe
+#define mmH  xo
+#endif
+
+
+void jsimd_rgb_gray_convert_mmi(JDIMENSION image_width, JSAMPARRAY input_buf,
+                                JSAMPIMAGE output_buf, JDIMENSION output_row,
+                                int num_rows)
+{
+  JSAMPROW inptr, outptr;
+  int num_cols, col;
+  __m64 re, ro, ge, go, be, bo, xe;
+#if RGB_PIXELSIZE == 4
+  __m64 xo;
+#endif
+  __m64 rgle, rghe, rglo, rgho, bgle, bghe, bglo, bgho;
+  __m64 yle_rg, yhe_rg, yle_bg, yhe_bg, yle, yhe, ye;
+  __m64 ylo_rg, yho_rg, ylo_bg, yho_bg, ylo, yho, yo, y;
+
+  while (--num_rows >= 0) {
+    inptr = *input_buf++;
+    outptr = output_buf[0][output_row];
+    output_row++;
+
+    for (num_cols = image_width; num_cols > 0; num_cols -= 8,
+         outptr += 8) {
+
+#if RGB_PIXELSIZE == 3
+
+      if (num_cols < 8) {
+        col = num_cols * 3;
+        asm(".set noreorder\r\n"
+
+            "li       $8, 1\r\n"
+            "move     $9, %3\r\n"
+            "and      $10, $9, $8\r\n"
+            "beqz     $10, 1f\r\n"
+            "nop      \r\n"
+            "subu     $9, $9, 1\r\n"
+            "xor      $12, $12, $12\r\n"
+            "move     $13, %5\r\n"
+            PTR_ADDU  "$13, $13, $9\r\n"
+            "lbu      $12, 0($13)\r\n"
+
+            "1:       \r\n"
+            "li       $8, 2\r\n"
+            "and      $10, $9, $8\r\n"
+            "beqz     $10, 2f\r\n"
+            "nop      \r\n"
+            "subu     $9, $9, 2\r\n"
+            "xor      $11, $11, $11\r\n"
+            "move     $13, %5\r\n"
+            PTR_ADDU  "$13, $13, $9\r\n"
+            "lhu      $11, 0($13)\r\n"
+            "sll      $12, $12, 16\r\n"
+            "or       $12, $12, $11\r\n"
+
+            "2:       \r\n"
+            "dmtc1    $12, %0\r\n"
+            "li       $8, 4\r\n"
+            "and      $10, $9, $8\r\n"
+            "beqz     $10, 3f\r\n"
+            "nop      \r\n"
+            "subu     $9, $9, 4\r\n"
+            "move     $13, %5\r\n"
+            PTR_ADDU  "$13, $13, $9\r\n"
+            "lwu      $14, 0($13)\r\n"
+            "dmtc1    $14, %1\r\n"
+            "dsll32   $12, $12, 0\r\n"
+            "or       $12, $12, $14\r\n"
+            "dmtc1    $12, %0\r\n"
+
+            "3:       \r\n"
+            "li       $8, 8\r\n"
+            "and      $10, $9, $8\r\n"
+            "beqz     $10, 4f\r\n"
+            "nop      \r\n"
+            "mov.s    %1, %0\r\n"
+            "ldc1     %0, 0(%5)\r\n"
+            "li       $9, 8\r\n"
+            "j        5f\r\n"
+            "nop      \r\n"
+
+            "4:       \r\n"
+            "li       $8, 16\r\n"
+            "and      $10, $9, $8\r\n"
+            "beqz     $10, 5f\r\n"
+            "nop      \r\n"
+            "mov.s    %2, %0\r\n"
+            "ldc1     %0, 0(%5)\r\n"
+            "ldc1     %1, 8(%5)\r\n"
+
+            "5:       \r\n"
+            "nop      \r\n"
+            ".set reorder\r\n"
+
+            : "=f" (mmA), "=f" (mmG), "=f" (mmF)
+            : "r" (col), "r" (num_rows), "r" (inptr)
+            : "$f0", "$f2", "$f4", "$8", "$9", "$10", "$11", "$12", "$13",
+              "$14", "memory"
+           );
+      } else {
+        if (!(((long)inptr) & 7)) {
+          mmA = _mm_load_si64((__m64 *)&inptr[0]);
+          mmG = _mm_load_si64((__m64 *)&inptr[8]);
+          mmF = _mm_load_si64((__m64 *)&inptr[16]);
+        } else {
+          mmA = _mm_loadu_si64((__m64 *)&inptr[0]);
+          mmG = _mm_loadu_si64((__m64 *)&inptr[8]);
+          mmF = _mm_loadu_si64((__m64 *)&inptr[16]);
+        }
+        inptr += RGB_PIXELSIZE * 8;
+      }
+      mmD = _mm_srli_si64(mmA, 4 * BYTE_BIT);
+      mmA = _mm_slli_si64(mmA, 4 * BYTE_BIT);
+
+      mmA = _mm_unpackhi_pi8(mmA, mmG);
+      mmG = _mm_slli_si64(mmG, 4 * BYTE_BIT);
+
+      mmD = _mm_unpacklo_pi8(mmD, mmF);
+      mmG = _mm_unpackhi_pi8(mmG, mmF);
+
+      mmE = _mm_srli_si64(mmA, 4 * BYTE_BIT);
+      mmA = _mm_slli_si64(mmA, 4 * BYTE_BIT);
+
+      mmA = _mm_unpackhi_pi8(mmA, mmD);
+      mmD = _mm_slli_si64(mmD, 4 * BYTE_BIT);
+
+      mmE = _mm_unpacklo_pi8(mmE, mmG);
+      mmD = _mm_unpackhi_pi8(mmD, mmG);
+      mmC = _mm_loadhi_pi8_f(mmA);
+      mmA = _mm_loadlo_pi8_f(mmA);
+
+      mmB = _mm_loadhi_pi8_f(mmE);
+      mmE = _mm_loadlo_pi8_f(mmE);
+
+      mmF = _mm_loadhi_pi8_f(mmD);
+      mmD = _mm_loadlo_pi8_f(mmD);
+
+#else  /* RGB_PIXELSIZE == 4 */
+
+      if (num_cols < 8) {
+        col = num_cols;
+        asm(".set noreorder\r\n"
+
+            "li       $8, 1\r\n"
+            "move     $9, %4\r\n"
+            "and      $10, $9, $8\r\n"
+            "beqz     $10, 1f\r\n"
+            "nop      \r\n"
+            "subu     $9, $9, 1\r\n"
+            PTR_SLL   "$11, $9, 2\r\n"
+            "move     $13, %5\r\n"
+            PTR_ADDU  "$13, $13, $11\r\n"
+            "lwc1     %0, 0($13)\r\n"
+
+            "1:       \r\n"
+            "li       $8, 2\r\n"
+            "and      $10, $9, $8\r\n"
+            "beqz     $10, 2f\r\n"
+            "nop      \r\n"
+            "subu     $9, $9, 2\r\n"
+            PTR_SLL   "$11, $9, 2\r\n"
+            "move     $13, %5\r\n"
+            PTR_ADDU  "$13, $13, $11\r\n"
+            "mov.s    %1, %0\r\n"
+            "ldc1     %0, 0($13)\r\n"
+
+            "2:       \r\n"
+            "li       $8, 4\r\n"
+            "and      $10, $9, $8\r\n"
+            "beqz     $10, 3f\r\n"
+            "nop      \r\n"
+            "mov.s    %2, %0\r\n"
+            "mov.s    %3, %1\r\n"
+            "ldc1     %0, 0(%5)\r\n"
+            "ldc1     %1, 8(%5)\r\n"
+
+            "3:       \r\n"
+            "nop      \r\n"
+            ".set reorder\r\n"
+
+            : "=f" (mmA), "=f" (mmF), "=f" (mmD), "=f" (mmC)
+            : "r" (col), "r" (inptr)
+            : "$f0", "$f2", "$8", "$9", "$10", "$11", "$13", "memory"
+           );
+      } else {
+        if (!(((long)inptr) & 7)) {
+          mmA = _mm_load_si64((__m64 *)&inptr[0]);
+          mmF = _mm_load_si64((__m64 *)&inptr[8]);
+          mmD = _mm_load_si64((__m64 *)&inptr[16]);
+          mmC = _mm_load_si64((__m64 *)&inptr[24]);
+        } else {
+          mmA = _mm_loadu_si64((__m64 *)&inptr[0]);
+          mmF = _mm_loadu_si64((__m64 *)&inptr[8]);
+          mmD = _mm_loadu_si64((__m64 *)&inptr[16]);
+          mmC = _mm_loadu_si64((__m64 *)&inptr[24]);
+        }
+        inptr += RGB_PIXELSIZE * 8;
+      }
+      mmB = _mm_unpackhi_pi8(mmA, mmF);
+      mmA = _mm_unpacklo_pi8(mmA, mmF);
+
+      mmG = _mm_unpackhi_pi8(mmD, mmC);
+      mmD = _mm_unpacklo_pi8(mmD, mmC);
+
+      mmE = _mm_unpackhi_pi16(mmA, mmD);
+      mmA = _mm_unpacklo_pi16(mmA, mmD);
+
+      mmH = _mm_unpackhi_pi16(mmB, mmG);
+      mmB = _mm_unpacklo_pi16(mmB, mmG);
+
+      mmC = _mm_loadhi_pi8_f(mmA);
+      mmA = _mm_loadlo_pi8_f(mmA);
+
+      mmD = _mm_loadhi_pi8_f(mmB);
+      mmB = _mm_loadlo_pi8_f(mmB);
+
+      mmG = _mm_loadhi_pi8_f(mmE);
+      mmE = _mm_loadlo_pi8_f(mmE);
+
+      mmF = _mm_unpacklo_pi8(mmH, mmH);
+      mmH = _mm_unpackhi_pi8(mmH, mmH);
+      mmF = _mm_srli_pi16(mmF, BYTE_BIT);
+      mmH = _mm_srli_pi16(mmH, BYTE_BIT);
+
+#endif
+
+      /* re=(R0 R2 R4 R6), ge=(G0 G2 G4 G6), be=(B0 B2 B4 B6)
+       * ro=(R1 R3 R5 R7), go=(G1 G3 G5 G7), bo=(B1 B3 B5 B7)
+       *
+       * (Original)
+       * Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
+       *
+       * (This implementation)
+       * Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+       */
+
+      rglo = _mm_unpacklo_pi16(ro, go);
+      rgho = _mm_unpackhi_pi16(ro, go);
+      ylo_rg = _mm_madd_pi16(rglo, PW_F0299_F0337);
+      yho_rg = _mm_madd_pi16(rgho, PW_F0299_F0337);
+
+      rgle = _mm_unpacklo_pi16(re, ge);
+      rghe = _mm_unpackhi_pi16(re, ge);
+      yle_rg = _mm_madd_pi16(rgle, PW_F0299_F0337);
+      yhe_rg = _mm_madd_pi16(rghe, PW_F0299_F0337);
+
+      bglo = _mm_unpacklo_pi16(bo, go);
+      bgho = _mm_unpackhi_pi16(bo, go);
+      ylo_bg = _mm_madd_pi16(bglo, PW_F0114_F0250);
+      yho_bg = _mm_madd_pi16(bgho, PW_F0114_F0250);
+
+      ylo = _mm_add_pi32(ylo_bg, ylo_rg);
+      yho = _mm_add_pi32(yho_bg, yho_rg);
+      ylo = _mm_add_pi32(ylo, PD_ONEHALF);
+      yho = _mm_add_pi32(yho, PD_ONEHALF);
+      ylo = _mm_srli_pi32(ylo, SCALEBITS);
+      yho = _mm_srli_pi32(yho, SCALEBITS);
+      yo = _mm_packs_pi32(ylo, yho);
+
+      bgle = _mm_unpacklo_pi16(be, ge);
+      bghe = _mm_unpackhi_pi16(be, ge);
+      yle_bg = _mm_madd_pi16(bgle, PW_F0114_F0250);
+      yhe_bg = _mm_madd_pi16(bghe, PW_F0114_F0250);
+
+      yle = _mm_add_pi32(yle_bg, yle_rg);
+      yhe = _mm_add_pi32(yhe_bg, yhe_rg);
+      yle = _mm_add_pi32(yle, PD_ONEHALF);
+      yhe = _mm_add_pi32(yhe, PD_ONEHALF);
+      yle = _mm_srli_pi32(yle, SCALEBITS);
+      yhe = _mm_srli_pi32(yhe, SCALEBITS);
+      ye = _mm_packs_pi32(yle, yhe);
+
+      yo = _mm_slli_pi16(yo, BYTE_BIT);
+      y = _mm_or_si64(ye, yo);
+
+      _mm_store_si64((__m64 *)&outptr[0], y);
+    }
+  }
+}
+
+#undef mmA
+#undef mmB
+#undef mmC
+#undef mmD
+#undef mmE
+#undef mmF
+#undef mmG
+#undef mmH
diff --git a/media/libjpeg/simd/mips64/jcsample-mmi.c b/media/libjpeg/simd/mips64/jcsample-mmi.c
new file mode 100644
index 0000000000..0354dac087
--- /dev/null
+++ b/media/libjpeg/simd/mips64/jcsample-mmi.c
@@ -0,0 +1,98 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2015, 2018-2019, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
+ *                          All Rights Reserved.
+ * Authors:  ZhuChen     <zhuchen@loongson.cn>
+ *           CaiWanwei   <caiwanwei@loongson.cn>
+ *           SunZhangzhi <sunzhangzhi-cq@loongson.cn>
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* CHROMA DOWNSAMPLING */
+
+#include "jsimd_mmi.h"
+#include "jcsample.h"
+
+
+void jsimd_h2v2_downsample_mmi(JDIMENSION image_width, int max_v_samp_factor,
+                               JDIMENSION v_samp_factor,
+                               JDIMENSION width_in_blocks,
+                               JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  int inrow, outrow, outcol;
+  JDIMENSION output_cols = width_in_blocks * DCTSIZE;
+  JSAMPROW inptr0, inptr1, outptr;
+  __m64 bias, mask = 0.0, thisavg, nextavg, avg;
+  __m64 this0o, this0e, this0, this0sum, next0o, next0e, next0, next0sum;
+  __m64 this1o, this1e, this1, this1sum, next1o, next1e, next1, next1sum;
+
+  expand_right_edge(input_data, max_v_samp_factor, image_width,
+                    output_cols * 2);
+
+  bias = _mm_set1_pi32((1 << 17) + 1);   /* 0x00020001 (32-bit bias pattern) */
+                                         /* bias={1, 2, 1, 2} (16-bit) */
+  mask = _mm_cmpeq_pi16(mask, mask);
+  mask = _mm_srli_pi16(mask, BYTE_BIT);  /* {0xFF 0x00 0xFF 0x00 ..} */
+
+  for (inrow = 0, outrow = 0; outrow < v_samp_factor;
+       inrow += 2, outrow++) {
+
+    inptr0 = input_data[inrow];
+    inptr1 = input_data[inrow + 1];
+    outptr = output_data[outrow];
+
+    for (outcol = output_cols; outcol > 0;
+         outcol -= 8, inptr0 += 16, inptr1 += 16, outptr += 8) {
+
+      this0 = _mm_load_si64((__m64 *)&inptr0[0]);
+      this1 = _mm_load_si64((__m64 *)&inptr1[0]);
+      next0 = _mm_load_si64((__m64 *)&inptr0[8]);
+      next1 = _mm_load_si64((__m64 *)&inptr1[8]);
+
+      this0o = _mm_and_si64(this0, mask);
+      this0e = _mm_srli_pi16(this0, BYTE_BIT);
+      this1o = _mm_and_si64(this1, mask);
+      this1e = _mm_srli_pi16(this1, BYTE_BIT);
+      this0sum = _mm_add_pi16(this0o, this0e);
+      this1sum = _mm_add_pi16(this1o, this1e);
+
+      next0o = _mm_and_si64(next0, mask);
+      next0e = _mm_srli_pi16(next0, BYTE_BIT);
+      next1o = _mm_and_si64(next1, mask);
+      next1e = _mm_srli_pi16(next1, BYTE_BIT);
+      next0sum = _mm_add_pi16(next0o, next0e);
+      next1sum = _mm_add_pi16(next1o, next1e);
+
+      thisavg = _mm_add_pi16(this0sum, this1sum);
+      nextavg = _mm_add_pi16(next0sum, next1sum);
+      thisavg = _mm_add_pi16(thisavg, bias);
+      nextavg = _mm_add_pi16(nextavg, bias);
+      thisavg = _mm_srli_pi16(thisavg, 2);
+      nextavg = _mm_srli_pi16(nextavg, 2);
+
+      avg = _mm_packs_pu16(thisavg, nextavg);
+
+      _mm_store_si64((__m64 *)&outptr[0], avg);
+    }
+  }
+}
diff --git a/media/libjpeg/simd/mips64/jcsample.h b/media/libjpeg/simd/mips64/jcsample.h
new file mode 100644
index 0000000000..bd07fcc4ed
--- /dev/null
+++ b/media/libjpeg/simd/mips64/jcsample.h
@@ -0,0 +1,28 @@
+/*
+ * jcsample.h
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1991-1996, Thomas G. Lane.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ */
+
+LOCAL(void)
+expand_right_edge(JSAMPARRAY image_data, int num_rows, JDIMENSION input_cols,
+                  JDIMENSION output_cols)
+{
+  register JSAMPROW ptr;
+  register JSAMPLE pixval;
+  register int count;
+  int row;
+  int numcols = (int)(output_cols - input_cols);
+
+  if (numcols > 0) {
+    for (row = 0; row < num_rows; row++) {
+      ptr = image_data[row] + input_cols;
+      pixval = ptr[-1];
+      for (count = numcols; count > 0; count--)
+        *ptr++ = pixval;
+    }
+  }
+}
diff --git a/media/libjpeg/simd/mips64/jdcolext-mmi.c b/media/libjpeg/simd/mips64/jdcolext-mmi.c
new file mode 100644
index 0000000000..3b5b2f2030
--- /dev/null
+++ b/media/libjpeg/simd/mips64/jdcolext-mmi.c
@@ -0,0 +1,415 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2015, 2019, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
+ *                          All Rights Reserved.
+ * Authors:  ZhuChen     <zhuchen@loongson.cn>
+ *           SunZhangzhi <sunzhangzhi-cq@loongson.cn>
+ *           CaiWanwei   <caiwanwei@loongson.cn>
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jdcolor-mmi.c */
+
+
+#if RGB_RED == 0
+#define mmA  re
+#define mmB  ro
+#elif RGB_GREEN == 0
+#define mmA  ge
+#define mmB  go
+#elif RGB_BLUE == 0
+#define mmA  be
+#define mmB  bo
+#else
+#define mmA  xe
+#define mmB  xo
+#endif
+
+#if RGB_RED == 1
+#define mmC  re
+#define mmD  ro
+#elif RGB_GREEN == 1
+#define mmC  ge
+#define mmD  go
+#elif RGB_BLUE == 1
+#define mmC  be
+#define mmD  bo
+#else
+#define mmC  xe
+#define mmD  xo
+#endif
+
+#if RGB_RED == 2
+#define mmE  re
+#define mmF  ro
+#elif RGB_GREEN == 2
+#define mmE  ge
+#define mmF  go
+#elif RGB_BLUE == 2
+#define mmE  be
+#define mmF  bo
+#else
+#define mmE  xe
+#define mmF  xo
+#endif
+
+#if RGB_RED == 3
+#define mmG  re
+#define mmH  ro
+#elif RGB_GREEN == 3
+#define mmG  ge
+#define mmH  go
+#elif RGB_BLUE == 3
+#define mmG  be
+#define mmH  bo
+#else
+#define mmG  xe
+#define mmH  xo
+#endif
+
+
+void jsimd_ycc_rgb_convert_mmi(JDIMENSION out_width, JSAMPIMAGE input_buf,
+                               JDIMENSION input_row, JSAMPARRAY output_buf,
+                               int num_rows)
+{
+  JSAMPROW outptr, inptr0, inptr1, inptr2;
+  int num_cols, col;
+  __m64 ye, yo, y, cbe, cbe2, cbo, cbo2, cb, cre, cre2, cro, cro2, cr;
+  __m64 re, ro, gle, ghe, ge, glo, gho, go, be, bo, xe = 0.0, xo = 0.0;
+  __m64 decenter, mask;
+
+  while (--num_rows >= 0) {
+    inptr0 = input_buf[0][input_row];
+    inptr1 = input_buf[1][input_row];
+    inptr2 = input_buf[2][input_row];
+    input_row++;
+    outptr = *output_buf++;
+
+    for (num_cols = out_width; num_cols > 0; num_cols -= 8,
+         inptr0 += 8, inptr1 += 8, inptr2 += 8) {
+
+      cb = _mm_load_si64((__m64 *)inptr1);
+      cr = _mm_load_si64((__m64 *)inptr2);
+      y = _mm_load_si64((__m64 *)inptr0);
+
+      mask = decenter = 0.0;
+      mask = _mm_cmpeq_pi16(mask, mask);
+      decenter = _mm_cmpeq_pi16(decenter, decenter);
+      mask = _mm_srli_pi16(mask, BYTE_BIT);   /* {0xFF 0x00 0xFF 0x00 ..} */
+      decenter = _mm_slli_pi16(decenter, 7);  /* {0xFF80 0xFF80 0xFF80 0xFF80} */
+
+      cbe = _mm_and_si64(mask, cb);           /* Cb(0246) */
+      cbo = _mm_srli_pi16(cb, BYTE_BIT);      /* Cb(1357) */
+      cre = _mm_and_si64(mask, cr);           /* Cr(0246) */
+      cro = _mm_srli_pi16(cr, BYTE_BIT);      /* Cr(1357) */
+      cbe = _mm_add_pi16(cbe, decenter);
+      cbo = _mm_add_pi16(cbo, decenter);
+      cre = _mm_add_pi16(cre, decenter);
+      cro = _mm_add_pi16(cro, decenter);
+
+      /* (Original)
+       * R = Y                + 1.40200 * Cr
+       * G = Y - 0.34414 * Cb - 0.71414 * Cr
+       * B = Y + 1.77200 * Cb
+       *
+       * (This implementation)
+       * R = Y                + 0.40200 * Cr + Cr
+       * G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+       * B = Y - 0.22800 * Cb + Cb + Cb
+       */
+
+      cbe2 = _mm_add_pi16(cbe, cbe);          /* 2*CbE */
+      cbo2 = _mm_add_pi16(cbo, cbo);          /* 2*CbO */
+      cre2 = _mm_add_pi16(cre, cre);          /* 2*CrE */
+      cro2 = _mm_add_pi16(cro, cro);          /* 2*CrO */
+
+      be = _mm_mulhi_pi16(cbe2, PW_MF0228);   /* (2*CbE * -FIX(0.22800) */
+      bo = _mm_mulhi_pi16(cbo2, PW_MF0228);   /* (2*CbO * -FIX(0.22800) */
+      re = _mm_mulhi_pi16(cre2, PW_F0402);    /* (2*CrE * FIX(0.40200)) */
+      ro = _mm_mulhi_pi16(cro2, PW_F0402);    /* (2*CrO * FIX(0.40200)) */
+
+      be = _mm_add_pi16(be, PW_ONE);
+      bo = _mm_add_pi16(bo, PW_ONE);
+      be = _mm_srai_pi16(be, 1);              /* (CbE * -FIX(0.22800)) */
+      bo = _mm_srai_pi16(bo, 1);              /* (CbO * -FIX(0.22800)) */
+      re = _mm_add_pi16(re, PW_ONE);
+      ro = _mm_add_pi16(ro, PW_ONE);
+      re = _mm_srai_pi16(re, 1);              /* (CrE * FIX(0.40200)) */
+      ro = _mm_srai_pi16(ro, 1);              /* (CrO * FIX(0.40200)) */
+
+      be = _mm_add_pi16(be, cbe);
+      bo = _mm_add_pi16(bo, cbo);
+      be = _mm_add_pi16(be, cbe);             /* (CbE * FIX(1.77200))=(B-Y)E */
+      bo = _mm_add_pi16(bo, cbo);             /* (CbO * FIX(1.77200))=(B-Y)O */
+      re = _mm_add_pi16(re, cre);             /* (CrE * FIX(1.40200))=(R-Y)E */
+      ro = _mm_add_pi16(ro, cro);             /* (CrO * FIX(1.40200))=(R-Y)O */
+
+      gle = _mm_unpacklo_pi16(cbe, cre);
+      ghe = _mm_unpackhi_pi16(cbe, cre);
+      gle = _mm_madd_pi16(gle, PW_MF0344_F0285);
+      ghe = _mm_madd_pi16(ghe, PW_MF0344_F0285);
+      glo = _mm_unpacklo_pi16(cbo, cro);
+      gho = _mm_unpackhi_pi16(cbo, cro);
+      glo = _mm_madd_pi16(glo, PW_MF0344_F0285);
+      gho = _mm_madd_pi16(gho, PW_MF0344_F0285);
+
+      gle = _mm_add_pi32(gle, PD_ONEHALF);
+      ghe = _mm_add_pi32(ghe, PD_ONEHALF);
+      gle = _mm_srai_pi32(gle, SCALEBITS);
+      ghe = _mm_srai_pi32(ghe, SCALEBITS);
+      glo = _mm_add_pi32(glo, PD_ONEHALF);
+      gho = _mm_add_pi32(gho, PD_ONEHALF);
+      glo = _mm_srai_pi32(glo, SCALEBITS);
+      gho = _mm_srai_pi32(gho, SCALEBITS);
+
+      ge = _mm_packs_pi32(gle, ghe);       /* CbE*-FIX(0.344)+CrE*FIX(0.285) */
+      go = _mm_packs_pi32(glo, gho);       /* CbO*-FIX(0.344)+CrO*FIX(0.285) */
+      ge = _mm_sub_pi16(ge, cre);  /* CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E */
+      go = _mm_sub_pi16(go, cro);  /* CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O */
+
+      ye = _mm_and_si64(mask, y);             /* Y(0246) */
+      yo = _mm_srli_pi16(y, BYTE_BIT);        /* Y(1357) */
+
+      re = _mm_add_pi16(re, ye);              /* ((R-Y)E+YE)=(R0 R2 R4 R6) */
+      ro = _mm_add_pi16(ro, yo);              /* ((R-Y)O+YO)=(R1 R3 R5 R7) */
+      re = _mm_packs_pu16(re, re);            /* (R0 R2 R4 R6 ** ** ** **) */
+      ro = _mm_packs_pu16(ro, ro);            /* (R1 R3 R5 R7 ** ** ** **) */
+
+      ge = _mm_add_pi16(ge, ye);              /* ((G-Y)E+YE)=(G0 G2 G4 G6) */
+      go = _mm_add_pi16(go, yo);              /* ((G-Y)O+YO)=(G1 G3 G5 G7) */
+      ge = _mm_packs_pu16(ge, ge);            /* (G0 G2 G4 G6 ** ** ** **) */
+      go = _mm_packs_pu16(go, go);            /* (G1 G3 G5 G7 ** ** ** **) */
+
+      be = _mm_add_pi16(be, ye);              /* (YE+(B-Y)E)=(B0 B2 B4 B6) */
+      bo = _mm_add_pi16(bo, yo);              /* (YO+(B-Y)O)=(B1 B3 B5 B7) */
+      be = _mm_packs_pu16(be, be);            /* (B0 B2 B4 B6 ** ** ** **) */
+      bo = _mm_packs_pu16(bo, bo);            /* (B1 B3 B5 B7 ** ** ** **) */
+
+#if RGB_PIXELSIZE == 3
+
+      /* mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **) */
+      /* mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **) */
+      mmA = _mm_unpacklo_pi8(mmA, mmC);       /* (00 10 02 12 04 14 06 16) */
+      mmE = _mm_unpacklo_pi8(mmE, mmB);       /* (20 01 22 03 24 05 26 07) */
+      mmD = _mm_unpacklo_pi8(mmD, mmF);       /* (11 21 13 23 15 25 17 27) */
+
+      mmH = _mm_srli_si64(mmA, 2 * BYTE_BIT);
+
+      mmG = _mm_unpackhi_pi16(mmA, mmE);      /* (04 14 24 05 06 16 26 07) */
+      mmA = _mm_unpacklo_pi16(mmA, mmE);      /* (00 10 20 01 02 12 22 03) */
+
+      mmE = _mm_srli_si64(mmE, 2 * BYTE_BIT);
+      mmB = _mm_srli_si64(mmD, 2 * BYTE_BIT);  /* (13 23 15 25 17 27 -- --) */
+
+      mmC = _mm_unpackhi_pi16(mmD, mmH);      /* (15 25 06 16 17 27 -- --) */
+      mmD = _mm_unpacklo_pi16(mmD, mmH);      /* (11 21 02 12 13 23 04 14) */
+
+      mmF = _mm_unpackhi_pi16(mmE, mmB);      /* (26 07 17 27 -- -- -- --) */
+      mmE = _mm_unpacklo_pi16(mmE, mmB);      /* (22 03 13 23 24 05 15 25) */
+
+      mmA = _mm_unpacklo_pi32(mmA, mmD);      /* (00 10 20 01 11 21 02 12) */
+      mmE = _mm_unpacklo_pi32(mmE, mmG);      /* (22 03 13 23 04 14 24 05) */
+      mmC = _mm_unpacklo_pi32(mmC, mmF);      /* (15 25 06 16 26 07 17 27) */
+
+      if (num_cols >= 8) {
+        if (!(((long)outptr) & 7)) {
+          _mm_store_si64((__m64 *)outptr, mmA);
+          _mm_store_si64((__m64 *)(outptr + 8), mmE);
+          _mm_store_si64((__m64 *)(outptr + 16), mmC);
+        } else {
+          _mm_storeu_si64((__m64 *)outptr, mmA);
+          _mm_storeu_si64((__m64 *)(outptr + 8), mmE);
+          _mm_storeu_si64((__m64 *)(outptr + 16), mmC);
+        }
+        outptr += RGB_PIXELSIZE * 8;
+      } else {
+        col = num_cols * 3;
+        asm(".set noreorder\r\n"
+
+            "li       $8, 16\r\n"
+            "move     $9, %4\r\n"
+            "mov.s    $f4, %1\r\n"
+            "mov.s    $f6, %3\r\n"
+            "move     $10, %5\r\n"
+            "bltu     $9, $8, 1f\r\n"
+            "nop      \r\n"
+            "gssdlc1  $f4, 7($10)\r\n"
+            "gssdrc1  $f4, 0($10)\r\n"
+            "gssdlc1  $f6, 7+8($10)\r\n"
+            "gssdrc1  $f6, 8($10)\r\n"
+            "mov.s    $f4, %2\r\n"
+            "subu     $9, $9, 16\r\n"
+            PTR_ADDU  "$10, $10, 16\r\n"
+            "b        2f\r\n"
+            "nop      \r\n"
+
+            "1:       \r\n"
+            "li       $8, 8\r\n"              /* st8 */
+            "bltu     $9, $8, 2f\r\n"
+            "nop      \r\n"
+            "gssdlc1  $f4, 7($10)\r\n"
+            "gssdrc1  $f4, 0($10)\r\n"
+            "mov.s    $f4, %3\r\n"
+            "subu     $9, $9, 8\r\n"
+            PTR_ADDU  "$10, $10, 8\r\n"
+
+            "2:       \r\n"
+            "li       $8, 4\r\n"              /* st4 */
+            "mfc1     $11, $f4\r\n"
+            "bltu     $9, $8, 3f\r\n"
+            "nop      \r\n"
+            "swl      $11, 3($10)\r\n"
+            "swr      $11, 0($10)\r\n"
+            "li       $8, 32\r\n"
+            "mtc1     $8, $f6\r\n"
+            "dsrl     $f4, $f4, $f6\r\n"
+            "mfc1     $11, $f4\r\n"
+            "subu     $9, $9, 4\r\n"
+            PTR_ADDU  "$10, $10, 4\r\n"
+
+            "3:       \r\n"
+            "li       $8, 2\r\n"              /* st2 */
+            "bltu     $9, $8, 4f\r\n"
+            "nop      \r\n"
+            "ush      $11, 0($10)\r\n"
+            "srl      $11, 16\r\n"
+            "subu     $9, $9, 2\r\n"
+            PTR_ADDU  "$10, $10, 2\r\n"
+
+            "4:       \r\n"
+            "li       $8, 1\r\n"              /* st1 */
+            "bltu     $9, $8, 5f\r\n"
+            "nop      \r\n"
+            "sb       $11, 0($10)\r\n"
+
+            "5:       \r\n"
+            "nop      \r\n"                   /* end */
+            : "=m" (*outptr)
+            : "f" (mmA), "f" (mmC), "f" (mmE), "r" (col), "r" (outptr)
+            : "$f4", "$f6", "$8", "$9", "$10", "$11", "memory"
+           );
+      }
+
+#else  /* RGB_PIXELSIZE == 4 */
+
+#ifdef RGBX_FILLER_0XFF
+      xe = _mm_cmpeq_pi8(xe, xe);
+      xo = _mm_cmpeq_pi8(xo, xo);
+#else
+      xe = _mm_xor_si64(xe, xe);
+      xo = _mm_xor_si64(xo, xo);
+#endif
+      /* mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **) */
+      /* mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **) */
+      /* mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **) */
+      /* mmG=(30 32 34 36 ** ** ** **), mmH=(31 33 35 37 ** ** ** **) */
+
+      mmA = _mm_unpacklo_pi8(mmA, mmC);       /* (00 10 02 12 04 14 06 16) */
+      mmE = _mm_unpacklo_pi8(mmE, mmG);       /* (20 30 22 32 24 34 26 36) */
+      mmB = _mm_unpacklo_pi8(mmB, mmD);       /* (01 11 03 13 05 15 07 17) */
+      mmF = _mm_unpacklo_pi8(mmF, mmH);       /* (21 31 23 33 25 35 27 37) */
+
+      mmC = _mm_unpackhi_pi16(mmA, mmE);      /* (04 14 24 34 06 16 26 36) */
+      mmA = _mm_unpacklo_pi16(mmA, mmE);      /* (00 10 20 30 02 12 22 32) */
+      mmG = _mm_unpackhi_pi16(mmB, mmF);      /* (05 15 25 35 07 17 27 37) */
+      mmB = _mm_unpacklo_pi16(mmB, mmF);      /* (01 11 21 31 03 13 23 33) */
+
+      mmD = _mm_unpackhi_pi32(mmA, mmB);      /* (02 12 22 32 03 13 23 33) */
+      mmA = _mm_unpacklo_pi32(mmA, mmB);      /* (00 10 20 30 01 11 21 31) */
+      mmH = _mm_unpackhi_pi32(mmC, mmG);      /* (06 16 26 36 07 17 27 37) */
+      mmC = _mm_unpacklo_pi32(mmC, mmG);      /* (04 14 24 34 05 15 25 35) */
+
+      if (num_cols >= 8) {
+        if (!(((long)outptr) & 7)) {
+          _mm_store_si64((__m64 *)outptr, mmA);
+          _mm_store_si64((__m64 *)(outptr + 8), mmD);
+          _mm_store_si64((__m64 *)(outptr + 16), mmC);
+          _mm_store_si64((__m64 *)(outptr + 24), mmH);
+        } else {
+          _mm_storeu_si64((__m64 *)outptr, mmA);
+          _mm_storeu_si64((__m64 *)(outptr + 8), mmD);
+          _mm_storeu_si64((__m64 *)(outptr + 16), mmC);
+          _mm_storeu_si64((__m64 *)(outptr + 24), mmH);
+        }
+        outptr += RGB_PIXELSIZE * 8;
+      } else {
+        col = num_cols;
+        asm(".set noreorder\r\n"              /* st16 */
+
+            "li       $8, 4\r\n"
+            "move     $9, %6\r\n"
+            "move     $10, %7\r\n"
+            "mov.s    $f4, %2\r\n"
+            "mov.s    $f6, %4\r\n"
+            "bltu     $9, $8, 1f\r\n"
+            "nop      \r\n"
+            "gssdlc1  $f4, 7($10)\r\n"
+            "gssdrc1  $f4, 0($10)\r\n"
+            "gssdlc1  $f6, 7+8($10)\r\n"
+            "gssdrc1  $f6, 8($10)\r\n"
+            "mov.s    $f4, %3\r\n"
+            "mov.s    $f6, %5\r\n"
+            "subu     $9, $9, 4\r\n"
+            PTR_ADDU  "$10, $10, 16\r\n"
+
+            "1:       \r\n"
+            "li       $8, 2\r\n"              /* st8 */
+            "bltu     $9, $8, 2f\r\n"
+            "nop      \r\n"
+            "gssdlc1  $f4, 7($10)\r\n"
+            "gssdrc1  $f4, 0($10)\r\n"
+            "mov.s    $f4, $f6\r\n"
+            "subu     $9, $9, 2\r\n"
+            PTR_ADDU  "$10, $10, 8\r\n"
+
+            "2:       \r\n"
+            "li       $8, 1\r\n"              /* st4 */
+            "bltu     $9, $8, 3f\r\n"
+            "nop      \r\n"
+            "gsswlc1  $f4, 3($10)\r\n"
+            "gsswrc1  $f4, 0($10)\r\n"
+
+            "3:       \r\n"
+            "li       %1, 0\r\n"              /* end */
+            : "=m" (*outptr), "=r" (col)
+            : "f" (mmA), "f" (mmC), "f" (mmD), "f" (mmH), "r" (col),
+              "r" (outptr)
+            : "$f4", "$f6", "$8", "$9", "$10", "memory"
+           );
+      }
+
+#endif
+
+    }
+  }
+}
+
+#undef mmA
+#undef mmB
+#undef mmC
+#undef mmD
+#undef mmE
+#undef mmF
+#undef mmG
+#undef mmH
diff --git a/media/libjpeg/simd/mips64/jdcolor-mmi.c b/media/libjpeg/simd/mips64/jdcolor-mmi.c
new file mode 100644
index 0000000000..2c58263dbd
--- /dev/null
+++ b/media/libjpeg/simd/mips64/jdcolor-mmi.c
@@ -0,0 +1,139 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2011, 2015, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
+ *                          All Rights Reserved.
+ * Authors:  ZhuChen     <zhuchen@loongson.cn>
+ *           CaiWanwei   <caiwanwei@loongson.cn>
+ *           SunZhangzhi <sunzhangzhi-cq@loongson.cn>
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* YCC --> RGB CONVERSION */
+
+#include "jsimd_mmi.h"
+
+
+#define F_0_344  ((short)22554)  /* FIX(0.34414) */
+#define F_0_402  ((short)26345)  /* FIX(1.40200) - FIX(1) */
+#define F_0_285  ((short)18734)  /* FIX(1) - FIX(0.71414) */
+#define F_0_228  ((short)14942)  /* FIX(2) - FIX(1.77200) */
+
+enum const_index {
+  index_PW_ONE,
+  index_PW_F0402,
+  index_PW_MF0228,
+  index_PW_MF0344_F0285,
+  index_PD_ONEHALF
+};
+
+static uint64_t const_value[] = {
+  _uint64_set_pi16(1, 1, 1, 1),
+  _uint64_set_pi16(F_0_402, F_0_402, F_0_402, F_0_402),
+  _uint64_set_pi16(-F_0_228, -F_0_228, -F_0_228, -F_0_228),
+  _uint64_set_pi16(F_0_285, -F_0_344, F_0_285, -F_0_344),
+  _uint64_set_pi32((int)(1 << (SCALEBITS - 1)), (int)(1 << (SCALEBITS - 1)))
+};
+
+#define PW_ONE           get_const_value(index_PW_ONE)
+#define PW_F0402         get_const_value(index_PW_F0402)
+#define PW_MF0228        get_const_value(index_PW_MF0228)
+#define PW_MF0344_F0285  get_const_value(index_PW_MF0344_F0285)
+#define PD_ONEHALF       get_const_value(index_PD_ONEHALF)
+
+#define RGBX_FILLER_0XFF  1
+
+
+#include "jdcolext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+
+#define RGB_RED  EXT_RGB_RED
+#define RGB_GREEN  EXT_RGB_GREEN
+#define RGB_BLUE  EXT_RGB_BLUE
+#define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+#define jsimd_ycc_rgb_convert_mmi  jsimd_ycc_extrgb_convert_mmi
+#include "jdcolext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_mmi
+
+#define RGB_RED  EXT_RGBX_RED
+#define RGB_GREEN  EXT_RGBX_GREEN
+#define RGB_BLUE  EXT_RGBX_BLUE
+#define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+#define jsimd_ycc_rgb_convert_mmi  jsimd_ycc_extrgbx_convert_mmi
+#include "jdcolext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_mmi
+
+#define RGB_RED  EXT_BGR_RED
+#define RGB_GREEN  EXT_BGR_GREEN
+#define RGB_BLUE  EXT_BGR_BLUE
+#define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+#define jsimd_ycc_rgb_convert_mmi  jsimd_ycc_extbgr_convert_mmi
+#include "jdcolext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_mmi
+
+#define RGB_RED  EXT_BGRX_RED
+#define RGB_GREEN  EXT_BGRX_GREEN
+#define RGB_BLUE  EXT_BGRX_BLUE
+#define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+#define jsimd_ycc_rgb_convert_mmi  jsimd_ycc_extbgrx_convert_mmi
+#include "jdcolext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_mmi
+
+#define RGB_RED  EXT_XBGR_RED
+#define RGB_GREEN  EXT_XBGR_GREEN
+#define RGB_BLUE  EXT_XBGR_BLUE
+#define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+#define jsimd_ycc_rgb_convert_mmi  jsimd_ycc_extxbgr_convert_mmi
+#include "jdcolext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_mmi
+
+#define RGB_RED  EXT_XRGB_RED
+#define RGB_GREEN  EXT_XRGB_GREEN
+#define RGB_BLUE  EXT_XRGB_BLUE
+#define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+#define jsimd_ycc_rgb_convert_mmi  jsimd_ycc_extxrgb_convert_mmi
+#include "jdcolext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_mmi
diff --git a/media/libjpeg/simd/mips64/jdmerge-mmi.c b/media/libjpeg/simd/mips64/jdmerge-mmi.c
new file mode 100644
index 0000000000..0a39bd5680
--- /dev/null
+++ b/media/libjpeg/simd/mips64/jdmerge-mmi.c
@@ -0,0 +1,149 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2011, 2015, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
+ *                          All Rights Reserved.
+ * Authors:  ZhangLixia <zhanglixia-hf@loongson.cn>
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* YCC --> RGB CONVERSION */
+
+#include "jsimd_mmi.h"
+
+
+#define F_0_344  ((short)22554)  /* FIX(0.34414) */
+#define F_0_402  ((short)26345)  /* FIX(1.40200) - FIX(1) */
+#define F_0_285  ((short)18734)  /* FIX(1) - FIX(0.71414) */
+#define F_0_228  ((short)14942)  /* FIX(2) - FIX(1.77200) */
+
+enum const_index {
+  index_PW_ONE,
+  index_PW_F0402,
+  index_PW_MF0228,
+  index_PW_MF0344_F0285,
+  index_PD_ONEHALF
+};
+
+static uint64_t const_value[] = {
+  _uint64_set_pi16(1, 1, 1, 1),
+  _uint64_set_pi16(F_0_402, F_0_402, F_0_402, F_0_402),
+  _uint64_set_pi16(-F_0_228, -F_0_228, -F_0_228, -F_0_228),
+  _uint64_set_pi16(F_0_285, -F_0_344, F_0_285, -F_0_344),
+  _uint64_set_pi32((int)(1 << (SCALEBITS - 1)), (int)(1 << (SCALEBITS - 1)))
+};
+
+#define PW_ONE           get_const_value(index_PW_ONE)
+#define PW_F0402         get_const_value(index_PW_F0402)
+#define PW_MF0228        get_const_value(index_PW_MF0228)
+#define PW_MF0344_F0285  get_const_value(index_PW_MF0344_F0285)
+#define PD_ONEHALF       get_const_value(index_PD_ONEHALF)
+
+#define RGBX_FILLER_0XFF  1
+
+
+#include "jdmrgext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+
+#define RGB_RED  EXT_RGB_RED
+#define RGB_GREEN  EXT_RGB_GREEN
+#define RGB_BLUE  EXT_RGB_BLUE
+#define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_mmi  jsimd_h2v1_extrgb_merged_upsample_mmi
+#define jsimd_h2v2_merged_upsample_mmi  jsimd_h2v2_extrgb_merged_upsample_mmi
+#include "jdmrgext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_mmi
+#undef jsimd_h2v2_merged_upsample_mmi
+
+#define RGB_RED  EXT_RGBX_RED
+#define RGB_GREEN  EXT_RGBX_GREEN
+#define RGB_BLUE  EXT_RGBX_BLUE
+#define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_mmi  jsimd_h2v1_extrgbx_merged_upsample_mmi
+#define jsimd_h2v2_merged_upsample_mmi  jsimd_h2v2_extrgbx_merged_upsample_mmi
+#include "jdmrgext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_mmi
+#undef jsimd_h2v2_merged_upsample_mmi
+
+#define RGB_RED  EXT_BGR_RED
+#define RGB_GREEN  EXT_BGR_GREEN
+#define RGB_BLUE  EXT_BGR_BLUE
+#define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_mmi  jsimd_h2v1_extbgr_merged_upsample_mmi
+#define jsimd_h2v2_merged_upsample_mmi  jsimd_h2v2_extbgr_merged_upsample_mmi
+#include "jdmrgext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_mmi
+#undef jsimd_h2v2_merged_upsample_mmi
+
+#define RGB_RED  EXT_BGRX_RED
+#define RGB_GREEN  EXT_BGRX_GREEN
+#define RGB_BLUE  EXT_BGRX_BLUE
+#define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_mmi  jsimd_h2v1_extbgrx_merged_upsample_mmi
+#define jsimd_h2v2_merged_upsample_mmi  jsimd_h2v2_extbgrx_merged_upsample_mmi
+#include "jdmrgext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_mmi
+#undef jsimd_h2v2_merged_upsample_mmi
+
+#define RGB_RED  EXT_XBGR_RED
+#define RGB_GREEN  EXT_XBGR_GREEN
+#define RGB_BLUE  EXT_XBGR_BLUE
+#define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_mmi  jsimd_h2v1_extxbgr_merged_upsample_mmi
+#define jsimd_h2v2_merged_upsample_mmi  jsimd_h2v2_extxbgr_merged_upsample_mmi
+#include "jdmrgext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_mmi
+#undef jsimd_h2v2_merged_upsample_mmi
+
+#define RGB_RED  EXT_XRGB_RED
+#define RGB_GREEN  EXT_XRGB_GREEN
+#define RGB_BLUE  EXT_XRGB_BLUE
+#define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_mmi  jsimd_h2v1_extxrgb_merged_upsample_mmi
+#define jsimd_h2v2_merged_upsample_mmi  jsimd_h2v2_extxrgb_merged_upsample_mmi
+#include "jdmrgext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_mmi
+#undef jsimd_h2v2_merged_upsample_mmi
diff --git a/media/libjpeg/simd/mips64/jdmrgext-mmi.c b/media/libjpeg/simd/mips64/jdmrgext-mmi.c
new file mode 100644
index 0000000000..be09ff2a65
--- /dev/null
+++ b/media/libjpeg/simd/mips64/jdmrgext-mmi.c
@@ -0,0 +1,615 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2015, 2019, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
+ *                          All Rights Reserved.
+ * Authors:  ZhangLixia <zhanglixia-hf@loongson.cn>
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jdmerge-mmi.c */
+
+
+#if RGB_RED == 0
+#define mmA  re
+#define mmB  ro
+#elif RGB_GREEN == 0
+#define mmA  ge
+#define mmB  go
+#elif RGB_BLUE == 0
+#define mmA  be
+#define mmB  bo
+#else
+#define mmA  xe
+#define mmB  xo
+#endif
+
+#if RGB_RED == 1
+#define mmC  re
+#define mmD  ro
+#elif RGB_GREEN == 1
+#define mmC  ge
+#define mmD  go
+#elif RGB_BLUE == 1
+#define mmC  be
+#define mmD  bo
+#else
+#define mmC  xe
+#define mmD  xo
+#endif
+
+#if RGB_RED == 2
+#define mmE  re
+#define mmF  ro
+#elif RGB_GREEN == 2
+#define mmE  ge
+#define mmF  go
+#elif RGB_BLUE == 2
+#define mmE  be
+#define mmF  bo
+#else
+#define mmE  xe
+#define mmF  xo
+#endif
+
+#if RGB_RED == 3
+#define mmG  re
+#define mmH  ro
+#elif RGB_GREEN == 3
+#define mmG  ge
+#define mmH  go
+#elif RGB_BLUE == 3
+#define mmG  be
+#define mmH  bo
+#else
+#define mmG  xe
+#define mmH  xo
+#endif
+
+
+void jsimd_h2v1_merged_upsample_mmi(JDIMENSION output_width,
+                                    JSAMPIMAGE input_buf,
+                                    JDIMENSION in_row_group_ctr,
+                                    JSAMPARRAY output_buf)
+{
+  JSAMPROW outptr, inptr0, inptr1, inptr2;
+  int num_cols, col;
+  __m64 ythise, ythiso, ythis, ynexte, ynexto, ynext, yl, y;
+  __m64 cbl, cbl2, cbh, cbh2, cb, crl, crl2, crh, crh2, cr;
+  __m64 rle, rlo, rl, rhe, rho, rh, re, ro;
+  __m64 ga, gb, gle, glo, gl, gc, gd, ghe, gho, gh, ge, go;
+  __m64 ble, blo, bl, bhe, bho, bh, be, bo, xe = 0.0, xo = 0.0;
+  __m64 decenter, mask, zero = 0.0;
+#if RGB_PIXELSIZE == 4
+  __m64 mm8, mm9;
+#endif
+
+  inptr0 = input_buf[0][in_row_group_ctr];
+  inptr1 = input_buf[1][in_row_group_ctr];
+  inptr2 = input_buf[2][in_row_group_ctr];
+  outptr = output_buf[0];
+
+  for (num_cols = output_width >> 1; num_cols > 0; num_cols -= 8,
+       inptr0 += 16, inptr1 += 8, inptr2 += 8) {
+
+    cb = _mm_load_si64((__m64 *)inptr1);
+    cr = _mm_load_si64((__m64 *)inptr2);
+    ythis = _mm_load_si64((__m64 *)inptr0);
+    ynext = _mm_load_si64((__m64 *)inptr0 + 1);
+
+    mask = decenter = 0.0;
+    mask = _mm_cmpeq_pi16(mask, mask);
+    decenter = _mm_cmpeq_pi16(decenter, decenter);
+    mask = _mm_srli_pi16(mask, BYTE_BIT);   /* {0xFF 0x00 0xFF 0x00 ..} */
+    decenter = _mm_slli_pi16(decenter, 7);  /* {0xFF80 0xFF80 0xFF80 0xFF80} */
+
+    cbl = _mm_unpacklo_pi8(cb, zero);         /* Cb(0123) */
+    cbh = _mm_unpackhi_pi8(cb, zero);         /* Cb(4567) */
+    crl = _mm_unpacklo_pi8(cr, zero);         /* Cr(0123) */
+    crh = _mm_unpackhi_pi8(cr, zero);         /* Cr(4567) */
+    cbl = _mm_add_pi16(cbl, decenter);
+    cbh = _mm_add_pi16(cbh, decenter);
+    crl = _mm_add_pi16(crl, decenter);
+    crh = _mm_add_pi16(crh, decenter);
+
+    /* (Original)
+     * R = Y                + 1.40200 * Cr
+     * G = Y - 0.34414 * Cb - 0.71414 * Cr
+     * B = Y + 1.77200 * Cb
+     *
+     * (This implementation)
+     * R = Y                + 0.40200 * Cr + Cr
+     * G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+     * B = Y - 0.22800 * Cb + Cb + Cb
+     */
+
+    cbl2 = _mm_add_pi16(cbl, cbl);            /* 2*CbL */
+    cbh2 = _mm_add_pi16(cbh, cbh);            /* 2*CbH */
+    crl2 = _mm_add_pi16(crl, crl);            /* 2*CrL */
+    crh2 = _mm_add_pi16(crh, crh);            /* 2*CrH */
+
+    bl = _mm_mulhi_pi16(cbl2, PW_MF0228);     /* (2*CbL * -FIX(0.22800) */
+    bh = _mm_mulhi_pi16(cbh2, PW_MF0228);     /* (2*CbH * -FIX(0.22800) */
+    rl = _mm_mulhi_pi16(crl2, PW_F0402);      /* (2*CrL * FIX(0.40200)) */
+    rh = _mm_mulhi_pi16(crh2, PW_F0402);      /* (2*CrH * FIX(0.40200)) */
+
+    bl = _mm_add_pi16(bl, PW_ONE);
+    bh = _mm_add_pi16(bh, PW_ONE);
+    bl = _mm_srai_pi16(bl, 1);                /* (CbL * -FIX(0.22800)) */
+    bh = _mm_srai_pi16(bh, 1);                /* (CbH * -FIX(0.22800)) */
+    rl = _mm_add_pi16(rl, PW_ONE);
+    rh = _mm_add_pi16(rh, PW_ONE);
+    rl = _mm_srai_pi16(rl, 1);                /* (CrL * FIX(0.40200)) */
+    rh = _mm_srai_pi16(rh, 1);                /* (CrH * FIX(0.40200)) */
+
+    bl = _mm_add_pi16(bl, cbl);
+    bh = _mm_add_pi16(bh, cbh);
+    bl = _mm_add_pi16(bl, cbl);               /* (CbL * FIX(1.77200))=(B-Y)L */
+    bh = _mm_add_pi16(bh, cbh);               /* (CbH * FIX(1.77200))=(B-Y)H */
+    rl = _mm_add_pi16(rl, crl);               /* (CrL * FIX(1.40200))=(R-Y)L */
+    rh = _mm_add_pi16(rh, crh);               /* (CrH * FIX(1.40200))=(R-Y)H */
+
+    ga = _mm_unpacklo_pi16(cbl, crl);
+    gb = _mm_unpackhi_pi16(cbl, crl);
+    ga = _mm_madd_pi16(ga, PW_MF0344_F0285);
+    gb = _mm_madd_pi16(gb, PW_MF0344_F0285);
+    gc = _mm_unpacklo_pi16(cbh, crh);
+    gd = _mm_unpackhi_pi16(cbh, crh);
+    gc = _mm_madd_pi16(gc, PW_MF0344_F0285);
+    gd = _mm_madd_pi16(gd, PW_MF0344_F0285);
+
+    ga = _mm_add_pi32(ga, PD_ONEHALF);
+    gb = _mm_add_pi32(gb, PD_ONEHALF);
+    ga = _mm_srai_pi32(ga, SCALEBITS);
+    gb = _mm_srai_pi32(gb, SCALEBITS);
+    gc = _mm_add_pi32(gc, PD_ONEHALF);
+    gd = _mm_add_pi32(gd, PD_ONEHALF);
+    gc = _mm_srai_pi32(gc, SCALEBITS);
+    gd = _mm_srai_pi32(gd, SCALEBITS);
+
+    gl = _mm_packs_pi32(ga, gb);           /* CbL*-FIX(0.344)+CrL*FIX(0.285) */
+    gh = _mm_packs_pi32(gc, gd);           /* CbH*-FIX(0.344)+CrH*FIX(0.285) */
+    gl = _mm_sub_pi16(gl, crl);    /* CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L */
+    gh = _mm_sub_pi16(gh, crh);    /* CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H */
+
+    ythise = _mm_and_si64(mask, ythis);       /* Y(0246) */
+    ythiso = _mm_srli_pi16(ythis, BYTE_BIT);  /* Y(1357) */
+    ynexte = _mm_and_si64(mask, ynext);       /* Y(8ACE) */
+    ynexto = _mm_srli_pi16(ynext, BYTE_BIT);  /* Y(9BDF) */
+
+    rle = _mm_add_pi16(rl, ythise);           /* (R0 R2 R4 R6) */
+    rlo = _mm_add_pi16(rl, ythiso);           /* (R1 R3 R5 R7) */
+    rhe = _mm_add_pi16(rh, ynexte);           /* (R8 RA RC RE) */
+    rho = _mm_add_pi16(rh, ynexto);           /* (R9 RB RD RF) */
+    re = _mm_packs_pu16(rle, rhe);            /* (R0 R2 R4 R6 R8 RA RC RE) */
+    ro = _mm_packs_pu16(rlo, rho);            /* (R1 R3 R5 R7 R9 RB RD RF) */
+
+    gle = _mm_add_pi16(gl, ythise);           /* (G0 G2 G4 G6) */
+    glo = _mm_add_pi16(gl, ythiso);           /* (G1 G3 G5 G7) */
+    ghe = _mm_add_pi16(gh, ynexte);           /* (G8 GA GC GE) */
+    gho = _mm_add_pi16(gh, ynexto);           /* (G9 GB GD GF) */
+    ge = _mm_packs_pu16(gle, ghe);            /* (G0 G2 G4 G6 G8 GA GC GE) */
+    go = _mm_packs_pu16(glo, gho);            /* (G1 G3 G5 G7 G9 GB GD GF) */
+
+    ble = _mm_add_pi16(bl, ythise);           /* (B0 B2 B4 B6) */
+    blo = _mm_add_pi16(bl, ythiso);           /* (B1 B3 B5 B7) */
+    bhe = _mm_add_pi16(bh, ynexte);           /* (B8 BA BC BE) */
+    bho = _mm_add_pi16(bh, ynexto);           /* (B9 BB BD BF) */
+    be = _mm_packs_pu16(ble, bhe);            /* (B0 B2 B4 B6 B8 BA BC BE) */
+    bo = _mm_packs_pu16(blo, bho);            /* (B1 B3 B5 B7 B9 BB BD BF) */
+
+#if RGB_PIXELSIZE == 3
+
+    /* mmA=(00 02 04 06 08 0A 0C 0E), mmB=(01 03 05 07 09 0B 0D 0F) */
+    /* mmC=(10 12 14 16 18 1A 1C 1E), mmD=(11 13 15 17 19 1B 1D 1F) */
+    /* mmE=(20 22 24 26 28 2A 2C 2E), mmF=(21 23 25 27 29 2B 2D 2F) */
+    mmG = _mm_unpacklo_pi8(mmA, mmC);         /* (00 10 02 12 04 14 06 16) */
+    mmA = _mm_unpackhi_pi8(mmA, mmC);         /* (08 18 0A 1A 0C 1C 0E 1E) */
+    mmH = _mm_unpacklo_pi8(mmE, mmB);         /* (20 01 22 03 24 05 26 07) */
+    mmE = _mm_unpackhi_pi8(mmE, mmB);         /* (28 09 2A 0B 2C 0D 2E 0F) */
+    mmC = _mm_unpacklo_pi8(mmD, mmF);         /* (11 21 13 23 15 25 17 27) */
+    mmD = _mm_unpackhi_pi8(mmD, mmF);         /* (19 29 1B 2B 1D 2D 1F 2F) */
+
+    mmB = _mm_unpacklo_pi16(mmG, mmA);        /* (00 10 08 18 02 12 0A 1A) */
+    mmA = _mm_unpackhi_pi16(mmG, mmA);        /* (04 14 0C 1C 06 16 0E 1E) */
+    mmF = _mm_unpacklo_pi16(mmH, mmE);        /* (20 01 28 09 22 03 2A 0B) */
+    mmE = _mm_unpackhi_pi16(mmH, mmE);        /* (24 05 2C 0D 26 07 2E 0F) */
+    mmH = _mm_unpacklo_pi16(mmC, mmD);        /* (11 21 19 29 13 23 1B 2B) */
+    mmG = _mm_unpackhi_pi16(mmC, mmD);        /* (15 25 1D 2D 17 27 1F 2F) */
+
+    mmC = _mm_unpacklo_pi16(mmB, mmF);        /* (00 10 20 01 08 18 28 09) */
+    mmB = _mm_srli_si64(mmB, 4 * BYTE_BIT);
+    mmB = _mm_unpacklo_pi16(mmH, mmB);        /* (11 21 02 12 19 29 0A 1A) */
+    mmD = _mm_unpackhi_pi16(mmF, mmH);        /* (22 03 13 23 2A 0B 1B 2B) */
+    mmF = _mm_unpacklo_pi16(mmA, mmE);        /* (04 14 24 05 0C 1C 2C 0D) */
+    mmA = _mm_srli_si64(mmA, 4 * BYTE_BIT);
+    mmH = _mm_unpacklo_pi16(mmG, mmA);        /* (15 25 06 16 1D 2D 0E 1E) */
+    mmG = _mm_unpackhi_pi16(mmE, mmG);        /* (26 07 17 27 2E 0F 1F 2F) */
+
+    mmA = _mm_unpacklo_pi32(mmC, mmB);        /* (00 10 20 01 11 21 02 12) */
+    mmE = _mm_unpackhi_pi32(mmC, mmB);        /* (08 18 28 09 19 29 0A 1A) */
+    mmB = _mm_unpacklo_pi32(mmD, mmF);        /* (22 03 13 23 04 14 24 05) */
+    mmF = _mm_unpackhi_pi32(mmD, mmF);        /* (2A 0B 1B 2B 0C 1C 2C 0D) */
+    mmC = _mm_unpacklo_pi32(mmH, mmG);        /* (15 25 06 16 26 07 17 27) */
+    mmG = _mm_unpackhi_pi32(mmH, mmG);        /* (1D 2D 0E 1E 2E 0F 1F 2F) */
+
+    if (num_cols >= 8) {
+      if (!(((long)outptr) & 7)) {
+        _mm_store_si64((__m64 *)outptr, mmA);
+        _mm_store_si64((__m64 *)(outptr + 8), mmB);
+        _mm_store_si64((__m64 *)(outptr + 16), mmC);
+        _mm_store_si64((__m64 *)(outptr + 24), mmE);
+        _mm_store_si64((__m64 *)(outptr + 32), mmF);
+        _mm_store_si64((__m64 *)(outptr + 40), mmG);
+      } else {
+        _mm_storeu_si64((__m64 *)outptr, mmA);
+        _mm_storeu_si64((__m64 *)(outptr + 8), mmB);
+        _mm_storeu_si64((__m64 *)(outptr + 16), mmC);
+        _mm_storeu_si64((__m64 *)(outptr + 24), mmE);
+        _mm_storeu_si64((__m64 *)(outptr + 32), mmF);
+        _mm_storeu_si64((__m64 *)(outptr + 40), mmG);
+      }
+      outptr += RGB_PIXELSIZE * 16;
+    } else {
+      if (output_width & 1)
+        col = num_cols * 6 + 3;
+      else
+        col = num_cols * 6;
+
+      asm(".set noreorder\r\n"                /* st24 */
+
+          "li       $8, 24\r\n"
+          "move     $9, %7\r\n"
+          "mov.s    $f4, %1\r\n"
+          "mov.s    $f6, %2\r\n"
+          "mov.s    $f8, %3\r\n"
+          "move     $10, %8\r\n"
+          "bltu     $9, $8, 1f\r\n"
+          "nop      \r\n"
+          "gssdlc1  $f4, 7($10)\r\n"
+          "gssdrc1  $f4, 0($10)\r\n"
+          "gssdlc1  $f6, 7+8($10)\r\n"
+          "gssdrc1  $f6, 8($10)\r\n"
+          "gssdlc1  $f8, 7+16($10)\r\n"
+          "gssdrc1  $f8, 16($10)\r\n"
+          "mov.s    $f4, %4\r\n"
+          "mov.s    $f6, %5\r\n"
+          "mov.s    $f8, %6\r\n"
+          "subu     $9, $9, 24\r\n"
+          PTR_ADDU  "$10, $10, 24\r\n"
+
+          "1:       \r\n"
+          "li       $8, 16\r\n"               /* st16 */
+          "bltu     $9, $8, 2f\r\n"
+          "nop      \r\n"
+          "gssdlc1  $f4, 7($10)\r\n"
+          "gssdrc1  $f4, 0($10)\r\n"
+          "gssdlc1  $f6, 7+8($10)\r\n"
+          "gssdrc1  $f6, 8($10)\r\n"
+          "mov.s    $f4, $f8\r\n"
+          "subu     $9, $9, 16\r\n"
+          PTR_ADDU  "$10, $10, 16\r\n"
+
+          "2:       \r\n"
+          "li       $8,  8\r\n"               /* st8 */
+          "bltu     $9, $8, 3f\r\n"
+          "nop      \r\n"
+          "gssdlc1  $f4, 7($10)\r\n"
+          "gssdrc1  $f4, 0($10)\r\n"
+          "mov.s    $f4, $f6\r\n"
+          "subu     $9, $9, 8\r\n"
+          PTR_ADDU  "$10, $10, 8\r\n"
+
+          "3:       \r\n"
+          "li       $8,  4\r\n"               /* st4 */
+          "mfc1     $11, $f4\r\n"
+          "bltu     $9, $8, 4f\r\n"
+          "nop      \r\n"
+          "swl      $11, 3($10)\r\n"
+          "swr      $11, 0($10)\r\n"
+          "li       $8, 32\r\n"
+          "mtc1     $8, $f6\r\n"
+          "dsrl     $f4, $f4, $f6\r\n"
+          "mfc1     $11, $f4\r\n"
+          "subu     $9, $9, 4\r\n"
+          PTR_ADDU  "$10, $10, 4\r\n"
+
+          "4:       \r\n"
+          "li       $8, 2\r\n"                /* st2 */
+          "bltu     $9, $8, 5f\r\n"
+          "nop      \r\n"
+          "ush      $11, 0($10)\r\n"
+          "srl      $11, 16\r\n"
+          "subu     $9, $9, 2\r\n"
+          PTR_ADDU  "$10, $10, 2\r\n"
+
+          "5:       \r\n"
+          "li       $8, 1\r\n"                /* st1 */
+          "bltu     $9, $8, 6f\r\n"
+          "nop      \r\n"
+          "sb       $11, 0($10)\r\n"
+
+          "6:       \r\n"
+          "nop      \r\n"                     /* end */
+          : "=m" (*outptr)
+          : "f" (mmA), "f" (mmB), "f" (mmC), "f" (mmE), "f" (mmF),
+            "f" (mmG), "r" (col), "r" (outptr)
+          : "$f4", "$f6", "$f8", "$8", "$9", "$10", "$11", "memory"
+         );
+    }
+
+#else  /* RGB_PIXELSIZE == 4 */
+
+#ifdef RGBX_FILLER_0XFF
+    xe = _mm_cmpeq_pi8(xe, xe);
+    xo = _mm_cmpeq_pi8(xo, xo);
+#else
+    xe = _mm_xor_si64(xe, xe);
+    xo = _mm_xor_si64(xo, xo);
+#endif
+    /* mmA=(00 02 04 06 08 0A 0C 0E), mmB=(01 03 05 07 09 0B 0D 0F) */
+    /* mmC=(10 12 14 16 18 1A 1C 1E), mmD=(11 13 15 17 19 1B 1D 1F) */
+    /* mmE=(20 22 24 26 28 2A 2C 2E), mmF=(21 23 25 27 29 2B 2D 2F) */
+    /* mmG=(30 32 34 36 38 3A 3C 3E), mmH=(31 33 35 37 39 3B 3D 3F) */
+
+    mm8 = _mm_unpacklo_pi8(mmA, mmC);         /* (00 10 02 12 04 14 06 16) */
+    mm9 = _mm_unpackhi_pi8(mmA, mmC);         /* (08 18 0A 1A 0C 1C 0E 1E) */
+    mmA = _mm_unpacklo_pi8(mmE, mmG);         /* (20 30 22 32 24 34 26 36) */
+    mmE = _mm_unpackhi_pi8(mmE, mmG);         /* (28 38 2A 3A 2C 3C 2E 3E) */
+
+    mmG = _mm_unpacklo_pi8(mmB, mmD);         /* (01 11 03 13 05 15 07 17) */
+    mmB = _mm_unpackhi_pi8(mmB, mmD);         /* (09 19 0B 1B 0D 1D 0F 1F) */
+    mmD = _mm_unpacklo_pi8(mmF, mmH);         /* (21 31 23 33 25 35 27 37) */
+    mmF = _mm_unpackhi_pi8(mmF, mmH);         /* (29 39 2B 3B 2D 3D 2F 3F) */
+
+    mmH = _mm_unpacklo_pi16(mm8, mmA);        /* (00 10 20 30 02 12 22 32) */
+    mm8 = _mm_unpackhi_pi16(mm8, mmA);        /* (04 14 24 34 06 16 26 36) */
+    mmA = _mm_unpacklo_pi16(mmG, mmD);        /* (01 11 21 31 03 13 23 33) */
+    mmD = _mm_unpackhi_pi16(mmG, mmD);        /* (05 15 25 35 07 17 27 37) */
+
+    mmG = _mm_unpackhi_pi16(mm9, mmE);        /* (0C 1C 2C 3C 0E 1E 2E 3E) */
+    mm9 = _mm_unpacklo_pi16(mm9, mmE);        /* (08 18 28 38 0A 1A 2A 3A) */
+    mmE = _mm_unpacklo_pi16(mmB, mmF);        /* (09 19 29 39 0B 1B 2B 3B) */
+    mmF = _mm_unpackhi_pi16(mmB, mmF);        /* (0D 1D 2D 3D 0F 1F 2F 3F) */
+
+    mmB = _mm_unpackhi_pi32(mmH, mmA);        /* (02 12 22 32 03 13 23 33) */
+    mmA = _mm_unpacklo_pi32(mmH, mmA);        /* (00 10 20 30 01 11 21 31) */
+    mmC = _mm_unpacklo_pi32(mm8, mmD);        /* (04 14 24 34 05 15 25 35) */
+    mmD = _mm_unpackhi_pi32(mm8, mmD);        /* (06 16 26 36 07 17 27 37) */
+
+    mmH = _mm_unpackhi_pi32(mmG, mmF);        /* (0E 1E 2E 3E 0F 1F 2F 3F) */
+    mmG = _mm_unpacklo_pi32(mmG, mmF);        /* (0C 1C 2C 3C 0D 1D 2D 3D) */
+    mmF = _mm_unpackhi_pi32(mm9, mmE);        /* (0A 1A 2A 3A 0B 1B 2B 3B) */
+    mmE = _mm_unpacklo_pi32(mm9, mmE);        /* (08 18 28 38 09 19 29 39) */
+
+    if (num_cols >= 8) {
+      if (!(((long)outptr) & 7)) {
+        _mm_store_si64((__m64 *)outptr, mmA);
+        _mm_store_si64((__m64 *)(outptr + 8), mmB);
+        _mm_store_si64((__m64 *)(outptr + 16), mmC);
+        _mm_store_si64((__m64 *)(outptr + 24), mmD);
+        _mm_store_si64((__m64 *)(outptr + 32), mmE);
+        _mm_store_si64((__m64 *)(outptr + 40), mmF);
+        _mm_store_si64((__m64 *)(outptr + 48), mmG);
+        _mm_store_si64((__m64 *)(outptr + 56), mmH);
+      } else {
+        _mm_storeu_si64((__m64 *)outptr, mmA);
+        _mm_storeu_si64((__m64 *)(outptr + 8), mmB);
+        _mm_storeu_si64((__m64 *)(outptr + 16), mmC);
+        _mm_storeu_si64((__m64 *)(outptr + 24), mmD);
+        _mm_storeu_si64((__m64 *)(outptr + 32), mmE);
+        _mm_storeu_si64((__m64 *)(outptr + 40), mmF);
+        _mm_storeu_si64((__m64 *)(outptr + 48), mmG);
+        _mm_storeu_si64((__m64 *)(outptr + 56), mmH);
+      }
+      outptr += RGB_PIXELSIZE * 16;
+    } else {
+      if (output_width & 1)
+        col = num_cols * 2 + 1;
+      else
+        col = num_cols * 2;
+      asm(".set noreorder\r\n"                /* st32 */
+
+          "li       $8, 8\r\n"
+          "move     $9, %10\r\n"
+          "move     $10, %11\r\n"
+          "mov.s    $f4, %2\r\n"
+          "mov.s    $f6, %3\r\n"
+          "mov.s    $f8, %4\r\n"
+          "mov.s    $f10, %5\r\n"
+          "bltu     $9, $8, 1f\r\n"
+          "nop      \r\n"
+          "gssdlc1  $f4, 7($10)\r\n"
+          "gssdrc1  $f4, 0($10)\r\n"
+          "gssdlc1  $f6, 7+8($10)\r\n"
+          "gssdrc1  $f6, 8($10)\r\n"
+          "gssdlc1  $f8, 7+16($10)\r\n"
+          "gssdrc1  $f8, 16($10)\r\n"
+          "gssdlc1  $f10, 7+24($10)\r\n"
+          "gssdrc1  $f10, 24($10)\r\n"
+          "mov.s    $f4, %6\r\n"
+          "mov.s    $f6, %7\r\n"
+          "mov.s    $f8, %8\r\n"
+          "mov.s    $f10, %9\r\n"
+          "subu     $9, $9, 8\r\n"
+          PTR_ADDU  "$10, $10, 32\r\n"
+
+          "1:       \r\n"
+          "li       $8, 4\r\n"                /* st16 */
+          "bltu     $9, $8, 2f\r\n"
+          "nop      \r\n"
+          "gssdlc1  $f4, 7($10)\r\n"
+          "gssdrc1  $f4, 0($10)\r\n"
+          "gssdlc1  $f6, 7+8($10)\r\n"
+          "gssdrc1  $f6, 8($10)\r\n"
+          "mov.s    $f4, $f8\r\n"
+          "mov.s    $f6, $f10\r\n"
+          "subu     $9, $9, 4\r\n"
+          PTR_ADDU  "$10, $10, 16\r\n"
+
+          "2:       \r\n"
+          "li       $8, 2\r\n"                /* st8 */
+          "bltu     $9, $8, 3f\r\n"
+          "nop      \r\n"
+          "gssdlc1  $f4, 7($10)\r\n"
+          "gssdrc1  $f4, 0($10)\r\n"
+          "mov.s    $f4, $f6\r\n"
+          "subu     $9, $9, 2\r\n"
+          PTR_ADDU  "$10, $10, 8\r\n"
+
+          "3:       \r\n"
+          "li       $8, 1\r\n"                /* st4 */
+          "bltu     $9, $8, 4f\r\n"
+          "nop      \r\n"
+          "gsswlc1  $f4, 3($10)\r\n"
+          "gsswrc1  $f4, 0($10)\r\n"
+
+          "4:       \r\n"
+          "li       %1, 0\r\n"                /* end */
+          : "=m" (*outptr), "=r" (col)
+          : "f" (mmA), "f" (mmB), "f" (mmC), "f" (mmD), "f" (mmE), "f" (mmF),
+            "f" (mmG), "f" (mmH), "r" (col), "r" (outptr)
+          : "$f4", "$f6", "$f8", "$f10", "$8", "$9", "$10", "memory"
+         );
+    }
+
+#endif
+
+  }
+
+  if (!((output_width >> 1) & 7)) {
+    if (output_width & 1) {
+      cb = _mm_load_si64((__m64 *)inptr1);
+      cr = _mm_load_si64((__m64 *)inptr2);
+      y = _mm_load_si64((__m64 *)inptr0);
+
+      decenter = 0.0;
+      decenter = _mm_cmpeq_pi16(decenter, decenter);
+      decenter = _mm_slli_pi16(decenter, 7);  /* {0xFF80 0xFF80 0xFF80 0xFF80} */
+
+      cbl = _mm_unpacklo_pi8(cb, zero);       /* Cb(0123) */
+      crl = _mm_unpacklo_pi8(cr, zero);       /* Cr(0123) */
+      cbl = _mm_add_pi16(cbl, decenter);
+      crl = _mm_add_pi16(crl, decenter);
+
+      cbl2 = _mm_add_pi16(cbl, cbl);          /* 2*CbL */
+      crl2 = _mm_add_pi16(crl, crl);          /* 2*CrL */
+      bl = _mm_mulhi_pi16(cbl2, PW_MF0228);   /* (2*CbL * -FIX(0.22800) */
+      rl = _mm_mulhi_pi16(crl2, PW_F0402);    /* (2*CrL * FIX(0.40200)) */
+
+      bl = _mm_add_pi16(bl, PW_ONE);
+      bl = _mm_srai_pi16(bl, 1);              /* (CbL * -FIX(0.22800)) */
+      rl = _mm_add_pi16(rl, PW_ONE);
+      rl = _mm_srai_pi16(rl, 1);              /* (CrL * FIX(0.40200)) */
+
+      bl = _mm_add_pi16(bl, cbl);
+      bl = _mm_add_pi16(bl, cbl);             /* (CbL * FIX(1.77200))=(B-Y)L */
+      rl = _mm_add_pi16(rl, crl);             /* (CrL * FIX(1.40200))=(R-Y)L */
+
+      gl = _mm_unpacklo_pi16(cbl, crl);
+      gl = _mm_madd_pi16(gl, PW_MF0344_F0285);
+      gl = _mm_add_pi32(gl, PD_ONEHALF);
+      gl = _mm_srai_pi32(gl, SCALEBITS);
+      gl = _mm_packs_pi32(gl, zero);       /* CbL*-FIX(0.344)+CrL*FIX(0.285) */
+      gl = _mm_sub_pi16(gl, crl);  /* CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L */
+
+      yl = _mm_unpacklo_pi8(y, zero);         /* Y(0123) */
+      rl = _mm_add_pi16(rl, yl);              /* (R0 R1 R2 R3) */
+      gl = _mm_add_pi16(gl, yl);              /* (G0 G1 G2 G3) */
+      bl = _mm_add_pi16(bl, yl);              /* (B0 B1 B2 B3) */
+      re = _mm_packs_pu16(rl, rl);
+      ge = _mm_packs_pu16(gl, gl);
+      be = _mm_packs_pu16(bl, bl);
+#if RGB_PIXELSIZE == 3
+      mmA = _mm_unpacklo_pi8(mmA, mmC);
+      mmA = _mm_unpacklo_pi16(mmA, mmE);
+      asm(".set noreorder\r\n"
+
+          "move    $8, %2\r\n"
+          "mov.s   $f4, %1\r\n"
+          "mfc1    $9, $f4\r\n"
+          "ush     $9, 0($8)\r\n"
+          "srl     $9, 16\r\n"
+          "sb      $9, 2($8)\r\n"
+          : "=m" (*outptr)
+          : "f" (mmA), "r" (outptr)
+          : "$f4", "$8", "$9", "memory"
+         );
+#else  /* RGB_PIXELSIZE == 4 */
+
+#ifdef RGBX_FILLER_0XFF
+      xe = _mm_cmpeq_pi8(xe, xe);
+#else
+      xe = _mm_xor_si64(xe, xe);
+#endif
+      mmA = _mm_unpacklo_pi8(mmA, mmC);
+      mmE = _mm_unpacklo_pi8(mmE, mmG);
+      mmA = _mm_unpacklo_pi16(mmA, mmE);
+      asm(".set noreorder\r\n"
+
+          "move    $8, %2\r\n"
+          "mov.s   $f4, %1\r\n"
+          "gsswlc1 $f4, 3($8)\r\n"
+          "gsswrc1 $f4, 0($8)\r\n"
+          : "=m" (*outptr)
+          : "f" (mmA), "r" (outptr)
+          : "$f4", "$8", "memory"
+         );
+#endif
+    }
+  }
+}
+
+
+void jsimd_h2v2_merged_upsample_mmi(JDIMENSION output_width,
+                                    JSAMPIMAGE input_buf,
+                                    JDIMENSION in_row_group_ctr,
+                                    JSAMPARRAY output_buf)
+{
+  JSAMPROW inptr, outptr;
+
+  inptr = input_buf[0][in_row_group_ctr];
+  outptr = output_buf[0];
+
+  input_buf[0][in_row_group_ctr] = input_buf[0][in_row_group_ctr * 2];
+  jsimd_h2v1_merged_upsample_mmi(output_width, input_buf, in_row_group_ctr,
+                                 output_buf);
+
+  input_buf[0][in_row_group_ctr] = input_buf[0][in_row_group_ctr * 2 + 1];
+  output_buf[0] = output_buf[1];
+  jsimd_h2v1_merged_upsample_mmi(output_width, input_buf, in_row_group_ctr,
+                                 output_buf);
+
+  input_buf[0][in_row_group_ctr] = inptr;
+  output_buf[0] = outptr;
+}
+
+
+#undef mmA
+#undef mmB
+#undef mmC
+#undef mmD
+#undef mmE
+#undef mmF
+#undef mmG
+#undef mmH
diff --git a/media/libjpeg/simd/mips64/jdsample-mmi.c b/media/libjpeg/simd/mips64/jdsample-mmi.c
new file mode 100644
index 0000000000..8ae94e7dcf
--- /dev/null
+++ b/media/libjpeg/simd/mips64/jdsample-mmi.c
@@ -0,0 +1,304 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2015, 2018-2019, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
+ *                          All Rights Reserved.
+ * Authors:  ZhuChen     <zhuchen@loongson.cn>
+ *           CaiWanwei   <caiwanwei@loongson.cn>
+ *           SunZhangzhi <sunzhangzhi-cq@loongson.cn>
+ *           ZhangLixia  <zhanglixia-hf@loongson.cn>
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* CHROMA UPSAMPLING */
+
+#include "jsimd_mmi.h"
+
+
+enum const_index {
+  index_PW_ONE,
+  index_PW_TWO,
+  index_PW_THREE,
+  index_PW_SEVEN,
+  index_PW_EIGHT,
+};
+
+static uint64_t const_value[] = {
+  _uint64_set_pi16(1, 1, 1, 1),
+  _uint64_set_pi16(2, 2, 2, 2),
+  _uint64_set_pi16(3, 3, 3, 3),
+  _uint64_set_pi16(7, 7, 7, 7),
+  _uint64_set_pi16(8, 8, 8, 8),
+};
+
+#define PW_ONE    get_const_value(index_PW_ONE)
+#define PW_TWO    get_const_value(index_PW_TWO)
+#define PW_THREE  get_const_value(index_PW_THREE)
+#define PW_SEVEN  get_const_value(index_PW_SEVEN)
+#define PW_EIGHT  get_const_value(index_PW_EIGHT)
+
+
+#define PROCESS_ROW(row, wkoffset, bias1, bias2, shift) { \
+  __m64 samp123X, samp3XXX, samp1234, sampX012, samp_1012; \
+  __m64 sampXXX4, sampX456, samp3456, samp567X, samp7XXX, samp5678; \
+  __m64 outle, outhe, outlo, outho, outl, outh; \
+  \
+  samp123X = _mm_srli_si64(samp0123, 2 * BYTE_BIT);  /* ( 1 2 3 -) */ \
+  sampXXX4 = _mm_slli_si64(samp4567, (SIZEOF_MMWORD - 2) * BYTE_BIT);  /* ( - - - 4) */ \
+  samp3XXX = _mm_srli_si64(samp0123, (SIZEOF_MMWORD - 2) * BYTE_BIT);  /* ( 3 - - -) */ \
+  sampX456 = _mm_slli_si64(samp4567, 2 * BYTE_BIT);  /* ( - 4 5 6) */ \
+  \
+  samp1234 = _mm_or_si64(samp123X, sampXXX4);  /* ( 1 2 3 4) */ \
+  samp3456 = _mm_or_si64(samp3XXX, sampX456);  /* ( 3 4 5 6) */ \
+  \
+  sampX012 = _mm_slli_si64(samp0123, 2 * BYTE_BIT);  /* ( - 0 1 2) */ \
+  samp567X = _mm_srli_si64(samp4567, 2 * BYTE_BIT);  /* ( 5 6 7 -) */ \
+  samp7XXX = _mm_srli_si64(samp4567, (SIZEOF_MMWORD - 2) * BYTE_BIT);  /* ( 7 - - -) */ \
+  \
+  samp_1012 = _mm_or_si64(sampX012, wk[row]);            /* (-1 0 1 2) */ \
+  samp5678 = _mm_or_si64(samp567X, wk[row + wkoffset]);  /* ( 5 6 7 8) */ \
+  \
+  wk[row] = samp7XXX; \
+  \
+  samp0123 = _mm_mullo_pi16(samp0123, PW_THREE); \
+  samp4567 = _mm_mullo_pi16(samp4567, PW_THREE); \
+  samp_1012 = _mm_add_pi16(samp_1012, bias1); \
+  samp3456 = _mm_add_pi16(samp3456, bias1); \
+  samp1234 = _mm_add_pi16(samp1234, bias2); \
+  samp5678 = _mm_add_pi16(samp5678, bias2); \
+  \
+  outle = _mm_add_pi16(samp_1012, samp0123); \
+  outhe = _mm_add_pi16(samp3456, samp4567); \
+  outle = _mm_srli_pi16(outle, shift);        /* ( 0  2  4  6) */ \
+  outhe = _mm_srli_pi16(outhe, shift);        /* ( 8 10 12 14) */ \
+  outlo = _mm_add_pi16(samp1234, samp0123); \
+  outho = _mm_add_pi16(samp5678, samp4567); \
+  outlo = _mm_srli_pi16(outlo, shift);        /* ( 1  3  5  7) */ \
+  outho = _mm_srli_pi16(outho, shift);        /* ( 9 11 13 15) */ \
+  \
+  outlo = _mm_slli_pi16(outlo, BYTE_BIT); \
+  outho = _mm_slli_pi16(outho, BYTE_BIT); \
+  outl = _mm_or_si64(outle, outlo);           /* ( 0  1  2  3  4  5  6  7) */ \
+  outh = _mm_or_si64(outhe, outho);           /* ( 8  9 10 11 12 13 14 15) */ \
+  \
+  _mm_store_si64((__m64 *)outptr##row, outl); \
+  _mm_store_si64((__m64 *)outptr##row + 1, outh); \
+}
+
+void jsimd_h2v2_fancy_upsample_mmi(int max_v_samp_factor,
+                                   JDIMENSION downsampled_width,
+                                   JSAMPARRAY input_data,
+                                   JSAMPARRAY *output_data_ptr)
+{
+  JSAMPARRAY output_data = *output_data_ptr;
+  JSAMPROW inptr_1, inptr0, inptr1, outptr0, outptr1;
+  int inrow, outrow, incol, tmp, tmp1;
+  __m64 this_1l, this_1h, this_1, thiscolsum_1l, thiscolsum_1h;
+  __m64 this0l, this0h, this0;
+  __m64 this1l, this1h, this1, thiscolsum1l, thiscolsum1h;
+  __m64 next_1l, next_1h, next_1, nextcolsum_1l, nextcolsum_1h;
+  __m64 next0l, next0h, next0;
+  __m64 next1l, next1h, next1, nextcolsum1l, nextcolsum1h;
+  __m64 mask0 = 0.0, masklast, samp0123, samp4567, wk[4], zero = 0.0;
+
+  mask0 = _mm_cmpeq_pi8(mask0, mask0);
+  masklast = _mm_slli_si64(mask0, (SIZEOF_MMWORD - 2) * BYTE_BIT);
+  mask0 = _mm_srli_si64(mask0, (SIZEOF_MMWORD - 2) * BYTE_BIT);
+
+  for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) {
+
+    inptr_1 = input_data[inrow - 1];
+    inptr0 = input_data[inrow];
+    inptr1 = input_data[inrow + 1];
+    outptr0 = output_data[outrow++];
+    outptr1 = output_data[outrow++];
+
+    if (downsampled_width & 7) {
+      tmp = (downsampled_width - 1) * sizeof(JSAMPLE);
+      tmp1 = downsampled_width * sizeof(JSAMPLE);
+      asm(PTR_ADDU  "$8, %3, %6\r\n"
+          "lb       $9, ($8)\r\n"
+          PTR_ADDU  "$8, %3, %7\r\n"
+          "sb       $9, ($8)\r\n"
+          PTR_ADDU  "$8, %4, %6\r\n"
+          "lb       $9, ($8)\r\n"
+          PTR_ADDU  "$8, %4, %7\r\n"
+          "sb       $9, ($8)\r\n"
+          PTR_ADDU  "$8, %5, %6\r\n"
+          "lb       $9, ($8)\r\n"
+          PTR_ADDU  "$8, %5, %7\r\n"
+          "sb       $9, ($8)\r\n"
+          : "=m" (*inptr_1), "=m" (*inptr0), "=m" (*inptr1)
+          : "r" (inptr_1), "r" (inptr0), "r" (inptr1), "r" (tmp), "r" (tmp1)
+          : "$8", "$9"
+         );
+    }
+
+    /* process the first column block */
+    this0 = _mm_load_si64((__m64 *)inptr0);    /* row[ 0][0] */
+    this_1 = _mm_load_si64((__m64 *)inptr_1);  /* row[-1][0] */
+    this1 = _mm_load_si64((__m64 *)inptr1);    /* row[ 1][0] */
+
+    this0l = _mm_unpacklo_pi8(this0, zero);    /* row[ 0][0]( 0 1 2 3) */
+    this0h = _mm_unpackhi_pi8(this0, zero);    /* row[ 0][0]( 4 5 6 7) */
+    this_1l = _mm_unpacklo_pi8(this_1, zero);  /* row[-1][0]( 0 1 2 3) */
+    this_1h = _mm_unpackhi_pi8(this_1, zero);  /* row[-1][0]( 4 5 6 7) */
+    this1l = _mm_unpacklo_pi8(this1, zero);    /* row[+1][0]( 0 1 2 3) */
+    this1h = _mm_unpackhi_pi8(this1, zero);    /* row[+1][0]( 4 5 6 7) */
+
+    this0l = _mm_mullo_pi16(this0l, PW_THREE);
+    this0h = _mm_mullo_pi16(this0h, PW_THREE);
+
+    thiscolsum_1l = _mm_add_pi16(this_1l, this0l);  /* ( 0 1 2 3) */
+    thiscolsum_1h = _mm_add_pi16(this_1h, this0h);  /* ( 4 5 6 7) */
+    thiscolsum1l = _mm_add_pi16(this0l, this1l);    /* ( 0 1 2 3) */
+    thiscolsum1h = _mm_add_pi16(this0h, this1h);    /* ( 4 5 6 7) */
+
+    /* temporarily save the intermediate data */
+    _mm_store_si64((__m64 *)outptr0, thiscolsum_1l);
+    _mm_store_si64((__m64 *)outptr0 + 1, thiscolsum_1h);
+    _mm_store_si64((__m64 *)outptr1, thiscolsum1l);
+    _mm_store_si64((__m64 *)outptr1 + 1, thiscolsum1h);
+
+    wk[0] = _mm_and_si64(thiscolsum_1l, mask0);  /* ( 0 - - -) */
+    wk[1] = _mm_and_si64(thiscolsum1l, mask0);   /* ( 0 - - -) */
+
+    for (incol = downsampled_width; incol > 0;
+         incol -= 8, inptr_1 += 8, inptr0 += 8, inptr1 += 8,
+         outptr0 += 16, outptr1 += 16) {
+
+      if (incol > 8) {
+        /* process the next column block */
+        next0 = _mm_load_si64((__m64 *)inptr0 + 1);    /* row[ 0][1] */
+        next_1 = _mm_load_si64((__m64 *)inptr_1 + 1);  /* row[-1][1] */
+        next1 = _mm_load_si64((__m64 *)inptr1 + 1);    /* row[+1][1] */
+
+        next0l = _mm_unpacklo_pi8(next0, zero);    /* row[ 0][1]( 0 1 2 3) */
+        next0h = _mm_unpackhi_pi8(next0, zero);    /* row[ 0][1]( 4 5 6 7) */
+        next_1l = _mm_unpacklo_pi8(next_1, zero);  /* row[-1][1]( 0 1 2 3) */
+        next_1h = _mm_unpackhi_pi8(next_1, zero);  /* row[-1][1]( 4 5 6 7) */
+        next1l = _mm_unpacklo_pi8(next1, zero);    /* row[+1][1]( 0 1 2 3) */
+        next1h = _mm_unpackhi_pi8(next1, zero);    /* row[+1][1]( 4 5 6 7) */
+
+        next0l = _mm_mullo_pi16(next0l, PW_THREE);
+        next0h = _mm_mullo_pi16(next0h, PW_THREE);
+
+        nextcolsum_1l = _mm_add_pi16(next_1l, next0l);  /* ( 0 1 2 3) */
+        nextcolsum_1h = _mm_add_pi16(next_1h, next0h);  /* ( 4 5 6 7) */
+        nextcolsum1l = _mm_add_pi16(next0l, next1l);    /* ( 0 1 2 3) */
+        nextcolsum1h = _mm_add_pi16(next0h, next1h);    /* ( 4 5 6 7) */
+
+        /* temporarily save the intermediate data */
+        _mm_store_si64((__m64 *)outptr0 + 2, nextcolsum_1l);
+        _mm_store_si64((__m64 *)outptr0 + 3, nextcolsum_1h);
+        _mm_store_si64((__m64 *)outptr1 + 2, nextcolsum1l);
+        _mm_store_si64((__m64 *)outptr1 + 3, nextcolsum1h);
+
+        wk[2] = _mm_slli_si64(nextcolsum_1l, (SIZEOF_MMWORD - 2) * BYTE_BIT);  /* ( - - - 0) */
+        wk[3] = _mm_slli_si64(nextcolsum1l, (SIZEOF_MMWORD - 2) * BYTE_BIT);   /* ( - - - 0) */
+      } else {
+        __m64 tmp;
+
+        /* process the last column block */
+        tmp = _mm_load_si64((__m64 *)outptr0 + 1);
+        wk[2] = _mm_and_si64(masklast, tmp);        /* ( - - - 7) */
+        tmp = _mm_load_si64((__m64 *)outptr1 + 1);
+        wk[3] = _mm_and_si64(masklast, tmp);        /* ( - - - 7) */
+      }
+
+      /* process the upper row */
+      samp0123 = _mm_load_si64((__m64 *)outptr0);      /* ( 0 1 2 3) */ \
+      samp4567 = _mm_load_si64((__m64 *)outptr0 + 1);  /* ( 4 5 6 7) */ \
+      PROCESS_ROW(0, 2, PW_EIGHT, PW_SEVEN, 4)
+
+      /* process the lower row */
+      samp0123 = _mm_load_si64((__m64 *)outptr1);      /* ( 0 1 2 3) */ \
+      samp4567 = _mm_load_si64((__m64 *)outptr1 + 1);  /* ( 4 5 6 7) */ \
+      PROCESS_ROW(1, 2, PW_EIGHT, PW_SEVEN, 4)
+    }
+  }
+}
+
+
+void jsimd_h2v1_fancy_upsample_mmi(int max_v_samp_factor,
+                                   JDIMENSION downsampled_width,
+                                   JSAMPARRAY input_data,
+                                   JSAMPARRAY *output_data_ptr)
+{
+  JSAMPARRAY output_data = *output_data_ptr;
+  JSAMPROW inptr0, outptr0;
+  int inrow, incol, tmp, tmp1;
+  __m64 thisl, this, nextl, next;
+  __m64 mask0 = 0.0, masklast, samp0123, samp4567, wk[2], zero = 0.0;
+
+  mask0 = _mm_cmpeq_pi8(mask0, mask0);
+  masklast = _mm_slli_si64(mask0, (SIZEOF_MMWORD - 2) * BYTE_BIT);
+  mask0 = _mm_srli_si64(mask0, (SIZEOF_MMWORD - 2) * BYTE_BIT);
+
+  for (inrow = 0; inrow < max_v_samp_factor; inrow++) {
+
+    inptr0 = input_data[inrow];
+    outptr0 = output_data[inrow];
+
+    if (downsampled_width & 7) {
+      tmp = (downsampled_width - 1) * sizeof(JSAMPLE);
+      tmp1 = downsampled_width * sizeof(JSAMPLE);
+      asm(PTR_ADDU  "$8, %1, %2\r\n"
+          "lb       $9, ($8)\r\n"
+          PTR_ADDU  "$8, %1, %3\r\n"
+          "sb       $9, ($8)\r\n"
+          : "=m" (*inptr0)
+          : "r" (inptr0), "r" (tmp), "r" (tmp1)
+          : "$8", "$9"
+         );
+    }
+
+    /* process the first column block */
+    this = _mm_load_si64((__m64 *)inptr0);    /* row[ 0][0] */
+    thisl = _mm_unpacklo_pi8(this, zero);     /* row[ 0][0]( 0 1 2 3) */
+    wk[0] = _mm_and_si64(thisl, mask0);       /* ( 0 - - -) */
+
+    for (incol = downsampled_width; incol > 0;
+         incol -= 8, inptr0 += 8, outptr0 += 16) {
+
+      if (incol > 8) {
+        /* process the next column block */
+        next = _mm_load_si64((__m64 *)inptr0 + 1);  /* row[ 0][1] */
+        nextl = _mm_unpacklo_pi8(next, zero);       /* row[ 0][1]( 0 1 2 3) */
+        wk[1] = _mm_slli_si64(nextl, (SIZEOF_MMWORD - 2) * BYTE_BIT);  /* ( - - - 0) */
+      } else {
+        __m64 thish;
+
+        /* process the last column block */
+        this = _mm_load_si64((__m64 *)inptr0);  /* row[ 0][0] */
+        thish = _mm_unpackhi_pi8(this, zero);   /* row[ 0][1]( 4 5 6 7) */
+        wk[1] = _mm_and_si64(masklast, thish);  /* ( - - - 7) */
+      }
+
+      /* process the row */
+      this = _mm_load_si64((__m64 *)inptr0);    /* row[ 0][0] */
+      samp0123 = _mm_unpacklo_pi8(this, zero);  /* ( 0 1 2 3) */
+      samp4567 = _mm_unpackhi_pi8(this, zero);  /* ( 4 5 6 7) */
+      PROCESS_ROW(0, 1, PW_ONE, PW_TWO, 2)
+    }
+  }
+}
diff --git a/media/libjpeg/simd/mips64/jfdctfst-mmi.c b/media/libjpeg/simd/mips64/jfdctfst-mmi.c
new file mode 100644
index 0000000000..f7caf09a88
--- /dev/null
+++ b/media/libjpeg/simd/mips64/jfdctfst-mmi.c
@@ -0,0 +1,255 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014, 2018-2019, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
+ *                          All Rights Reserved.
+ * Authors:  LiuQingfa <liuqingfa-hf@loongson.cn>
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* FAST INTEGER FORWARD DCT */
+
+#include "jsimd_mmi.h"
+
+
+#define CONST_BITS  8
+
+#define F_0_382  ((short)98)   /* FIX(0.382683433) */
+#define F_0_541  ((short)139)  /* FIX(0.541196100) */
+#define F_0_707  ((short)181)  /* FIX(0.707106781) */
+#define F_1_306  ((short)334)  /* FIX(1.306562965) */
+
+#define PRE_MULTIPLY_SCALE_BITS  2
+#define CONST_SHIFT  (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
+
+enum const_index {
+  index_PW_F0707,
+  index_PW_F0382,
+  index_PW_F0541,
+  index_PW_F1306
+};
+
+static uint64_t const_value[] = {
+  _uint64_set1_pi16(F_0_707),
+  _uint64_set1_pi16(F_0_382),
+  _uint64_set1_pi16(F_0_541),
+  _uint64_set1_pi16(F_1_306)
+};
+
+#define PW_F0707  get_const_value(index_PW_F0707)
+#define PW_F0382  get_const_value(index_PW_F0382)
+#define PW_F0541  get_const_value(index_PW_F0541)
+#define PW_F1306  get_const_value(index_PW_F1306)
+
+
+#define DO_FDCT_MULTIPLY(out, in, multiplier) { \
+  __m64 mulhi, mullo, mul12, mul34; \
+  \
+  mullo = _mm_mullo_pi16(in, multiplier); \
+  mulhi = _mm_mulhi_pi16(in, multiplier); \
+  mul12 = _mm_unpacklo_pi16(mullo, mulhi); \
+  mul34 = _mm_unpackhi_pi16(mullo, mulhi); \
+  mul12 = _mm_srai_pi32(mul12, CONST_BITS); \
+  mul34 = _mm_srai_pi32(mul34, CONST_BITS); \
+  out = _mm_packs_pi32(mul12, mul34); \
+}
+
+#define DO_FDCT_COMMON() { \
+  \
+  /* Even part */ \
+  \
+  tmp10 = _mm_add_pi16(tmp0, tmp3); \
+  tmp13 = _mm_sub_pi16(tmp0, tmp3); \
+  tmp11 = _mm_add_pi16(tmp1, tmp2); \
+  tmp12 = _mm_sub_pi16(tmp1, tmp2); \
+  \
+  out0 = _mm_add_pi16(tmp10, tmp11); \
+  out4 = _mm_sub_pi16(tmp10, tmp11); \
+  \
+  z1 = _mm_add_pi16(tmp12, tmp13); \
+  DO_FDCT_MULTIPLY(z1, z1, PW_F0707) \
+  \
+  out2 = _mm_add_pi16(tmp13, z1); \
+  out6 = _mm_sub_pi16(tmp13, z1); \
+  \
+  /* Odd part */ \
+  \
+  tmp10 = _mm_add_pi16(tmp4, tmp5); \
+  tmp11 = _mm_add_pi16(tmp5, tmp6); \
+  tmp12 = _mm_add_pi16(tmp6, tmp7); \
+  \
+  z5 = _mm_sub_pi16(tmp10, tmp12); \
+  DO_FDCT_MULTIPLY(z5, z5, PW_F0382) \
+  \
+  DO_FDCT_MULTIPLY(z2, tmp10, PW_F0541) \
+  z2 = _mm_add_pi16(z2, z5); \
+  \
+  DO_FDCT_MULTIPLY(z4, tmp12, PW_F1306) \
+  z4 = _mm_add_pi16(z4, z5); \
+  \
+  DO_FDCT_MULTIPLY(z3, tmp11, PW_F0707) \
+  \
+  z11 = _mm_add_pi16(tmp7, z3); \
+  z13 = _mm_sub_pi16(tmp7, z3); \
+  \
+  out5 = _mm_add_pi16(z13, z2); \
+  out3 = _mm_sub_pi16(z13, z2); \
+  out1 = _mm_add_pi16(z11, z4); \
+  out7 = _mm_sub_pi16(z11, z4); \
+}
+
+#define DO_FDCT_PASS1() { \
+  __m64 row0l, row0h, row1l, row1h, row2l, row2h, row3l, row3h; \
+  __m64 row01a, row01b, row01c, row01d, row23a, row23b, row23c, row23d; \
+  __m64 col0, col1, col2, col3, col4, col5, col6, col7; \
+  \
+  row0l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 0]);     /* (00 01 02 03) */ \
+  row0h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 0 + 4]); /* (04 05 06 07) */ \
+  row1l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 1]);     /* (10 11 12 13) */ \
+  row1h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 1 + 4]); /* (14 15 16 17) */ \
+  row2l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 2]);     /* (20 21 22 23) */ \
+  row2h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 2 + 4]); /* (24 25 26 27) */ \
+  row3l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 3]);     /* (30 31 32 33) */ \
+  row3h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 3 + 4]); /* (34 35 36 37) */ \
+  \
+  /* Transpose coefficients */ \
+  \
+  row23a = _mm_unpacklo_pi16(row2l, row3l);   /* row23a=(20 30 21 31) */ \
+  row23b = _mm_unpackhi_pi16(row2l, row3l);   /* row23b=(22 32 23 33) */ \
+  row23c = _mm_unpacklo_pi16(row2h, row3h);   /* row23c=(24 34 25 35) */ \
+  row23d = _mm_unpackhi_pi16(row2h, row3h);   /* row23d=(26 36 27 37) */ \
+  \
+  row01a = _mm_unpacklo_pi16(row0l, row1l);   /* row01a=(00 10 01 11) */ \
+  row01b = _mm_unpackhi_pi16(row0l, row1l);   /* row01b=(02 12 03 13) */ \
+  row01c = _mm_unpacklo_pi16(row0h, row1h);   /* row01c=(04 14 05 15) */ \
+  row01d = _mm_unpackhi_pi16(row0h, row1h);   /* row01d=(06 16 07 17) */ \
+  \
+  col0 = _mm_unpacklo_pi32(row01a, row23a);   /* col0=(00 10 20 30) */ \
+  col1 = _mm_unpackhi_pi32(row01a, row23a);   /* col1=(01 11 21 31) */ \
+  col6 = _mm_unpacklo_pi32(row01d, row23d);   /* col6=(06 16 26 36) */ \
+  col7 = _mm_unpackhi_pi32(row01d, row23d);   /* col7=(07 17 27 37) */ \
+  \
+  tmp6 = _mm_sub_pi16(col1, col6);            /* tmp6=col1-col6 */ \
+  tmp7 = _mm_sub_pi16(col0, col7);            /* tmp7=col0-col7 */ \
+  tmp1 = _mm_add_pi16(col1, col6);            /* tmp1=col1+col6 */ \
+  tmp0 = _mm_add_pi16(col0, col7);            /* tmp0=col0+col7 */ \
+  \
+  col2 = _mm_unpacklo_pi32(row01b, row23b);   /* col2=(02 12 22 32) */ \
+  col3 = _mm_unpackhi_pi32(row01b, row23b);   /* col3=(03 13 23 33) */ \
+  col4 = _mm_unpacklo_pi32(row01c, row23c);   /* col4=(04 14 24 34) */ \
+  col5 = _mm_unpackhi_pi32(row01c, row23c);   /* col5=(05 15 25 35) */ \
+  \
+  tmp3 = _mm_add_pi16(col3, col4);            /* tmp3=col3+col4 */ \
+  tmp2 = _mm_add_pi16(col2, col5);            /* tmp2=col2+col5 */ \
+  tmp4 = _mm_sub_pi16(col3, col4);            /* tmp4=col3-col4 */ \
+  tmp5 = _mm_sub_pi16(col2, col5);            /* tmp5=col2-col5 */ \
+  \
+  DO_FDCT_COMMON() \
+  \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 0], out0); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 0 + 4], out4); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 1], out1); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 1 + 4], out5); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 2], out2); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 2 + 4], out6); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 3], out3); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 3 + 4], out7); \
+}
+
+#define DO_FDCT_PASS2() { \
+  __m64 col0l, col0h, col1l, col1h, col2l, col2h, col3l, col3h; \
+  __m64 col01a, col01b, col01c, col01d, col23a, col23b, col23c, col23d; \
+  __m64 row0, row1, row2, row3, row4, row5, row6, row7; \
+  \
+  col0l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 0]);  /* (00 10 20 30) */ \
+  col1l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 1]);  /* (01 11 21 31) */ \
+  col2l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 2]);  /* (02 12 22 32) */ \
+  col3l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 3]);  /* (03 13 23 33) */ \
+  col0h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 4]);  /* (40 50 60 70) */ \
+  col1h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 5]);  /* (41 51 61 71) */ \
+  col2h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 6]);  /* (42 52 62 72) */ \
+  col3h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 7]);  /* (43 53 63 73) */ \
+  \
+  /* Transpose coefficients */ \
+  \
+  col23a = _mm_unpacklo_pi16(col2l, col3l);   /* col23a=(02 03 12 13) */ \
+  col23b = _mm_unpackhi_pi16(col2l, col3l);   /* col23b=(22 23 32 33) */ \
+  col23c = _mm_unpacklo_pi16(col2h, col3h);   /* col23c=(42 43 52 53) */ \
+  col23d = _mm_unpackhi_pi16(col2h, col3h);   /* col23d=(62 63 72 73) */ \
+  \
+  col01a = _mm_unpacklo_pi16(col0l, col1l);   /* col01a=(00 01 10 11) */ \
+  col01b = _mm_unpackhi_pi16(col0l, col1l);   /* col01b=(20 21 30 31) */ \
+  col01c = _mm_unpacklo_pi16(col0h, col1h);   /* col01c=(40 41 50 51) */ \
+  col01d = _mm_unpackhi_pi16(col0h, col1h);   /* col01d=(60 61 70 71) */ \
+  \
+  row0 = _mm_unpacklo_pi32(col01a, col23a);   /* row0=(00 01 02 03) */ \
+  row1 = _mm_unpackhi_pi32(col01a, col23a);   /* row1=(10 11 12 13) */ \
+  row6 = _mm_unpacklo_pi32(col01d, col23d);   /* row6=(60 61 62 63) */ \
+  row7 = _mm_unpackhi_pi32(col01d, col23d);   /* row7=(70 71 72 73) */ \
+  \
+  tmp6 = _mm_sub_pi16(row1, row6);            /* tmp6=row1-row6 */ \
+  tmp7 = _mm_sub_pi16(row0, row7);            /* tmp7=row0-row7 */ \
+  tmp1 = _mm_add_pi16(row1, row6);            /* tmp1=row1+row6 */ \
+  tmp0 = _mm_add_pi16(row0, row7);            /* tmp0=row0+row7 */ \
+  \
+  row2 = _mm_unpacklo_pi32(col01b, col23b);   /* row2=(20 21 22 23) */ \
+  row3 = _mm_unpackhi_pi32(col01b, col23b);   /* row3=(30 31 32 33) */ \
+  row4 = _mm_unpacklo_pi32(col01c, col23c);   /* row4=(40 41 42 43) */ \
+  row5 = _mm_unpackhi_pi32(col01c, col23c);   /* row5=(50 51 52 53) */ \
+  \
+  tmp3 = _mm_add_pi16(row3, row4);            /* tmp3=row3+row4 */ \
+  tmp2 = _mm_add_pi16(row2, row5);            /* tmp2=row2+row5 */ \
+  tmp4 = _mm_sub_pi16(row3, row4);            /* tmp4=row3-row4 */ \
+  tmp5 = _mm_sub_pi16(row2, row5);            /* tmp5=row2-row5 */ \
+  \
+  DO_FDCT_COMMON() \
+  \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 0], out0); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 1], out1); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 2], out2); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 3], out3); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 4], out4); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 5], out5); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 6], out6); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 7], out7); \
+}
+
+void jsimd_fdct_ifast_mmi(DCTELEM *data)
+{
+  __m64 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  __m64 out0, out1, out2, out3, out4, out5, out6, out7;
+  __m64 tmp10, tmp11, tmp12, tmp13, z1, z2, z3, z4, z5, z11, z13;
+  DCTELEM *dataptr = data;
+
+  /* Pass 1: process rows. */
+
+  DO_FDCT_PASS1()
+  dataptr += DCTSIZE * 4;
+  DO_FDCT_PASS1()
+
+  /* Pass 2: process columns. */
+
+  dataptr = data;
+  DO_FDCT_PASS2()
+  dataptr += 4;
+  DO_FDCT_PASS2()
+}
diff --git a/media/libjpeg/simd/mips64/jfdctint-mmi.c b/media/libjpeg/simd/mips64/jfdctint-mmi.c
new file mode 100644
index 0000000000..7f4dfe9123
--- /dev/null
+++ b/media/libjpeg/simd/mips64/jfdctint-mmi.c
@@ -0,0 +1,398 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014, 2018, 2020, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
+ *                          All Rights Reserved.
+ * Authors:  ZhuChen     <zhuchen@loongson.cn>
+ *           CaiWanwei   <caiwanwei@loongson.cn>
+ *           SunZhangzhi <sunzhangzhi-cq@loongson.cn>
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* ACCURATE INTEGER FORWARD DCT */
+
+#include "jsimd_mmi.h"
+
+
+#define CONST_BITS  13
+#define PASS1_BITS  2
+#define DESCALE_P1  (CONST_BITS - PASS1_BITS)
+#define DESCALE_P2  (CONST_BITS + PASS1_BITS)
+
+#define FIX_0_298  ((short)2446)   /* FIX(0.298631336) */
+#define FIX_0_390  ((short)3196)   /* FIX(0.390180644) */
+#define FIX_0_541  ((short)4433)   /* FIX(0.541196100) */
+#define FIX_0_765  ((short)6270)   /* FIX(0.765366865) */
+#define FIX_0_899  ((short)7373)   /* FIX(0.899976223) */
+#define FIX_1_175  ((short)9633)   /* FIX(1.175875602) */
+#define FIX_1_501  ((short)12299)  /* FIX(1.501321110) */
+#define FIX_1_847  ((short)15137)  /* FIX(1.847759065) */
+#define FIX_1_961  ((short)16069)  /* FIX(1.961570560) */
+#define FIX_2_053  ((short)16819)  /* FIX(2.053119869) */
+#define FIX_2_562  ((short)20995)  /* FIX(2.562915447) */
+#define FIX_3_072  ((short)25172)  /* FIX(3.072711026) */
+
+enum const_index {
+  index_PW_F130_F054,
+  index_PW_F054_MF130,
+  index_PW_MF078_F117,
+  index_PW_F117_F078,
+  index_PW_MF060_MF089,
+  index_PW_MF089_F060,
+  index_PW_MF050_MF256,
+  index_PW_MF256_F050,
+  index_PD_DESCALE_P1,
+  index_PD_DESCALE_P2,
+  index_PW_DESCALE_P2X
+};
+
+static uint64_t const_value[] = {
+  _uint64_set_pi16(FIX_0_541, (FIX_0_541 + FIX_0_765),
+                   FIX_0_541, (FIX_0_541 + FIX_0_765)),
+  _uint64_set_pi16((FIX_0_541 - FIX_1_847), FIX_0_541,
+                   (FIX_0_541 - FIX_1_847), FIX_0_541),
+  _uint64_set_pi16(FIX_1_175, (FIX_1_175 - FIX_1_961),
+                   FIX_1_175, (FIX_1_175 - FIX_1_961)),
+  _uint64_set_pi16((FIX_1_175 - FIX_0_390), FIX_1_175,
+                   (FIX_1_175 - FIX_0_390), FIX_1_175),
+  _uint64_set_pi16(-FIX_0_899, (FIX_0_298 - FIX_0_899),
+                   -FIX_0_899, (FIX_0_298 - FIX_0_899)),
+  _uint64_set_pi16((FIX_1_501 - FIX_0_899), -FIX_0_899,
+                   (FIX_1_501 - FIX_0_899), -FIX_0_899),
+  _uint64_set_pi16(-FIX_2_562, (FIX_2_053 - FIX_2_562),
+                   -FIX_2_562, (FIX_2_053 - FIX_2_562)),
+  _uint64_set_pi16((FIX_3_072 - FIX_2_562), -FIX_2_562,
+                   (FIX_3_072 - FIX_2_562), -FIX_2_562),
+  _uint64_set_pi32((1 << (DESCALE_P1 - 1)), (1 << (DESCALE_P1 - 1))),
+  _uint64_set_pi32((1 << (DESCALE_P2 - 1)), (1 << (DESCALE_P2 - 1))),
+  _uint64_set_pi16((1 << (PASS1_BITS - 1)), (1 << (PASS1_BITS - 1)),
+                   (1 << (PASS1_BITS - 1)), (1 << (PASS1_BITS - 1)))
+};
+
+#define PW_F130_F054    get_const_value(index_PW_F130_F054)
+#define PW_F054_MF130   get_const_value(index_PW_F054_MF130)
+#define PW_MF078_F117   get_const_value(index_PW_MF078_F117)
+#define PW_F117_F078    get_const_value(index_PW_F117_F078)
+#define PW_MF060_MF089  get_const_value(index_PW_MF060_MF089)
+#define PW_MF089_F060   get_const_value(index_PW_MF089_F060)
+#define PW_MF050_MF256  get_const_value(index_PW_MF050_MF256)
+#define PW_MF256_F050   get_const_value(index_PW_MF256_F050)
+#define PD_DESCALE_P1   get_const_value(index_PD_DESCALE_P1)
+#define PD_DESCALE_P2   get_const_value(index_PD_DESCALE_P2)
+#define PW_DESCALE_P2X  get_const_value(index_PW_DESCALE_P2X)
+
+
+#define DO_FDCT_COMMON(PASS) { \
+  __m64 tmp1312l, tmp1312h, tmp47l, tmp47h, tmp4l, tmp4h, tmp7l, tmp7h; \
+  __m64 tmp56l, tmp56h, tmp5l, tmp5h, tmp6l, tmp6h; \
+  __m64 out1l, out1h, out2l, out2h, out3l, out3h; \
+  __m64 out5l, out5h, out6l, out6h, out7l, out7h; \
+  __m64 z34l, z34h, z3l, z3h, z4l, z4h, z3, z4; \
+  \
+  /* (Original) \
+   * z1 = (tmp12 + tmp13) * 0.541196100; \
+   * out2 = z1 + tmp13 * 0.765366865; \
+   * out6 = z1 + tmp12 * -1.847759065; \
+   * \
+   * (This implementation) \
+   * out2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100; \
+   * out6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065); \
+   */ \
+  \
+  tmp1312l = _mm_unpacklo_pi16(tmp13, tmp12); \
+  tmp1312h = _mm_unpackhi_pi16(tmp13, tmp12); \
+  \
+  out2l = _mm_madd_pi16(tmp1312l, PW_F130_F054); \
+  out2h = _mm_madd_pi16(tmp1312h, PW_F130_F054); \
+  out6l = _mm_madd_pi16(tmp1312l, PW_F054_MF130); \
+  out6h = _mm_madd_pi16(tmp1312h, PW_F054_MF130); \
+  \
+  out2l = _mm_add_pi32(out2l, PD_DESCALE_P##PASS); \
+  out2h = _mm_add_pi32(out2h, PD_DESCALE_P##PASS); \
+  out2l = _mm_srai_pi32(out2l, DESCALE_P##PASS); \
+  out2h = _mm_srai_pi32(out2h, DESCALE_P##PASS); \
+  \
+  out6l = _mm_add_pi32(out6l, PD_DESCALE_P##PASS); \
+  out6h = _mm_add_pi32(out6h, PD_DESCALE_P##PASS); \
+  out6l = _mm_srai_pi32(out6l, DESCALE_P##PASS); \
+  out6h = _mm_srai_pi32(out6h, DESCALE_P##PASS); \
+  \
+  out2 = _mm_packs_pi32(out2l, out2h); \
+  out6 = _mm_packs_pi32(out6l, out6h); \
+  \
+  /* Odd part */ \
+  \
+  z3 = _mm_add_pi16(tmp4, tmp6); \
+  z4 = _mm_add_pi16(tmp5, tmp7); \
+  \
+  /* (Original) \
+   * z5 = (z3 + z4) * 1.175875602; \
+   * z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644; \
+   * z3 += z5;  z4 += z5; \
+   * \
+   * (This implementation) \
+   * z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; \
+   * z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); \
+   */ \
+  \
+  z34l = _mm_unpacklo_pi16(z3, z4); \
+  z34h = _mm_unpackhi_pi16(z3, z4); \
+  z3l = _mm_madd_pi16(z34l, PW_MF078_F117); \
+  z3h = _mm_madd_pi16(z34h, PW_MF078_F117); \
+  z4l = _mm_madd_pi16(z34l, PW_F117_F078); \
+  z4h = _mm_madd_pi16(z34h, PW_F117_F078); \
+  \
+  /* (Original) \
+   * z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6; \
+   * tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869; \
+   * tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110; \
+   * z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447; \
+   * out7 = tmp4 + z1 + z3;  out5 = tmp5 + z2 + z4; \
+   * out3 = tmp6 + z2 + z3;  out1 = tmp7 + z1 + z4; \
+   * \
+   * (This implementation) \
+   * tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223; \
+   * tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447; \
+   * tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447); \
+   * tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223); \
+   * out7 = tmp4 + z3;  out5 = tmp5 + z4; \
+   * out3 = tmp6 + z3;  out1 = tmp7 + z4; \
+   */ \
+  \
+  tmp47l = _mm_unpacklo_pi16(tmp4, tmp7); \
+  tmp47h = _mm_unpackhi_pi16(tmp4, tmp7); \
+  \
+  tmp4l = _mm_madd_pi16(tmp47l, PW_MF060_MF089); \
+  tmp4h = _mm_madd_pi16(tmp47h, PW_MF060_MF089); \
+  tmp7l = _mm_madd_pi16(tmp47l, PW_MF089_F060); \
+  tmp7h = _mm_madd_pi16(tmp47h, PW_MF089_F060); \
+  \
+  out7l = _mm_add_pi32(tmp4l, z3l); \
+  out7h = _mm_add_pi32(tmp4h, z3h); \
+  out1l = _mm_add_pi32(tmp7l, z4l); \
+  out1h = _mm_add_pi32(tmp7h, z4h); \
+  \
+  out7l = _mm_add_pi32(out7l, PD_DESCALE_P##PASS); \
+  out7h = _mm_add_pi32(out7h, PD_DESCALE_P##PASS); \
+  out7l = _mm_srai_pi32(out7l, DESCALE_P##PASS); \
+  out7h = _mm_srai_pi32(out7h, DESCALE_P##PASS); \
+  \
+  out1l = _mm_add_pi32(out1l, PD_DESCALE_P##PASS); \
+  out1h = _mm_add_pi32(out1h, PD_DESCALE_P##PASS); \
+  out1l = _mm_srai_pi32(out1l, DESCALE_P##PASS); \
+  out1h = _mm_srai_pi32(out1h, DESCALE_P##PASS); \
+  \
+  out7 = _mm_packs_pi32(out7l, out7h); \
+  out1 = _mm_packs_pi32(out1l, out1h); \
+  \
+  tmp56l = _mm_unpacklo_pi16(tmp5, tmp6); \
+  tmp56h = _mm_unpackhi_pi16(tmp5, tmp6); \
+  \
+  tmp5l = _mm_madd_pi16(tmp56l, PW_MF050_MF256); \
+  tmp5h = _mm_madd_pi16(tmp56h, PW_MF050_MF256); \
+  tmp6l = _mm_madd_pi16(tmp56l, PW_MF256_F050); \
+  tmp6h = _mm_madd_pi16(tmp56h, PW_MF256_F050); \
+  \
+  out5l = _mm_add_pi32(tmp5l, z4l); \
+  out5h = _mm_add_pi32(tmp5h, z4h); \
+  out3l = _mm_add_pi32(tmp6l, z3l); \
+  out3h = _mm_add_pi32(tmp6h, z3h); \
+  \
+  out5l = _mm_add_pi32(out5l, PD_DESCALE_P##PASS); \
+  out5h = _mm_add_pi32(out5h, PD_DESCALE_P##PASS); \
+  out5l = _mm_srai_pi32(out5l, DESCALE_P##PASS); \
+  out5h = _mm_srai_pi32(out5h, DESCALE_P##PASS); \
+  \
+  out3l = _mm_add_pi32(out3l, PD_DESCALE_P##PASS); \
+  out3h = _mm_add_pi32(out3h, PD_DESCALE_P##PASS); \
+  out3l = _mm_srai_pi32(out3l, DESCALE_P##PASS); \
+  out3h = _mm_srai_pi32(out3h, DESCALE_P##PASS); \
+  \
+  out5 = _mm_packs_pi32(out5l, out5h); \
+  out3 = _mm_packs_pi32(out3l, out3h); \
+}
+
+#define DO_FDCT_PASS1() { \
+  __m64 row0l, row0h, row1l, row1h, row2l, row2h, row3l, row3h; \
+  __m64 row01a, row01b, row01c, row01d, row23a, row23b, row23c, row23d; \
+  __m64 col0, col1, col2, col3, col4, col5, col6, col7; \
+  __m64 tmp10, tmp11; \
+  \
+  row0l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 0]);     /* (00 01 02 03) */ \
+  row0h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 0 + 4]); /* (04 05 06 07) */ \
+  row1l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 1]);     /* (10 11 12 13) */ \
+  row1h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 1 + 4]); /* (14 15 16 17) */ \
+  row2l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 2]);     /* (20 21 22 23) */ \
+  row2h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 2 + 4]); /* (24 25 26 27) */ \
+  row3l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 3]);     /* (30 31 32 33) */ \
+  row3h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 3 + 4]); /* (34 35 36 37) */ \
+  \
+  /* Transpose coefficients */ \
+  \
+  row23a = _mm_unpacklo_pi16(row2l, row3l);   /* row23a=(20 30 21 31) */ \
+  row23b = _mm_unpackhi_pi16(row2l, row3l);   /* row23b=(22 32 23 33) */ \
+  row23c = _mm_unpacklo_pi16(row2h, row3h);   /* row23c=(24 34 25 35) */ \
+  row23d = _mm_unpackhi_pi16(row2h, row3h);   /* row23d=(26 36 27 37) */ \
+  \
+  row01a = _mm_unpacklo_pi16(row0l, row1l);   /* row01a=(00 10 01 11) */ \
+  row01b = _mm_unpackhi_pi16(row0l, row1l);   /* row01b=(02 12 03 13) */ \
+  row01c = _mm_unpacklo_pi16(row0h, row1h);   /* row01c=(04 14 05 15) */ \
+  row01d = _mm_unpackhi_pi16(row0h, row1h);   /* row01d=(06 16 07 17) */ \
+  \
+  col0 = _mm_unpacklo_pi32(row01a, row23a);   /* col0=(00 10 20 30) */ \
+  col1 = _mm_unpackhi_pi32(row01a, row23a);   /* col1=(01 11 21 31) */ \
+  col6 = _mm_unpacklo_pi32(row01d, row23d);   /* col6=(06 16 26 36) */ \
+  col7 = _mm_unpackhi_pi32(row01d, row23d);   /* col7=(07 17 27 37) */ \
+  \
+  tmp6 = _mm_sub_pi16(col1, col6);            /* tmp6=col1-col6 */ \
+  tmp7 = _mm_sub_pi16(col0, col7);            /* tmp7=col0-col7 */ \
+  tmp1 = _mm_add_pi16(col1, col6);            /* tmp1=col1+col6 */ \
+  tmp0 = _mm_add_pi16(col0, col7);            /* tmp0=col0+col7 */ \
+  \
+  col2 = _mm_unpacklo_pi32(row01b, row23b);   /* col2=(02 12 22 32) */ \
+  col3 = _mm_unpackhi_pi32(row01b, row23b);   /* col3=(03 13 23 33) */ \
+  col4 = _mm_unpacklo_pi32(row01c, row23c);   /* col4=(04 14 24 34) */ \
+  col5 = _mm_unpackhi_pi32(row01c, row23c);   /* col5=(05 15 25 35) */ \
+  \
+  tmp3 = _mm_add_pi16(col3, col4);            /* tmp3=col3+col4 */ \
+  tmp2 = _mm_add_pi16(col2, col5);            /* tmp2=col2+col5 */ \
+  tmp4 = _mm_sub_pi16(col3, col4);            /* tmp4=col3-col4 */ \
+  tmp5 = _mm_sub_pi16(col2, col5);            /* tmp5=col2-col5 */ \
+  \
+  /* Even part */ \
+  \
+  tmp10 = _mm_add_pi16(tmp0, tmp3);           /* tmp10=tmp0+tmp3 */ \
+  tmp13 = _mm_sub_pi16(tmp0, tmp3);           /* tmp13=tmp0-tmp3 */ \
+  tmp11 = _mm_add_pi16(tmp1, tmp2);           /* tmp11=tmp1+tmp2 */ \
+  tmp12 = _mm_sub_pi16(tmp1, tmp2);           /* tmp12=tmp1-tmp2 */ \
+  \
+  out0 = _mm_add_pi16(tmp10, tmp11);          /* out0=tmp10+tmp11 */ \
+  out4 = _mm_sub_pi16(tmp10, tmp11);          /* out4=tmp10-tmp11 */ \
+  out0 = _mm_slli_pi16(out0, PASS1_BITS); \
+  out4 = _mm_slli_pi16(out4, PASS1_BITS); \
+  \
+  DO_FDCT_COMMON(1) \
+  \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 0], out0); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 0 + 4], out4); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 1], out1); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 1 + 4], out5); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 2], out2); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 2 + 4], out6); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 3], out3); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 3 + 4], out7); \
+}
+
+#define DO_FDCT_PASS2() { \
+  __m64 col0l, col0h, col1l, col1h, col2l, col2h, col3l, col3h; \
+  __m64 col01a, col01b, col01c, col01d, col23a, col23b, col23c, col23d; \
+  __m64 row0, row1, row2, row3, row4, row5, row6, row7; \
+  __m64 tmp10, tmp11; \
+  \
+  col0l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 0]);  /* (00 10 20 30) */ \
+  col1l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 1]);  /* (01 11 21 31) */ \
+  col2l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 2]);  /* (02 12 22 32) */ \
+  col3l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 3]);  /* (03 13 23 33) */ \
+  col0h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 4]);  /* (40 50 60 70) */ \
+  col1h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 5]);  /* (41 51 61 71) */ \
+  col2h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 6]);  /* (42 52 62 72) */ \
+  col3h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 7]);  /* (43 53 63 73) */ \
+  \
+  /* Transpose coefficients */ \
+  \
+  col23a = _mm_unpacklo_pi16(col2l, col3l);   /* col23a=(02 03 12 13) */ \
+  col23b = _mm_unpackhi_pi16(col2l, col3l);   /* col23b=(22 23 32 33) */ \
+  col23c = _mm_unpacklo_pi16(col2h, col3h);   /* col23c=(42 43 52 53) */ \
+  col23d = _mm_unpackhi_pi16(col2h, col3h);   /* col23d=(62 63 72 73) */ \
+  \
+  col01a = _mm_unpacklo_pi16(col0l, col1l);   /* col01a=(00 01 10 11) */ \
+  col01b = _mm_unpackhi_pi16(col0l, col1l);   /* col01b=(20 21 30 31) */ \
+  col01c = _mm_unpacklo_pi16(col0h, col1h);   /* col01c=(40 41 50 51) */ \
+  col01d = _mm_unpackhi_pi16(col0h, col1h);   /* col01d=(60 61 70 71) */ \
+  \
+  row0 = _mm_unpacklo_pi32(col01a, col23a);   /* row0=(00 01 02 03) */ \
+  row1 = _mm_unpackhi_pi32(col01a, col23a);   /* row1=(10 11 12 13) */ \
+  row6 = _mm_unpacklo_pi32(col01d, col23d);   /* row6=(60 61 62 63) */ \
+  row7 = _mm_unpackhi_pi32(col01d, col23d);   /* row7=(70 71 72 73) */ \
+  \
+  tmp6 = _mm_sub_pi16(row1, row6);            /* tmp6=row1-row6 */ \
+  tmp7 = _mm_sub_pi16(row0, row7);            /* tmp7=row0-row7 */ \
+  tmp1 = _mm_add_pi16(row1, row6);            /* tmp1=row1+row6 */ \
+  tmp0 = _mm_add_pi16(row0, row7);            /* tmp0=row0+row7 */ \
+  \
+  row2 = _mm_unpacklo_pi32(col01b, col23b);   /* row2=(20 21 22 23) */ \
+  row3 = _mm_unpackhi_pi32(col01b, col23b);   /* row3=(30 31 32 33) */ \
+  row4 = _mm_unpacklo_pi32(col01c, col23c);   /* row4=(40 41 42 43) */ \
+  row5 = _mm_unpackhi_pi32(col01c, col23c);   /* row5=(50 51 52 53) */ \
+  \
+  tmp3 = _mm_add_pi16(row3, row4);            /* tmp3=row3+row4 */ \
+  tmp2 = _mm_add_pi16(row2, row5);            /* tmp2=row2+row5 */ \
+  tmp4 = _mm_sub_pi16(row3, row4);            /* tmp4=row3-row4 */ \
+  tmp5 = _mm_sub_pi16(row2, row5);            /* tmp5=row2-row5 */ \
+  \
+  /* Even part */ \
+  \
+  tmp10 = _mm_add_pi16(tmp0, tmp3);           /* tmp10=tmp0+tmp3 */ \
+  tmp13 = _mm_sub_pi16(tmp0, tmp3);           /* tmp13=tmp0-tmp3 */ \
+  tmp11 = _mm_add_pi16(tmp1, tmp2);           /* tmp11=tmp1+tmp2 */ \
+  tmp12 = _mm_sub_pi16(tmp1, tmp2);           /* tmp12=tmp1-tmp2 */ \
+  \
+  out0 = _mm_add_pi16(tmp10, tmp11);          /* out0=tmp10+tmp11 */ \
+  out4 = _mm_sub_pi16(tmp10, tmp11);          /* out4=tmp10-tmp11 */ \
+  \
+  out0 = _mm_add_pi16(out0, PW_DESCALE_P2X); \
+  out4 = _mm_add_pi16(out4, PW_DESCALE_P2X); \
+  out0 = _mm_srai_pi16(out0, PASS1_BITS); \
+  out4 = _mm_srai_pi16(out4, PASS1_BITS); \
+  \
+  DO_FDCT_COMMON(2) \
+  \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 0], out0); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 1], out1); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 2], out2); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 3], out3); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 4], out4); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 5], out5); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 6], out6); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 7], out7); \
+}
+
+void jsimd_fdct_islow_mmi(DCTELEM *data)
+{
+  __m64 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  __m64 out0, out1, out2, out3, out4, out5, out6, out7;
+  __m64 tmp12, tmp13;
+  DCTELEM *dataptr = data;
+
+  /* Pass 1: process rows. */
+
+  DO_FDCT_PASS1()
+  dataptr += DCTSIZE * 4;
+  DO_FDCT_PASS1()
+
+  /* Pass 2: process columns. */
+
+  dataptr = data;
+  DO_FDCT_PASS2()
+  dataptr += 4;
+  DO_FDCT_PASS2()
+}
diff --git a/media/libjpeg/simd/mips64/jidctfst-mmi.c b/media/libjpeg/simd/mips64/jidctfst-mmi.c
new file mode 100644
index 0000000000..503bb35a3c
--- /dev/null
+++ b/media/libjpeg/simd/mips64/jidctfst-mmi.c
@@ -0,0 +1,395 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014-2015, 2018-2019, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
+ *                          All Rights Reserved.
+ * Authors:  LiuQingfa <liuqingfa-hf@loongson.cn>
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* FAST INTEGER INVERSE DCT */
+
+#include "jsimd_mmi.h"
+
+
+#define CONST_BITS  8
+#define PASS1_BITS  2
+
+#define FIX_1_082  ((short)277)                   /* FIX(1.082392200) */
+#define FIX_1_414  ((short)362)                   /* FIX(1.414213562) */
+#define FIX_1_847  ((short)473)                   /* FIX(1.847759065) */
+#define FIX_2_613  ((short)669)                   /* FIX(2.613125930) */
+#define FIX_1_613  ((short)(FIX_2_613 - 256 * 3)) /* FIX(2.613125930) - FIX(1) */
+
+#define PRE_MULTIPLY_SCALE_BITS  2
+#define CONST_SHIFT  (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
+
+enum const_index {
+  index_PW_F1082,
+  index_PW_F1414,
+  index_PW_F1847,
+  index_PW_MF1613,
+  index_PB_CENTERJSAMP
+};
+
+static uint64_t const_value[] = {
+  _uint64_set1_pi16(FIX_1_082 << CONST_SHIFT),
+  _uint64_set1_pi16(FIX_1_414 << CONST_SHIFT),
+  _uint64_set1_pi16(FIX_1_847 << CONST_SHIFT),
+  _uint64_set1_pi16(-FIX_1_613 << CONST_SHIFT),
+  _uint64_set1_pi8(CENTERJSAMPLE)
+};
+
+#define PW_F1414        get_const_value(index_PW_F1414)
+#define PW_F1847        get_const_value(index_PW_F1847)
+#define PW_MF1613       get_const_value(index_PW_MF1613)
+#define PW_F1082        get_const_value(index_PW_F1082)
+#define PB_CENTERJSAMP  get_const_value(index_PB_CENTERJSAMP)
+
+
+#define test_m32_zero(mm32)  (!(*(uint32_t *)&mm32))
+#define test_m64_zero(mm64)  (!(*(uint64_t *)&mm64))
+
+
+#define DO_IDCT_COMMON() { \
+  tmp7 = _mm_add_pi16(z11, z13); \
+  \
+  tmp11 = _mm_sub_pi16(z11, z13); \
+  tmp11 = _mm_slli_pi16(tmp11, PRE_MULTIPLY_SCALE_BITS); \
+  tmp11 = _mm_mulhi_pi16(tmp11, PW_F1414); \
+  \
+  tmp10 = _mm_slli_pi16(z12, PRE_MULTIPLY_SCALE_BITS); \
+  tmp12 = _mm_slli_pi16(z10, PRE_MULTIPLY_SCALE_BITS); \
+  \
+  /* To avoid overflow... \
+   * \
+   * (Original) \
+   * tmp12 = -2.613125930 * z10 + z5; \
+   * \
+   * (This implementation) \
+   * tmp12 = (-1.613125930 - 1) * z10 + z5; \
+   *       = -1.613125930 * z10 - z10 + z5; \
+   */ \
+  \
+  z5 = _mm_add_pi16(tmp10, tmp12); \
+  z5 = _mm_mulhi_pi16(z5, PW_F1847); \
+  \
+  tmp10 = _mm_mulhi_pi16(tmp10, PW_F1082); \
+  tmp10 = _mm_sub_pi16(tmp10, z5); \
+  tmp12 = _mm_mulhi_pi16(tmp12, PW_MF1613); \
+  tmp12 = _mm_sub_pi16(tmp12, z10); \
+  tmp12 = _mm_sub_pi16(tmp12, z10); \
+  tmp12 = _mm_sub_pi16(tmp12, z10); \
+  tmp12 = _mm_add_pi16(tmp12, z5); \
+  \
+  /* Final output stage */ \
+  \
+  tmp6 = _mm_sub_pi16(tmp12, tmp7); \
+  tmp5 = _mm_sub_pi16(tmp11, tmp6); \
+  tmp4 = _mm_add_pi16(tmp10, tmp5); \
+  \
+  out0 = _mm_add_pi16(tmp0, tmp7); \
+  out7 = _mm_sub_pi16(tmp0, tmp7); \
+  out1 = _mm_add_pi16(tmp1, tmp6); \
+  out6 = _mm_sub_pi16(tmp1, tmp6); \
+  \
+  out2 = _mm_add_pi16(tmp2, tmp5); \
+  out5 = _mm_sub_pi16(tmp2, tmp5); \
+  out4 = _mm_add_pi16(tmp3, tmp4); \
+  out3 = _mm_sub_pi16(tmp3, tmp4); \
+}
+
+#define DO_IDCT_PASS1(iter) { \
+  __m64 col0l, col1l, col2l, col3l, col4l, col5l, col6l, col7l; \
+  __m64 quant0l, quant1l, quant2l, quant3l; \
+  __m64 quant4l, quant5l, quant6l, quant7l; \
+  __m64 row01a, row01b, row01c, row01d, row23a, row23b, row23c, row23d; \
+  __m64 row0l, row0h, row1l, row1h, row2l, row2h, row3l, row3h; \
+  __m32 col0a, col1a, mm0; \
+  \
+  col0a = _mm_load_si32((__m32 *)&inptr[DCTSIZE * 1]); \
+  col1a = _mm_load_si32((__m32 *)&inptr[DCTSIZE * 2]); \
+  mm0 = _mm_or_si32(col0a, col1a); \
+  \
+  if (test_m32_zero(mm0)) { \
+    __m64 mm1, mm2; \
+    \
+    col0l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 0]); \
+    col1l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 1]); \
+    col2l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 2]); \
+    col3l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 3]); \
+    col4l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 4]); \
+    col5l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 5]); \
+    col6l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 6]); \
+    col7l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 7]); \
+    \
+    mm1 = _mm_or_si64(col1l, col3l); \
+    mm2 = _mm_or_si64(col2l, col4l); \
+    mm1 = _mm_or_si64(mm1, col5l); \
+    mm2 = _mm_or_si64(mm2, col6l); \
+    mm1 = _mm_or_si64(mm1, col7l); \
+    mm1 = _mm_or_si64(mm1, mm2); \
+    \
+    if (test_m64_zero(mm1)) { \
+      __m64 dcval, dcvall, dcvalh, row0, row1, row2, row3; \
+      \
+      /* AC terms all zero */ \
+      \
+      quant0l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 0]); \
+      \
+      dcval = _mm_mullo_pi16(col0l, quant0l);    /* dcval=(00 10 20 30) */ \
+      \
+      dcvall = _mm_unpacklo_pi16(dcval, dcval);  /* dcvall=(00 00 10 10) */ \
+      dcvalh = _mm_unpackhi_pi16(dcval, dcval);  /* dcvalh=(20 20 30 30) */ \
+      \
+      row0 = _mm_unpacklo_pi32(dcvall, dcvall);  /* row0=(00 00 00 00) */ \
+      row1 = _mm_unpackhi_pi32(dcvall, dcvall);  /* row1=(10 10 10 10) */ \
+      row2 = _mm_unpacklo_pi32(dcvalh, dcvalh);  /* row2=(20 20 20 20) */ \
+      row3 = _mm_unpackhi_pi32(dcvalh, dcvalh);  /* row3=(30 30 30 30) */ \
+      \
+      _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0], row0); \
+      _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0 + 4], row0); \
+      _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 1], row1); \
+      _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 1 + 4], row1); \
+      _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 2], row2); \
+      _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 2 + 4], row2); \
+      _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 3], row3); \
+      _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 3 + 4], row3); \
+      \
+      goto nextcolumn##iter; \
+    } \
+  } \
+  \
+  /* Even part */ \
+  \
+  col0l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 0]);  /* (00 10 20 30) */ \
+  col2l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 2]);  /* (02 12 22 32) */ \
+  col4l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 4]);  /* (04 14 24 34) */ \
+  col6l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 6]);  /* (06 16 26 36) */ \
+  \
+  quant0l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 0]); \
+  quant2l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 2]); \
+  quant4l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 4]); \
+  quant6l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 6]); \
+  \
+  tmp0 = _mm_mullo_pi16(col0l, quant0l); \
+  tmp1 = _mm_mullo_pi16(col2l, quant2l); \
+  tmp2 = _mm_mullo_pi16(col4l, quant4l); \
+  tmp3 = _mm_mullo_pi16(col6l, quant6l); \
+  \
+  tmp10 = _mm_add_pi16(tmp0, tmp2); \
+  tmp11 = _mm_sub_pi16(tmp0, tmp2); \
+  tmp13 = _mm_add_pi16(tmp1, tmp3); \
+  \
+  tmp12 = _mm_sub_pi16(tmp1, tmp3); \
+  tmp12 = _mm_slli_pi16(tmp12, PRE_MULTIPLY_SCALE_BITS); \
+  tmp12 = _mm_mulhi_pi16(tmp12, PW_F1414); \
+  tmp12 = _mm_sub_pi16(tmp12, tmp13); \
+  \
+  tmp0 = _mm_add_pi16(tmp10, tmp13); \
+  tmp3 = _mm_sub_pi16(tmp10, tmp13); \
+  tmp1 = _mm_add_pi16(tmp11, tmp12); \
+  tmp2 = _mm_sub_pi16(tmp11, tmp12); \
+  \
+  /* Odd part */ \
+  \
+  col1l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 1]);  /* (01 11 21 31) */ \
+  col3l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 3]);  /* (03 13 23 33) */ \
+  col5l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 5]);  /* (05 15 25 35) */ \
+  col7l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 7]);  /* (07 17 27 37) */ \
+  \
+  quant1l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 1]); \
+  quant3l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 3]); \
+  quant5l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 5]); \
+  quant7l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 7]); \
+  \
+  tmp4 = _mm_mullo_pi16(col1l, quant1l); \
+  tmp5 = _mm_mullo_pi16(col3l, quant3l); \
+  tmp6 = _mm_mullo_pi16(col5l, quant5l); \
+  tmp7 = _mm_mullo_pi16(col7l, quant7l); \
+  \
+  z13 = _mm_add_pi16(tmp6, tmp5); \
+  z10 = _mm_sub_pi16(tmp6, tmp5); \
+  z11 = _mm_add_pi16(tmp4, tmp7); \
+  z12 = _mm_sub_pi16(tmp4, tmp7); \
+  \
+  DO_IDCT_COMMON() \
+  \
+  /* out0=(00 10 20 30), out1=(01 11 21 31) */ \
+  /* out2=(02 12 22 32), out3=(03 13 23 33) */ \
+  /* out4=(04 14 24 34), out5=(05 15 25 35) */ \
+  /* out6=(06 16 26 36), out7=(07 17 27 37) */ \
+  \
+  /* Transpose coefficients */ \
+  \
+  row01a = _mm_unpacklo_pi16(out0, out1);     /* row01a=(00 01 10 11) */ \
+  row23a = _mm_unpackhi_pi16(out0, out1);     /* row23a=(20 21 30 31) */ \
+  row01d = _mm_unpacklo_pi16(out6, out7);     /* row01d=(06 07 16 17) */ \
+  row23d = _mm_unpackhi_pi16(out6, out7);     /* row23d=(26 27 36 37) */ \
+  \
+  row01b = _mm_unpacklo_pi16(out2, out3);     /* row01b=(02 03 12 13) */ \
+  row23b = _mm_unpackhi_pi16(out2, out3);     /* row23b=(22 23 32 33) */ \
+  row01c = _mm_unpacklo_pi16(out4, out5);     /* row01c=(04 05 14 15) */ \
+  row23c = _mm_unpackhi_pi16(out4, out5);     /* row23c=(24 25 34 35) */ \
+  \
+  row0l = _mm_unpacklo_pi32(row01a, row01b);  /* row0l=(00 01 02 03) */ \
+  row1l = _mm_unpackhi_pi32(row01a, row01b);  /* row1l=(10 11 12 13) */ \
+  row2l = _mm_unpacklo_pi32(row23a, row23b);  /* row2l=(20 21 22 23) */ \
+  row3l = _mm_unpackhi_pi32(row23a, row23b);  /* row3l=(30 31 32 33) */ \
+  \
+  row0h = _mm_unpacklo_pi32(row01c, row01d);  /* row0h=(04 05 06 07) */ \
+  row1h = _mm_unpackhi_pi32(row01c, row01d);  /* row1h=(14 15 16 17) */ \
+  row2h = _mm_unpacklo_pi32(row23c, row23d);  /* row2h=(24 25 26 27) */ \
+  row3h = _mm_unpackhi_pi32(row23c, row23d);  /* row3h=(34 35 36 37) */ \
+  \
+  _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0], row0l); \
+  _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0 + 4], row0h); \
+  _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 1], row1l); \
+  _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 1 + 4], row1h); \
+  _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 2], row2l); \
+  _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 2 + 4], row2h); \
+  _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 3], row3l); \
+  _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 3 + 4], row3h); \
+}
+
+#define DO_IDCT_PASS2(ctr) { \
+  __m64 row0l, row1l, row2l, row3l, row4l, row5l, row6l, row7l; \
+  __m64 col0123a, col0123b, col0123c, col0123d; \
+  __m64 col01l, col01h, col23l, col23h; \
+  __m64 col0, col1, col2, col3; \
+  __m64 row06, row17, row24, row35; \
+  \
+  row0l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 0]);  /* (00 01 02 03) */ \
+  row1l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 1]);  /* (10 11 12 13) */ \
+  row2l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 2]);  /* (20 21 22 23) */ \
+  row3l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 3]);  /* (30 31 32 33) */ \
+  row4l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 4]);  /* (40 41 42 43) */ \
+  row5l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 5]);  /* (50 51 52 53) */ \
+  row6l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 6]);  /* (60 61 62 63) */ \
+  row7l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 7]);  /* (70 71 72 73) */ \
+  \
+  /* Even part */ \
+  \
+  tmp10 = _mm_add_pi16(row0l, row4l); \
+  tmp11 = _mm_sub_pi16(row0l, row4l); \
+  tmp13 = _mm_add_pi16(row2l, row6l); \
+  \
+  tmp12 = _mm_sub_pi16(row2l, row6l); \
+  tmp12 = _mm_slli_pi16(tmp12, PRE_MULTIPLY_SCALE_BITS); \
+  tmp12 = _mm_mulhi_pi16(tmp12, PW_F1414); \
+  tmp12 = _mm_sub_pi16(tmp12, tmp13); \
+  \
+  tmp0 = _mm_add_pi16(tmp10, tmp13); \
+  tmp3 = _mm_sub_pi16(tmp10, tmp13); \
+  tmp1 = _mm_add_pi16(tmp11, tmp12); \
+  tmp2 = _mm_sub_pi16(tmp11, tmp12); \
+  \
+  /* Odd part */ \
+  \
+  z13 = _mm_add_pi16(row5l, row3l); \
+  z10 = _mm_sub_pi16(row5l, row3l); \
+  z11 = _mm_add_pi16(row1l, row7l); \
+  z12 = _mm_sub_pi16(row1l, row7l); \
+  \
+  DO_IDCT_COMMON() \
+  \
+  /* out0=(00 01 02 03), out1=(10 11 12 13) */ \
+  /* out2=(20 21 22 23), out3=(30 31 32 33) */ \
+  /* out4=(40 41 42 43), out5=(50 51 52 53) */ \
+  /* out6=(60 61 62 63), out7=(70 71 72 73) */ \
+  \
+  out0 = _mm_srai_pi16(out0, PASS1_BITS + 3); \
+  out1 = _mm_srai_pi16(out1, PASS1_BITS + 3); \
+  out2 = _mm_srai_pi16(out2, PASS1_BITS + 3); \
+  out3 = _mm_srai_pi16(out3, PASS1_BITS + 3); \
+  out4 = _mm_srai_pi16(out4, PASS1_BITS + 3); \
+  out5 = _mm_srai_pi16(out5, PASS1_BITS + 3); \
+  out6 = _mm_srai_pi16(out6, PASS1_BITS + 3); \
+  out7 = _mm_srai_pi16(out7, PASS1_BITS + 3); \
+  \
+  row06 = _mm_packs_pi16(out0, out6);  /* row06=(00 01 02 03 60 61 62 63) */ \
+  row17 = _mm_packs_pi16(out1, out7);  /* row17=(10 11 12 13 70 71 72 73) */ \
+  row24 = _mm_packs_pi16(out2, out4);  /* row24=(20 21 22 23 40 41 42 43) */ \
+  row35 = _mm_packs_pi16(out3, out5);  /* row35=(30 31 32 33 50 51 52 53) */ \
+  \
+  row06 = _mm_add_pi8(row06, PB_CENTERJSAMP); \
+  row17 = _mm_add_pi8(row17, PB_CENTERJSAMP); \
+  row24 = _mm_add_pi8(row24, PB_CENTERJSAMP); \
+  row35 = _mm_add_pi8(row35, PB_CENTERJSAMP); \
+  \
+  /* Transpose coefficients */ \
+  \
+  col0123a = _mm_unpacklo_pi8(row06, row17);  /* col0123a=(00 10 01 11 02 12 03 13) */ \
+  col0123d = _mm_unpackhi_pi8(row06, row17);  /* col0123d=(60 70 61 71 62 72 63 73) */ \
+  col0123b = _mm_unpacklo_pi8(row24, row35);  /* col0123b=(20 30 21 31 22 32 23 33) */ \
+  col0123c = _mm_unpackhi_pi8(row24, row35);  /* col0123c=(40 50 41 51 42 52 43 53) */ \
+  \
+  col01l = _mm_unpacklo_pi16(col0123a, col0123b);  /* col01l=(00 10 20 30 01 11 21 31) */ \
+  col23l = _mm_unpackhi_pi16(col0123a, col0123b);  /* col23l=(02 12 22 32 03 13 23 33) */ \
+  col01h = _mm_unpacklo_pi16(col0123c, col0123d);  /* col01h=(40 50 60 70 41 51 61 71) */ \
+  col23h = _mm_unpackhi_pi16(col0123c, col0123d);  /* col23h=(42 52 62 72 43 53 63 73) */ \
+  \
+  col0 = _mm_unpacklo_pi32(col01l, col01h);   /* col0=(00 10 20 30 40 50 60 70) */ \
+  col1 = _mm_unpackhi_pi32(col01l, col01h);   /* col1=(01 11 21 31 41 51 61 71) */ \
+  col2 = _mm_unpacklo_pi32(col23l, col23h);   /* col2=(02 12 22 32 42 52 62 72) */ \
+  col3 = _mm_unpackhi_pi32(col23l, col23h);   /* col3=(03 13 23 33 43 53 63 73) */ \
+  \
+  _mm_store_si64((__m64 *)(output_buf[ctr + 0] + output_col), col0); \
+  _mm_store_si64((__m64 *)(output_buf[ctr + 1] + output_col), col1); \
+  _mm_store_si64((__m64 *)(output_buf[ctr + 2] + output_col), col2); \
+  _mm_store_si64((__m64 *)(output_buf[ctr + 3] + output_col), col3); \
+}
+
+void jsimd_idct_ifast_mmi(void *dct_table, JCOEFPTR coef_block,
+                          JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  __m64 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  __m64 tmp10, tmp11, tmp12, tmp13;
+  __m64 out0, out1, out2, out3, out4, out5, out6, out7;
+  __m64 z5, z10, z11, z12, z13;
+  JCOEFPTR inptr;
+  ISLOW_MULT_TYPE *quantptr;
+  JCOEF *wsptr;
+  JCOEF workspace[DCTSIZE2];  /* buffers data between passes */
+
+  /* Pass 1: process columns. */
+
+  inptr = coef_block;
+  quantptr = (ISLOW_MULT_TYPE *)dct_table;
+  wsptr = workspace;
+
+  DO_IDCT_PASS1(1)
+nextcolumn1:
+  inptr += 4;
+  quantptr += 4;
+  wsptr += DCTSIZE * 4;
+  DO_IDCT_PASS1(2)
+nextcolumn2:
+
+  /* Pass 2: process rows. */
+
+  wsptr = workspace;
+
+  DO_IDCT_PASS2(0)
+  wsptr += 4;
+  DO_IDCT_PASS2(4)
+}
diff --git a/media/libjpeg/simd/mips64/jidctint-mmi.c b/media/libjpeg/simd/mips64/jidctint-mmi.c
new file mode 100644
index 0000000000..cd3db980c5
--- /dev/null
+++ b/media/libjpeg/simd/mips64/jidctint-mmi.c
@@ -0,0 +1,571 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014-2015, 2018, 2020, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
+ *                          All Rights Reserved.
+ * Authors:  ZhuChen     <zhuchen@loongson.cn>
+ *           CaiWanwei   <caiwanwei@loongson.cn>
+ *           SunZhangzhi <sunzhangzhi-cq@loongson.cn>
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* ACCUATE INTEGER INVERSE DCT */
+
+#include "jsimd_mmi.h"
+
+
+#define CONST_BITS  13
+#define PASS1_BITS  2
+#define DESCALE_P1  (CONST_BITS - PASS1_BITS)
+#define DESCALE_P2  (CONST_BITS + PASS1_BITS + 3)
+#define CENTERJSAMPLE  128
+
+#define FIX_0_298  ((short)2446)  /* FIX(0.298631336) */
+#define FIX_0_390  ((short)3196)  /* FIX(0.390180644) */
+#define FIX_0_899  ((short)7373)  /* FIX(0.899976223) */
+#define FIX_0_541  ((short)4433)  /* FIX(0.541196100) */
+#define FIX_0_765  ((short)6270)  /* FIX(0.765366865) */
+#define FIX_1_175  ((short)9633)  /* FIX(1.175875602) */
+#define FIX_1_501  ((short)12299) /* FIX(1.501321110) */
+#define FIX_1_847  ((short)15137) /* FIX(1.847759065) */
+#define FIX_1_961  ((short)16069) /* FIX(1.961570560) */
+#define FIX_2_053  ((short)16819) /* FIX(2.053119869) */
+#define FIX_2_562  ((short)20995) /* FIX(2.562915447) */
+#define FIX_3_072  ((short)25172) /* FIX(3.072711026) */
+
+enum const_index {
+  index_PW_F130_F054,
+  index_PW_F054_MF130,
+  index_PW_MF078_F117,
+  index_PW_F117_F078,
+  index_PW_MF060_MF089,
+  index_PW_MF089_F060,
+  index_PW_MF050_MF256,
+  index_PW_MF256_F050,
+  index_PD_DESCALE_P1,
+  index_PD_DESCALE_P2,
+  index_PB_CENTERJSAMP
+};
+
+static uint64_t const_value[] = {
+  _uint64_set_pi16(FIX_0_541, (FIX_0_541 + FIX_0_765),
+                   FIX_0_541, (FIX_0_541 + FIX_0_765)),
+  _uint64_set_pi16((FIX_0_541 - FIX_1_847), FIX_0_541,
+                   (FIX_0_541 - FIX_1_847), FIX_0_541),
+  _uint64_set_pi16(FIX_1_175, (FIX_1_175 - FIX_1_961),
+                   FIX_1_175, (FIX_1_175 - FIX_1_961)),
+  _uint64_set_pi16((FIX_1_175 - FIX_0_390), FIX_1_175,
+                   (FIX_1_175 - FIX_0_390), FIX_1_175),
+  _uint64_set_pi16(-FIX_0_899, (FIX_0_298 - FIX_0_899),
+                   -FIX_0_899, (FIX_0_298 - FIX_0_899)),
+  _uint64_set_pi16((FIX_1_501 - FIX_0_899), -FIX_0_899,
+                   (FIX_1_501 - FIX_0_899), -FIX_0_899),
+  _uint64_set_pi16(-FIX_2_562, (FIX_2_053 - FIX_2_562),
+                   -FIX_2_562, (FIX_2_053 - FIX_2_562)),
+  _uint64_set_pi16((FIX_3_072 - FIX_2_562), -FIX_2_562,
+                   (FIX_3_072 - FIX_2_562), -FIX_2_562),
+  _uint64_set_pi32((1 << (DESCALE_P1 - 1)), (1 << (DESCALE_P1 - 1))),
+  _uint64_set_pi32((1 << (DESCALE_P2 - 1)), (1 << (DESCALE_P2 - 1))),
+  _uint64_set_pi8(CENTERJSAMPLE, CENTERJSAMPLE, CENTERJSAMPLE, CENTERJSAMPLE,
+                  CENTERJSAMPLE, CENTERJSAMPLE, CENTERJSAMPLE, CENTERJSAMPLE)
+};
+
+#define PW_F130_F054    get_const_value(index_PW_F130_F054)
+#define PW_F054_MF130   get_const_value(index_PW_F054_MF130)
+#define PW_MF078_F117   get_const_value(index_PW_MF078_F117)
+#define PW_F117_F078    get_const_value(index_PW_F117_F078)
+#define PW_MF060_MF089  get_const_value(index_PW_MF060_MF089)
+#define PW_MF089_F060   get_const_value(index_PW_MF089_F060)
+#define PW_MF050_MF256  get_const_value(index_PW_MF050_MF256)
+#define PW_MF256_F050   get_const_value(index_PW_MF256_F050)
+#define PD_DESCALE_P1   get_const_value(index_PD_DESCALE_P1)
+#define PD_DESCALE_P2   get_const_value(index_PD_DESCALE_P2)
+#define PB_CENTERJSAMP  get_const_value(index_PB_CENTERJSAMP)
+
+
+#define test_m32_zero(mm32)  (!(*(uint32_t *)&mm32))
+#define test_m64_zero(mm64)  (!(*(uint64_t *)&mm64))
+
+
+#define DO_IDCT_COMMON(PASS) { \
+  __m64 tmp0_3l, tmp0_3h, tmp1_2l, tmp1_2h; \
+  __m64 tmp0l, tmp0h, tmp1l, tmp1h, tmp2l, tmp2h, tmp3l, tmp3h; \
+  __m64 z34l, z34h, z3l, z3h, z4l, z4h, z3, z4; \
+  __m64 out0l, out0h, out1l, out1h, out2l, out2h, out3l, out3h; \
+  __m64 out4l, out4h, out5l, out5h, out6l, out6h, out7l, out7h; \
+  \
+  z3 = _mm_add_pi16(tmp0, tmp2); \
+  z4 = _mm_add_pi16(tmp1, tmp3); \
+  \
+  /* (Original) \
+   * z5 = (z3 + z4) * 1.175875602; \
+   * z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644; \
+   * z3 += z5;  z4 += z5; \
+   * \
+   * (This implementation) \
+   * z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; \
+   * z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); \
+   */ \
+  \
+  z34l = _mm_unpacklo_pi16(z3, z4); \
+  z34h = _mm_unpackhi_pi16(z3, z4); \
+  z3l = _mm_madd_pi16(z34l, PW_MF078_F117); \
+  z3h = _mm_madd_pi16(z34h, PW_MF078_F117); \
+  z4l = _mm_madd_pi16(z34l, PW_F117_F078); \
+  z4h = _mm_madd_pi16(z34h, PW_F117_F078); \
+  \
+  /* (Original) \
+   * z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2; \
+   * tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869; \
+   * tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110; \
+   * z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447; \
+   * tmp0 += z1 + z3;  tmp1 += z2 + z4; \
+   * tmp2 += z2 + z3;  tmp3 += z1 + z4; \
+   * \
+   * (This implementation) \
+   * tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223; \
+   * tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447; \
+   * tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447); \
+   * tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223); \
+   * tmp0 += z3;  tmp1 += z4; \
+   * tmp2 += z3;  tmp3 += z4; \
+   */ \
+  \
+  tmp0_3l = _mm_unpacklo_pi16(tmp0, tmp3); \
+  tmp0_3h = _mm_unpackhi_pi16(tmp0, tmp3); \
+  \
+  tmp0l = _mm_madd_pi16(tmp0_3l, PW_MF060_MF089); \
+  tmp0h = _mm_madd_pi16(tmp0_3h, PW_MF060_MF089); \
+  tmp3l = _mm_madd_pi16(tmp0_3l, PW_MF089_F060); \
+  tmp3h = _mm_madd_pi16(tmp0_3h, PW_MF089_F060); \
+  \
+  tmp0l = _mm_add_pi32(tmp0l, z3l); \
+  tmp0h = _mm_add_pi32(tmp0h, z3h); \
+  tmp3l = _mm_add_pi32(tmp3l, z4l); \
+  tmp3h = _mm_add_pi32(tmp3h, z4h); \
+  \
+  tmp1_2l = _mm_unpacklo_pi16(tmp1, tmp2); \
+  tmp1_2h = _mm_unpackhi_pi16(tmp1, tmp2); \
+  \
+  tmp1l = _mm_madd_pi16(tmp1_2l, PW_MF050_MF256); \
+  tmp1h = _mm_madd_pi16(tmp1_2h, PW_MF050_MF256); \
+  tmp2l = _mm_madd_pi16(tmp1_2l, PW_MF256_F050); \
+  tmp2h = _mm_madd_pi16(tmp1_2h, PW_MF256_F050); \
+  \
+  tmp1l = _mm_add_pi32(tmp1l, z4l); \
+  tmp1h = _mm_add_pi32(tmp1h, z4h); \
+  tmp2l = _mm_add_pi32(tmp2l, z3l); \
+  tmp2h = _mm_add_pi32(tmp2h, z3h); \
+  \
+  /* Final output stage */ \
+  \
+  out0l = _mm_add_pi32(tmp10l, tmp3l); \
+  out0h = _mm_add_pi32(tmp10h, tmp3h); \
+  out7l = _mm_sub_pi32(tmp10l, tmp3l); \
+  out7h = _mm_sub_pi32(tmp10h, tmp3h); \
+  \
+  out0l = _mm_add_pi32(out0l, PD_DESCALE_P##PASS); \
+  out0h = _mm_add_pi32(out0h, PD_DESCALE_P##PASS); \
+  out0l = _mm_srai_pi32(out0l, DESCALE_P##PASS); \
+  out0h = _mm_srai_pi32(out0h, DESCALE_P##PASS); \
+  \
+  out7l = _mm_add_pi32(out7l, PD_DESCALE_P##PASS); \
+  out7h = _mm_add_pi32(out7h, PD_DESCALE_P##PASS); \
+  out7l = _mm_srai_pi32(out7l, DESCALE_P##PASS); \
+  out7h = _mm_srai_pi32(out7h, DESCALE_P##PASS); \
+  \
+  out0 = _mm_packs_pi32(out0l, out0h); \
+  out7 = _mm_packs_pi32(out7l, out7h); \
+  \
+  out1l = _mm_add_pi32(tmp11l, tmp2l); \
+  out1h = _mm_add_pi32(tmp11h, tmp2h); \
+  out6l = _mm_sub_pi32(tmp11l, tmp2l); \
+  out6h = _mm_sub_pi32(tmp11h, tmp2h); \
+  \
+  out1l = _mm_add_pi32(out1l, PD_DESCALE_P##PASS); \
+  out1h = _mm_add_pi32(out1h, PD_DESCALE_P##PASS); \
+  out1l = _mm_srai_pi32(out1l, DESCALE_P##PASS); \
+  out1h = _mm_srai_pi32(out1h, DESCALE_P##PASS); \
+  \
+  out6l = _mm_add_pi32(out6l, PD_DESCALE_P##PASS); \
+  out6h = _mm_add_pi32(out6h, PD_DESCALE_P##PASS); \
+  out6l = _mm_srai_pi32(out6l, DESCALE_P##PASS); \
+  out6h = _mm_srai_pi32(out6h, DESCALE_P##PASS); \
+  \
+  out1 = _mm_packs_pi32(out1l, out1h); \
+  out6 = _mm_packs_pi32(out6l, out6h); \
+  \
+  out2l = _mm_add_pi32(tmp12l, tmp1l); \
+  out2h = _mm_add_pi32(tmp12h, tmp1h); \
+  out5l = _mm_sub_pi32(tmp12l, tmp1l); \
+  out5h = _mm_sub_pi32(tmp12h, tmp1h); \
+  \
+  out2l = _mm_add_pi32(out2l, PD_DESCALE_P##PASS); \
+  out2h = _mm_add_pi32(out2h, PD_DESCALE_P##PASS); \
+  out2l = _mm_srai_pi32(out2l, DESCALE_P##PASS); \
+  out2h = _mm_srai_pi32(out2h, DESCALE_P##PASS); \
+  \
+  out5l = _mm_add_pi32(out5l, PD_DESCALE_P##PASS); \
+  out5h = _mm_add_pi32(out5h, PD_DESCALE_P##PASS); \
+  out5l = _mm_srai_pi32(out5l, DESCALE_P##PASS); \
+  out5h = _mm_srai_pi32(out5h, DESCALE_P##PASS); \
+  \
+  out2 = _mm_packs_pi32(out2l, out2h); \
+  out5 = _mm_packs_pi32(out5l, out5h); \
+  \
+  out3l = _mm_add_pi32(tmp13l, tmp0l); \
+  out3h = _mm_add_pi32(tmp13h, tmp0h); \
+  \
+  out4l = _mm_sub_pi32(tmp13l, tmp0l); \
+  out4h = _mm_sub_pi32(tmp13h, tmp0h); \
+  \
+  out3l = _mm_add_pi32(out3l, PD_DESCALE_P##PASS); \
+  out3h = _mm_add_pi32(out3h, PD_DESCALE_P##PASS); \
+  out3l = _mm_srai_pi32(out3l, DESCALE_P##PASS); \
+  out3h = _mm_srai_pi32(out3h, DESCALE_P##PASS); \
+  \
+  out4l = _mm_add_pi32(out4l, PD_DESCALE_P##PASS); \
+  out4h = _mm_add_pi32(out4h, PD_DESCALE_P##PASS); \
+  out4l = _mm_srai_pi32(out4l, DESCALE_P##PASS); \
+  out4h = _mm_srai_pi32(out4h, DESCALE_P##PASS); \
+  \
+  out3 = _mm_packs_pi32(out3l, out3h); \
+  out4 = _mm_packs_pi32(out4l, out4h); \
+}
+
+#define DO_IDCT_PASS1(iter) { \
+  __m64 col0l, col1l, col2l, col3l, col4l, col5l, col6l, col7l; \
+  __m64 quant0l, quant1l, quant2l, quant3l; \
+  __m64 quant4l, quant5l, quant6l, quant7l; \
+  __m64 z23, z2, z3, z23l, z23h; \
+  __m64 row01a, row01b, row01c, row01d, row23a, row23b, row23c, row23d; \
+  __m64 row0l, row0h, row1l, row1h, row2l, row2h, row3l, row3h; \
+  __m64 tmp0l, tmp0h, tmp1l, tmp1h, tmp2l, tmp2h, tmp3l, tmp3h; \
+  __m64 tmp10l, tmp10h, tmp11l, tmp11h, tmp12l, tmp12h, tmp13l, tmp13h; \
+  __m32 col0a, col1a, mm0; \
+  \
+  col0a = _mm_load_si32((__m32 *)&inptr[DCTSIZE * 1]); \
+  col1a = _mm_load_si32((__m32 *)&inptr[DCTSIZE * 2]); \
+  mm0 = _mm_or_si32(col0a, col1a); \
+  \
+  if (test_m32_zero(mm0)) { \
+    __m64 mm1, mm2; \
+    \
+    col0l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 0]); \
+    col1l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 1]); \
+    col2l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 2]); \
+    col3l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 3]); \
+    col4l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 4]); \
+    col5l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 5]); \
+    col6l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 6]); \
+    col7l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 7]); \
+    \
+    mm1 = _mm_or_si64(col1l, col3l); \
+    mm2 = _mm_or_si64(col2l, col4l); \
+    mm1 = _mm_or_si64(mm1, col5l); \
+    mm2 = _mm_or_si64(mm2, col6l); \
+    mm1 = _mm_or_si64(mm1, col7l); \
+    mm1 = _mm_or_si64(mm1, mm2); \
+    \
+    if (test_m64_zero(mm1)) { \
+      __m64 dcval, dcvall, dcvalh, row0, row1, row2, row3; \
+      \
+      /* AC terms all zero */ \
+      \
+      quant0l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 0]); \
+      \
+      dcval = _mm_mullo_pi16(col0l, quant0l); \
+      dcval = _mm_slli_pi16(dcval, PASS1_BITS);  /* dcval=(00 10 20 30) */ \
+      \
+      dcvall = _mm_unpacklo_pi16(dcval, dcval);  /* dcvall=(00 00 10 10) */ \
+      dcvalh = _mm_unpackhi_pi16(dcval, dcval);  /* dcvalh=(20 20 30 30) */ \
+      \
+      row0 = _mm_unpacklo_pi32(dcvall, dcvall);  /* row0=(00 00 00 00) */ \
+      row1 = _mm_unpackhi_pi32(dcvall, dcvall);  /* row1=(10 10 10 10) */ \
+      row2 = _mm_unpacklo_pi32(dcvalh, dcvalh);  /* row2=(20 20 20 20) */ \
+      row3 = _mm_unpackhi_pi32(dcvalh, dcvalh);  /* row3=(30 30 30 30) */ \
+      \
+      _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0], row0); \
+      _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0 + 4], row0); \
+      _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 1], row1); \
+      _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 1 + 4], row1); \
+      _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 2], row2); \
+      _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 2 + 4], row2); \
+      _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 3], row3); \
+      _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 3 + 4], row3); \
+      \
+      goto nextcolumn##iter; \
+    } \
+  } \
+  \
+  /* Even part \
+   * \
+   * (Original) \
+   * z1 = (z2 + z3) * 0.541196100; \
+   * tmp2 = z1 + z3 * -1.847759065; \
+   * tmp3 = z1 + z2 * 0.765366865; \
+   * \
+   * (This implementation) \
+   * tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); \
+   * tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; \
+   */ \
+  \
+  col0l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 0]);  /* (00 10 20 30) */ \
+  col2l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 2]);  /* (02 12 22 32) */ \
+  col4l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 4]);  /* (04 14 24 34) */ \
+  col6l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 6]);  /* (06 16 26 36) */ \
+  \
+  quant0l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 0]); \
+  quant2l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 2]); \
+  quant4l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 4]); \
+  quant6l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 6]); \
+  \
+  z2 = _mm_mullo_pi16(col2l, quant2l); \
+  z3 = _mm_mullo_pi16(col6l, quant6l); \
+  \
+  z23l = _mm_unpacklo_pi16(z2, z3); \
+  z23h = _mm_unpackhi_pi16(z2, z3); \
+  tmp3l = _mm_madd_pi16(z23l, PW_F130_F054); \
+  tmp3h = _mm_madd_pi16(z23h, PW_F130_F054); \
+  tmp2l = _mm_madd_pi16(z23l, PW_F054_MF130); \
+  tmp2h = _mm_madd_pi16(z23h, PW_F054_MF130); \
+  \
+  z2 = _mm_mullo_pi16(col0l, quant0l); \
+  z3 = _mm_mullo_pi16(col4l, quant4l); \
+  \
+  z23 = _mm_add_pi16(z2, z3); \
+  tmp0l = _mm_loadlo_pi16_f(z23); \
+  tmp0h = _mm_loadhi_pi16_f(z23); \
+  tmp0l = _mm_srai_pi32(tmp0l, (16 - CONST_BITS)); \
+  tmp0h = _mm_srai_pi32(tmp0h, (16 - CONST_BITS)); \
+  \
+  tmp10l = _mm_add_pi32(tmp0l, tmp3l); \
+  tmp10h = _mm_add_pi32(tmp0h, tmp3h); \
+  tmp13l = _mm_sub_pi32(tmp0l, tmp3l); \
+  tmp13h = _mm_sub_pi32(tmp0h, tmp3h); \
+  \
+  z23 = _mm_sub_pi16(z2, z3); \
+  tmp1l = _mm_loadlo_pi16_f(z23); \
+  tmp1h = _mm_loadhi_pi16_f(z23); \
+  tmp1l = _mm_srai_pi32(tmp1l, (16 - CONST_BITS)); \
+  tmp1h = _mm_srai_pi32(tmp1h, (16 - CONST_BITS)); \
+  \
+  tmp11l = _mm_add_pi32(tmp1l, tmp2l); \
+  tmp11h = _mm_add_pi32(tmp1h, tmp2h); \
+  tmp12l = _mm_sub_pi32(tmp1l, tmp2l); \
+  tmp12h = _mm_sub_pi32(tmp1h, tmp2h); \
+  \
+  /* Odd part */ \
+  \
+  col1l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 1]);  /* (01 11 21 31) */ \
+  col3l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 3]);  /* (03 13 23 33) */ \
+  col5l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 5]);  /* (05 15 25 35) */ \
+  col7l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 7]);  /* (07 17 27 37) */ \
+  \
+  quant1l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 1]); \
+  quant3l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 3]); \
+  quant5l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 5]); \
+  quant7l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 7]); \
+  \
+  tmp0 = _mm_mullo_pi16(col7l, quant7l); \
+  tmp1 = _mm_mullo_pi16(col5l, quant5l); \
+  tmp2 = _mm_mullo_pi16(col3l, quant3l); \
+  tmp3 = _mm_mullo_pi16(col1l, quant1l); \
+  \
+  DO_IDCT_COMMON(1) \
+  \
+  /* out0=(00 10 20 30), out1=(01 11 21 31) */ \
+  /* out2=(02 12 22 32), out3=(03 13 23 33) */ \
+  /* out4=(04 14 24 34), out5=(05 15 25 35) */ \
+  /* out6=(06 16 26 36), out7=(07 17 27 37) */ \
+  \
+  /* Transpose coefficients */ \
+  \
+  row01a = _mm_unpacklo_pi16(out0, out1);     /* row01a=(00 01 10 11) */ \
+  row23a = _mm_unpackhi_pi16(out0, out1);     /* row23a=(20 21 30 31) */ \
+  row01d = _mm_unpacklo_pi16(out6, out7);     /* row01d=(06 07 16 17) */ \
+  row23d = _mm_unpackhi_pi16(out6, out7);     /* row23d=(26 27 36 37) */ \
+  \
+  row01b = _mm_unpacklo_pi16(out2, out3);     /* row01b=(02 03 12 13) */ \
+  row23b = _mm_unpackhi_pi16(out2, out3);     /* row23b=(22 23 32 33) */ \
+  row01c = _mm_unpacklo_pi16(out4, out5);     /* row01c=(04 05 14 15) */ \
+  row23c = _mm_unpackhi_pi16(out4, out5);     /* row23c=(24 25 34 35) */ \
+  \
+  row0l = _mm_unpacklo_pi32(row01a, row01b);  /* row0l=(00 01 02 03) */ \
+  row1l = _mm_unpackhi_pi32(row01a, row01b);  /* row1l=(10 11 12 13) */ \
+  row2l = _mm_unpacklo_pi32(row23a, row23b);  /* row2l=(20 21 22 23) */ \
+  row3l = _mm_unpackhi_pi32(row23a, row23b);  /* row3l=(30 31 32 33) */ \
+  \
+  row0h = _mm_unpacklo_pi32(row01c, row01d);  /* row0h=(04 05 06 07) */ \
+  row1h = _mm_unpackhi_pi32(row01c, row01d);  /* row1h=(14 15 16 17) */ \
+  row2h = _mm_unpacklo_pi32(row23c, row23d);  /* row2h=(24 25 26 27) */ \
+  row3h = _mm_unpackhi_pi32(row23c, row23d);  /* row3h=(34 35 36 37) */ \
+  \
+  _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0], row0l); \
+  _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0 + 4], row0h); \
+  _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 1], row1l); \
+  _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 1 + 4], row1h); \
+  _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 2], row2l); \
+  _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 2 + 4], row2h); \
+  _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 3], row3l); \
+  _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 3 + 4], row3h); \
+}
+
+#define DO_IDCT_PASS2(ctr) { \
+  __m64 row0l, row1l, row2l, row3l, row4l, row5l, row6l, row7l; \
+  __m64 z23, z23l, z23h; \
+  __m64 col0123a, col0123b, col0123c, col0123d; \
+  __m64 col01l, col01h, col23l, col23h, row06, row17, row24, row35; \
+  __m64 col0, col1, col2, col3; \
+  __m64 tmp0l, tmp0h, tmp1l, tmp1h, tmp2l, tmp2h, tmp3l, tmp3h; \
+  __m64 tmp10l, tmp10h, tmp11l, tmp11h, tmp12l, tmp12h, tmp13l, tmp13h; \
+  \
+  row0l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 0]);  /* (00 01 02 03) */ \
+  row1l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 1]);  /* (10 11 12 13) */ \
+  row2l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 2]);  /* (20 21 22 23) */ \
+  row3l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 3]);  /* (30 31 32 33) */ \
+  row4l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 4]);  /* (40 41 42 43) */ \
+  row5l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 5]);  /* (50 51 52 53) */ \
+  row6l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 6]);  /* (60 61 62 63) */ \
+  row7l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 7]);  /* (70 71 72 73) */ \
+  \
+  /* Even part \
+   * \
+   * (Original) \
+   * z1 = (z2 + z3) * 0.541196100; \
+   * tmp2 = z1 + z3 * -1.847759065; \
+   * tmp3 = z1 + z2 * 0.765366865; \
+   * \
+   * (This implementation) \
+   * tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); \
+   * tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; \
+   */ \
+  \
+  z23l = _mm_unpacklo_pi16(row2l, row6l); \
+  z23h = _mm_unpackhi_pi16(row2l, row6l); \
+  \
+  tmp3l = _mm_madd_pi16(z23l, PW_F130_F054); \
+  tmp3h = _mm_madd_pi16(z23h, PW_F130_F054); \
+  tmp2l = _mm_madd_pi16(z23l, PW_F054_MF130); \
+  tmp2h = _mm_madd_pi16(z23h, PW_F054_MF130); \
+  \
+  z23 = _mm_add_pi16(row0l, row4l); \
+  tmp0l = _mm_loadlo_pi16_f(z23); \
+  tmp0h = _mm_loadhi_pi16_f(z23); \
+  tmp0l = _mm_srai_pi32(tmp0l, (16 - CONST_BITS)); \
+  tmp0h = _mm_srai_pi32(tmp0h, (16 - CONST_BITS)); \
+  \
+  tmp10l = _mm_add_pi32(tmp0l, tmp3l); \
+  tmp10h = _mm_add_pi32(tmp0h, tmp3h); \
+  tmp13l = _mm_sub_pi32(tmp0l, tmp3l); \
+  tmp13h = _mm_sub_pi32(tmp0h, tmp3h); \
+  \
+  z23 = _mm_sub_pi16(row0l, row4l); \
+  tmp1l = _mm_loadlo_pi16_f(z23); \
+  tmp1h = _mm_loadhi_pi16_f(z23); \
+  tmp1l = _mm_srai_pi32(tmp1l, (16 - CONST_BITS)); \
+  tmp1h = _mm_srai_pi32(tmp1h, (16 - CONST_BITS)); \
+  \
+  tmp11l = _mm_add_pi32(tmp1l, tmp2l); \
+  tmp11h = _mm_add_pi32(tmp1h, tmp2h); \
+  tmp12l = _mm_sub_pi32(tmp1l, tmp2l); \
+  tmp12h = _mm_sub_pi32(tmp1h, tmp2h); \
+  \
+  /* Odd part */ \
+  \
+  tmp0 = row7l; \
+  tmp1 = row5l; \
+  tmp2 = row3l; \
+  tmp3 = row1l; \
+  \
+  DO_IDCT_COMMON(2) \
+  \
+  /* out0=(00 01 02 03), out1=(10 11 12 13) */ \
+  /* out2=(20 21 22 23), out3=(30 31 32 33) */ \
+  /* out4=(40 41 42 43), out5=(50 51 52 53) */ \
+  /* out6=(60 61 62 63), out7=(70 71 72 73) */ \
+  \
+  row06 = _mm_packs_pi16(out0, out6);  /* row06=(00 01 02 03 60 61 62 63) */ \
+  row17 = _mm_packs_pi16(out1, out7);  /* row17=(10 11 12 13 70 71 72 73) */ \
+  row24 = _mm_packs_pi16(out2, out4);  /* row24=(20 21 22 23 40 41 42 43) */ \
+  row35 = _mm_packs_pi16(out3, out5);  /* row35=(30 31 32 33 50 51 52 53) */ \
+  \
+  row06 = _mm_add_pi8(row06, PB_CENTERJSAMP); \
+  row17 = _mm_add_pi8(row17, PB_CENTERJSAMP); \
+  row24 = _mm_add_pi8(row24, PB_CENTERJSAMP); \
+  row35 = _mm_add_pi8(row35, PB_CENTERJSAMP); \
+  \
+  /* Transpose coefficients */ \
+  \
+  col0123a = _mm_unpacklo_pi8(row06, row17);  /* col0123a=(00 10 01 11 02 12 03 13) */ \
+  col0123d = _mm_unpackhi_pi8(row06, row17);  /* col0123d=(60 70 61 71 62 72 63 73) */ \
+  col0123b = _mm_unpacklo_pi8(row24, row35);  /* col0123b=(20 30 21 31 22 32 23 33) */ \
+  col0123c = _mm_unpackhi_pi8(row24, row35);  /* col0123c=(40 50 41 51 42 52 43 53) */ \
+  \
+  col01l = _mm_unpacklo_pi16(col0123a, col0123b);  /* col01l=(00 10 20 30 01 11 21 31) */ \
+  col23l = _mm_unpackhi_pi16(col0123a, col0123b);  /* col23l=(02 12 22 32 03 13 23 33) */ \
+  col01h = _mm_unpacklo_pi16(col0123c, col0123d);  /* col01h=(40 50 60 70 41 51 61 71) */ \
+  col23h = _mm_unpackhi_pi16(col0123c, col0123d);  /* col23h=(42 52 62 72 43 53 63 73) */ \
+  \
+  col0 = _mm_unpacklo_pi32(col01l, col01h);   /* col0=(00 10 20 30 40 50 60 70) */ \
+  col1 = _mm_unpackhi_pi32(col01l, col01h);   /* col1=(01 11 21 31 41 51 61 71) */ \
+  col2 = _mm_unpacklo_pi32(col23l, col23h);   /* col2=(02 12 22 32 42 52 62 72) */ \
+  col3 = _mm_unpackhi_pi32(col23l, col23h);   /* col3=(03 13 23 33 43 53 63 73) */ \
+  \
+  _mm_store_si64((__m64 *)(output_buf[ctr + 0] + output_col), col0); \
+  _mm_store_si64((__m64 *)(output_buf[ctr + 1] + output_col), col1); \
+  _mm_store_si64((__m64 *)(output_buf[ctr + 2] + output_col), col2); \
+  _mm_store_si64((__m64 *)(output_buf[ctr + 3] + output_col), col3); \
+}
+
+void jsimd_idct_islow_mmi(void *dct_table, JCOEFPTR coef_block,
+                          JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  __m64 tmp0, tmp1, tmp2, tmp3;
+  __m64 out0, out1, out2, out3, out4, out5, out6, out7;
+  JCOEFPTR inptr;
+  ISLOW_MULT_TYPE *quantptr;
+  JCOEF *wsptr;
+  JCOEF workspace[DCTSIZE2];  /* buffers data between passes */
+
+  /* Pass 1: process columns. */
+
+  inptr = coef_block;
+  quantptr = (ISLOW_MULT_TYPE *)dct_table;
+  wsptr = workspace;
+
+  DO_IDCT_PASS1(1)
+nextcolumn1:
+  inptr += 4;
+  quantptr += 4;
+  wsptr += DCTSIZE * 4;
+  DO_IDCT_PASS1(2)
+nextcolumn2:
+
+  /* Pass 2: process rows. */
+
+  wsptr = workspace;
+
+  DO_IDCT_PASS2(0)
+  wsptr += 4;
+  DO_IDCT_PASS2(4)
+}
diff --git a/media/libjpeg/simd/mips64/jquanti-mmi.c b/media/libjpeg/simd/mips64/jquanti-mmi.c
new file mode 100644
index 0000000000..339002fd80
--- /dev/null
+++ b/media/libjpeg/simd/mips64/jquanti-mmi.c
@@ -0,0 +1,124 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
+ *                          All Rights Reserved.
+ * Authors:  ZhuChen     <zhuchen@loongson.cn>
+ *           CaiWanwei   <caiwanwei@loongson.cn>
+ *           SunZhangzhi <sunzhangzhi-cq@loongson.cn>
+ * Copyright (C) 2018-2019, D. R. Commander.  All Rights Reserved.
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* INTEGER QUANTIZATION AND SAMPLE CONVERSION */
+
+#include "jsimd_mmi.h"
+
+
+#define DO_QUANT() { \
+  __m64 rowl, rowh, rowls, rowhs, rowlsave, rowhsave; \
+  __m64 corrl, corrh, recipl, reciph, scalel, scaleh; \
+  \
+  rowl = _mm_load_si64((__m64 *)&workspace[0]); \
+  rowh = _mm_load_si64((__m64 *)&workspace[4]); \
+  \
+  /* Branch-less absolute value */ \
+  rowls = _mm_srai_pi16(rowl, (WORD_BIT - 1));  /* -1 if value < 0, */ \
+                                                /* 0 otherwise */ \
+  rowhs = _mm_srai_pi16(rowh, (WORD_BIT - 1)); \
+  \
+  rowl = _mm_xor_si64(rowl, rowls);           /* val = -val */ \
+  rowh = _mm_xor_si64(rowh, rowhs); \
+  rowl = _mm_sub_pi16(rowl, rowls); \
+  rowh = _mm_sub_pi16(rowh, rowhs); \
+  \
+  corrl = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 1]);  /* correction */ \
+  corrh = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 1 + 4]); \
+  \
+  rowlsave = rowl = _mm_add_pi16(rowl, corrl);  /* correction + roundfactor */ \
+  rowhsave = rowh = _mm_add_pi16(rowh, corrh); \
+  \
+  recipl = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 0]);  /* reciprocal */ \
+  reciph = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 0 + 4]); \
+  \
+  rowl = _mm_mulhi_pi16(rowl, recipl); \
+  rowh = _mm_mulhi_pi16(rowh, reciph); \
+  \
+  /* reciprocal is always negative (MSB=1), so we always need to add the */ \
+  /* initial value (input value is never negative as we inverted it at the */ \
+  /* start of this routine) */ \
+  rowlsave = rowl = _mm_add_pi16(rowl, rowlsave); \
+  rowhsave = rowh = _mm_add_pi16(rowh, rowhsave); \
+  \
+  scalel = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 2]);  /* scale */ \
+  scaleh = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 2 + 4]); \
+  \
+  rowl = _mm_mulhi_pi16(rowl, scalel); \
+  rowh = _mm_mulhi_pi16(rowh, scaleh); \
+  \
+  /* determine if scale is negative */ \
+  scalel = _mm_srai_pi16(scalel, (WORD_BIT - 1)); \
+  scaleh = _mm_srai_pi16(scaleh, (WORD_BIT - 1)); \
+  \
+  /* and add input if it is */ \
+  scalel = _mm_and_si64(scalel, rowlsave); \
+  scaleh = _mm_and_si64(scaleh, rowhsave); \
+  rowl = _mm_add_pi16(rowl, scalel); \
+  rowh = _mm_add_pi16(rowh, scaleh); \
+  \
+  /* then check if negative input */ \
+  rowlsave = _mm_srai_pi16(rowlsave, (WORD_BIT - 1)); \
+  rowhsave = _mm_srai_pi16(rowhsave, (WORD_BIT - 1)); \
+  \
+  /* and add scale if it is */ \
+  rowlsave = _mm_and_si64(rowlsave, scalel); \
+  rowhsave = _mm_and_si64(rowhsave, scaleh); \
+  rowl = _mm_add_pi16(rowl, rowlsave); \
+  rowh = _mm_add_pi16(rowh, rowhsave); \
+  \
+  rowl = _mm_xor_si64(rowl, rowls);           /* val = -val */ \
+  rowh = _mm_xor_si64(rowh, rowhs); \
+  rowl = _mm_sub_pi16(rowl, rowls); \
+  rowh = _mm_sub_pi16(rowh, rowhs); \
+  \
+  _mm_store_si64((__m64 *)&output_ptr[0], rowl); \
+  _mm_store_si64((__m64 *)&output_ptr[4], rowh); \
+  \
+  workspace += DCTSIZE; \
+  divisors += DCTSIZE; \
+  output_ptr += DCTSIZE; \
+}
+
+
+void jsimd_quantize_mmi(JCOEFPTR coef_block, DCTELEM *divisors,
+                        DCTELEM *workspace)
+{
+  JCOEFPTR output_ptr = coef_block;
+
+  DO_QUANT()
+  DO_QUANT()
+  DO_QUANT()
+  DO_QUANT()
+  DO_QUANT()
+  DO_QUANT()
+  DO_QUANT()
+  DO_QUANT()
+}
diff --git a/media/libjpeg/simd/mips64/jsimd.c b/media/libjpeg/simd/mips64/jsimd.c
new file mode 100644
index 0000000000..917440b43b
--- /dev/null
+++ b/media/libjpeg/simd/mips64/jsimd.c
@@ -0,0 +1,866 @@
+/*
+ * jsimd_mips64.c
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2009-2011, 2014, 2016, 2018, 2022, D. R. Commander.
+ * Copyright (C) 2013-2014, MIPS Technologies, Inc., California.
+ * Copyright (C) 2015, 2018, 2022, Matthieu Darbois.
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library,
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * For conditions of distribution and use, see copyright notice in jsimdext.inc
+ *
+ * This file contains the interface between the "normal" portions
+ * of the library and the SIMD implementations when running on a
+ * 64-bit MIPS architecture.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+
+#include <ctype.h>
+
+static THREAD_LOCAL unsigned int simd_support = ~0;
+
+#if defined(__linux__)
+
+#define SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT  (1024 * 1024)
+
+LOCAL(int)
+check_feature(char *buffer, char *feature)
+{
+  char *p;
+
+  if (*feature == 0)
+    return 0;
+  if (strncmp(buffer, "ASEs implemented", 16) != 0)
+    return 0;
+  buffer += 16;
+  while (isspace(*buffer))
+    buffer++;
+
+  /* Check if 'feature' is present in the buffer as a separate word */
+  while ((p = strstr(buffer, feature))) {
+    if (p > buffer && !isspace(*(p - 1))) {
+      buffer++;
+      continue;
+    }
+    p += strlen(feature);
+    if (*p != 0 && !isspace(*p)) {
+      buffer++;
+      continue;
+    }
+    return 1;
+  }
+  return 0;
+}
+
+LOCAL(int)
+parse_proc_cpuinfo(int bufsize)
+{
+  char *buffer = (char *)malloc(bufsize);
+  FILE *fd;
+
+  simd_support = 0;
+
+  if (!buffer)
+    return 0;
+
+  fd = fopen("/proc/cpuinfo", "r");
+  if (fd) {
+    while (fgets(buffer, bufsize, fd)) {
+      if (!strchr(buffer, '\n') && !feof(fd)) {
+        /* "impossible" happened - insufficient size of the buffer! */
+        fclose(fd);
+        free(buffer);
+        return 0;
+      }
+      if (check_feature(buffer, "loongson-mmi"))
+        simd_support |= JSIMD_MMI;
+    }
+    fclose(fd);
+  }
+  free(buffer);
+  return 1;
+}
+
+#endif
+
+/*
+ * Check what SIMD accelerations are supported.
+ */
+LOCAL(void)
+init_simd(void)
+{
+#ifndef NO_GETENV
+  char *env = NULL;
+#endif
+#if defined(__linux__)
+  int bufsize = 1024; /* an initial guess for the line buffer size limit */
+#endif
+
+  if (simd_support != ~0U)
+    return;
+
+  simd_support = 0;
+
+#if defined(__linux__)
+  while (!parse_proc_cpuinfo(bufsize)) {
+    bufsize *= 2;
+    if (bufsize > SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT)
+      break;
+  }
+#elif defined(__mips_loongson_vector_rev)
+  /* Only enable MMI by default on non-Linux platforms when the compiler flags
+   * support it. */
+  simd_support |= JSIMD_MMI;
+#endif
+
+#ifndef NO_GETENV
+  /* Force different settings through environment variables */
+  env = getenv("JSIMD_FORCEMMI");
+  if ((env != NULL) && (strcmp(env, "1") == 0))
+    simd_support = JSIMD_MMI;
+  env = getenv("JSIMD_FORCENONE");
+  if ((env != NULL) && (strcmp(env, "1") == 0))
+    simd_support = 0;
+#endif
+}
+
+GLOBAL(int)
+jsimd_can_rgb_ycc(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+
+  if (simd_support & JSIMD_MMI)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_rgb_gray(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+
+  if (simd_support & JSIMD_MMI)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+
+  if (simd_support & JSIMD_MMI)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb565(void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_c_can_null_convert(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                      JSAMPIMAGE output_buf, JDIMENSION output_row,
+                      int num_rows)
+{
+  void (*mmifct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+  switch (cinfo->in_color_space) {
+  case JCS_EXT_RGB:
+    mmifct = jsimd_extrgb_ycc_convert_mmi;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    mmifct = jsimd_extrgbx_ycc_convert_mmi;
+    break;
+  case JCS_EXT_BGR:
+    mmifct = jsimd_extbgr_ycc_convert_mmi;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    mmifct = jsimd_extbgrx_ycc_convert_mmi;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    mmifct = jsimd_extxbgr_ycc_convert_mmi;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    mmifct = jsimd_extxrgb_ycc_convert_mmi;
+    break;
+  default:
+    mmifct = jsimd_rgb_ycc_convert_mmi;
+    break;
+  }
+
+  mmifct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
+
+GLOBAL(void)
+jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                       JSAMPIMAGE output_buf, JDIMENSION output_row,
+                       int num_rows)
+{
+  void (*mmifct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+  switch (cinfo->in_color_space) {
+  case JCS_EXT_RGB:
+    mmifct = jsimd_extrgb_gray_convert_mmi;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    mmifct = jsimd_extrgbx_gray_convert_mmi;
+    break;
+  case JCS_EXT_BGR:
+    mmifct = jsimd_extbgr_gray_convert_mmi;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    mmifct = jsimd_extbgrx_gray_convert_mmi;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    mmifct = jsimd_extxbgr_gray_convert_mmi;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    mmifct = jsimd_extxrgb_gray_convert_mmi;
+    break;
+  default:
+    mmifct = jsimd_rgb_gray_convert_mmi;
+    break;
+  }
+
+  mmifct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                      JDIMENSION input_row, JSAMPARRAY output_buf,
+                      int num_rows)
+{
+  void (*mmifct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
+
+  switch (cinfo->out_color_space) {
+  case JCS_EXT_RGB:
+    mmifct = jsimd_ycc_extrgb_convert_mmi;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    mmifct = jsimd_ycc_extrgbx_convert_mmi;
+    break;
+  case JCS_EXT_BGR:
+    mmifct = jsimd_ycc_extbgr_convert_mmi;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    mmifct = jsimd_ycc_extbgrx_convert_mmi;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    mmifct = jsimd_ycc_extxbgr_convert_mmi;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    mmifct = jsimd_ycc_extxrgb_convert_mmi;
+    break;
+  default:
+    mmifct = jsimd_ycc_rgb_convert_mmi;
+    break;
+  }
+
+  mmifct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                         JDIMENSION input_row, JSAMPARRAY output_buf,
+                         int num_rows)
+{
+}
+
+GLOBAL(void)
+jsimd_c_null_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                     JSAMPIMAGE output_buf, JDIMENSION output_row,
+                     int num_rows)
+{
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_downsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_MMI)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_smooth_downsample(void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_downsample(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+                      JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  jsimd_h2v2_downsample_mmi(cinfo->image_width, cinfo->max_v_samp_factor,
+                            compptr->v_samp_factor, compptr->width_in_blocks,
+                            input_data, output_data);
+}
+
+GLOBAL(void)
+jsimd_h2v2_smooth_downsample(j_compress_ptr cinfo,
+                             jpeg_component_info *compptr,
+                             JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+}
+
+GLOBAL(void)
+jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+                      JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_upsample(void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_upsample(void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_int_upsample(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                    JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+}
+
+GLOBAL(void)
+jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                    JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+}
+
+GLOBAL(void)
+jsimd_int_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                   JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_fancy_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_MMI)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_fancy_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_MMI)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  jsimd_h2v2_fancy_upsample_mmi(cinfo->max_v_samp_factor,
+                                compptr->downsampled_width, input_data,
+                                output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  jsimd_h2v1_fancy_upsample_mmi(cinfo->max_v_samp_factor,
+                                compptr->downsampled_width, input_data,
+                                output_data_ptr);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_merged_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_MMI)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_merged_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_MMI)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                           JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+  void (*mmifct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+
+  switch (cinfo->out_color_space) {
+  case JCS_EXT_RGB:
+    mmifct = jsimd_h2v2_extrgb_merged_upsample_mmi;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    mmifct = jsimd_h2v2_extrgbx_merged_upsample_mmi;
+    break;
+  case JCS_EXT_BGR:
+    mmifct = jsimd_h2v2_extbgr_merged_upsample_mmi;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    mmifct = jsimd_h2v2_extbgrx_merged_upsample_mmi;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    mmifct = jsimd_h2v2_extxbgr_merged_upsample_mmi;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    mmifct = jsimd_h2v2_extxrgb_merged_upsample_mmi;
+    break;
+  default:
+    mmifct = jsimd_h2v2_merged_upsample_mmi;
+    break;
+  }
+
+  mmifct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+}
+
+GLOBAL(void)
+jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                           JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+  void (*mmifct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+
+  switch (cinfo->out_color_space) {
+  case JCS_EXT_RGB:
+    mmifct = jsimd_h2v1_extrgb_merged_upsample_mmi;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    mmifct = jsimd_h2v1_extrgbx_merged_upsample_mmi;
+    break;
+  case JCS_EXT_BGR:
+    mmifct = jsimd_h2v1_extbgr_merged_upsample_mmi;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    mmifct = jsimd_h2v1_extbgrx_merged_upsample_mmi;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    mmifct = jsimd_h2v1_extxbgr_merged_upsample_mmi;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    mmifct = jsimd_h2v1_extxrgb_merged_upsample_mmi;
+    break;
+  default:
+    mmifct = jsimd_h2v1_merged_upsample_mmi;
+    break;
+  }
+
+  mmifct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+}
+
+GLOBAL(int)
+jsimd_can_convsamp(void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_convsamp_float(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_convsamp(JSAMPARRAY sample_data, JDIMENSION start_col,
+               DCTELEM *workspace)
+{
+}
+
+GLOBAL(void)
+jsimd_convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col,
+                     FAST_FLOAT *workspace)
+{
+}
+
+GLOBAL(int)
+jsimd_can_fdct_islow(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_MMI)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_ifast(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_MMI)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_float(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_fdct_islow(DCTELEM *data)
+{
+  jsimd_fdct_islow_mmi(data);
+}
+
+GLOBAL(void)
+jsimd_fdct_ifast(DCTELEM *data)
+{
+  jsimd_fdct_ifast_mmi(data);
+}
+
+GLOBAL(void)
+jsimd_fdct_float(FAST_FLOAT *data)
+{
+}
+
+GLOBAL(int)
+jsimd_can_quantize(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_MMI)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_quantize_float(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace)
+{
+  jsimd_quantize_mmi(coef_block, divisors, workspace);
+}
+
+GLOBAL(void)
+jsimd_quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors,
+                     FAST_FLOAT *workspace)
+{
+}
+
+GLOBAL(int)
+jsimd_can_idct_2x2(void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_4x4(void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_6x6(void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_12x12(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_idct_2x2(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+               JCOEFPTR coef_block, JSAMPARRAY output_buf,
+               JDIMENSION output_col)
+{
+}
+
+GLOBAL(void)
+jsimd_idct_4x4(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+               JCOEFPTR coef_block, JSAMPARRAY output_buf,
+               JDIMENSION output_col)
+{
+}
+
+GLOBAL(void)
+jsimd_idct_6x6(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+               JCOEFPTR coef_block, JSAMPARRAY output_buf,
+               JDIMENSION output_col)
+{
+}
+
+GLOBAL(void)
+jsimd_idct_12x12(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+}
+
+GLOBAL(int)
+jsimd_can_idct_islow(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_MMI)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_ifast(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(IFAST_MULT_TYPE) != 2)
+    return 0;
+  if (IFAST_SCALE_BITS != 2)
+    return 0;
+
+  if (simd_support & JSIMD_MMI)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_float(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+  jsimd_idct_islow_mmi(compptr->dct_table, coef_block, output_buf, output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+  jsimd_idct_ifast_mmi(compptr->dct_table, coef_block, output_buf, output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_float(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+}
+
+GLOBAL(int)
+jsimd_can_huff_encode_one_block(void)
+{
+  return 0;
+}
+
+GLOBAL(JOCTET *)
+jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
+                            int last_dc_val, c_derived_tbl *dctbl,
+                            c_derived_tbl *actbl)
+{
+  return NULL;
+}
+
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_first_prepare(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_encode_mcu_AC_first_prepare(const JCOEF *block,
+                                  const int *jpeg_natural_order_start, int Sl,
+                                  int Al, UJCOEF *values, size_t *zerobits)
+{
+}
+
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_refine_prepare(void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
+                                   const int *jpeg_natural_order_start, int Sl,
+                                   int Al, UJCOEF *absvalues, size_t *bits)
+{
+  return 0;
+}
diff --git a/media/libjpeg/simd/mips64/jsimd_mmi.h b/media/libjpeg/simd/mips64/jsimd_mmi.h
new file mode 100644
index 0000000000..5e4261c9d9
--- /dev/null
+++ b/media/libjpeg/simd/mips64/jsimd_mmi.h
@@ -0,0 +1,69 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
+ *                          All Rights Reserved.
+ * Authors:  ZhuChen     <zhuchen@loongson.cn>
+ *           CaiWanwei   <caiwanwei@loongson.cn>
+ *           SunZhangzhi <sunzhangzhi-cq@loongson.cn>
+ *           QingfaLiu   <liuqingfa-hf@loongson.cn>
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jdct.h"
+#include "loongson-mmintrin.h"
+
+
+/* Common code */
+#if defined(_ABI64) && _MIPS_SIM == _ABI64
+# define PTR_ADDU  "daddu "
+# define PTR_SLL   "dsll "
+#else
+# define PTR_ADDU  "addu "
+# define PTR_SLL   "sll "
+#endif
+
+#define SIZEOF_MMWORD  8
+#define BYTE_BIT  8
+#define WORD_BIT  16
+#define SCALEBITS  16
+
+#define _uint64_set_pi8(a, b, c, d, e, f, g, h) \
+  (((uint64_t)(uint8_t)a << 56) | \
+   ((uint64_t)(uint8_t)b << 48) | \
+   ((uint64_t)(uint8_t)c << 40) | \
+   ((uint64_t)(uint8_t)d << 32) | \
+   ((uint64_t)(uint8_t)e << 24) | \
+   ((uint64_t)(uint8_t)f << 16) | \
+   ((uint64_t)(uint8_t)g << 8)  | \
+   ((uint64_t)(uint8_t)h))
+#define _uint64_set1_pi8(a)  _uint64_set_pi8(a, a, a, a, a, a, a, a)
+#define _uint64_set_pi16(a, b, c, d) \
+  (((uint64_t)(uint16_t)a << 48) | \
+   ((uint64_t)(uint16_t)b << 32) | \
+   ((uint64_t)(uint16_t)c << 16) | \
+   ((uint64_t)(uint16_t)d))
+#define _uint64_set1_pi16(a)  _uint64_set_pi16(a, a, a, a)
+#define _uint64_set_pi32(a, b) \
+  (((uint64_t)(uint32_t)a << 32) | \
+   ((uint64_t)(uint32_t)b))
+
+#define get_const_value(index)  (*(__m64 *)&const_value[index])
diff --git a/media/libjpeg/simd/mips64/loongson-mmintrin.h b/media/libjpeg/simd/mips64/loongson-mmintrin.h
new file mode 100644
index 0000000000..db9b35ab60
--- /dev/null
+++ b/media/libjpeg/simd/mips64/loongson-mmintrin.h
@@ -0,0 +1,1334 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
+ *                          All Rights Reserved.
+ * Copyright (C) 2019, D. R. Commander.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#ifndef __LOONGSON_MMINTRIN_H__
+#define __LOONGSON_MMINTRIN_H__
+
+#include <stdint.h>
+
+
+#define FUNCTION_ATTRIBS \
+  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+
+
+/* Vectors are stored in 64-bit floating-point registers. */
+typedef double __m64;
+
+/* Having a 32-bit datatype allows us to use 32-bit loads in places like
+   load8888. */
+typedef float __m32;
+
+
+/********** Set Operations **********/
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_setzero_si64(void)
+{
+  return 0.0;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_set_pi8(uint8_t __b7, uint8_t __b6, uint8_t __b5, uint8_t __b4,
+            uint8_t __b3, uint8_t __b2, uint8_t __b1, uint8_t __b0)
+{
+  __m64 ret;
+  uint32_t lo = ((uint32_t)__b6 << 24) |
+                ((uint32_t)__b4 << 16) |
+                ((uint32_t)__b2 << 8) |
+                (uint32_t)__b0;
+  uint32_t hi = ((uint32_t)__b7 << 24) |
+                ((uint32_t)__b5 << 16) |
+                ((uint32_t)__b3 << 8) |
+                (uint32_t)__b1;
+
+  asm("mtc1      %1, %0\n\t"
+      "mtc1      %2, $f0\n\t"
+      "punpcklbh %0, %0, $f0\n\t"
+      : "=f" (ret)
+      : "r" (lo), "r" (hi)
+      : "$f0"
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_set_pi16(uint16_t __h3, uint16_t __h2, uint16_t __h1, uint16_t __h0)
+{
+  __m64 ret;
+  uint32_t lo = ((uint32_t)__h2 << 16) | (uint32_t)__h0;
+  uint32_t hi = ((uint32_t)__h3 << 16) | (uint32_t)__h1;
+
+  asm("mtc1      %1, %0\n\t"
+      "mtc1      %2, $f0\n\t"
+      "punpcklhw %0, %0, $f0\n\t"
+      : "=f" (ret)
+      : "r" (lo), "r" (hi)
+      : "$f0"
+     );
+
+  return ret;
+}
+
+#define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \
+  (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0))
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_set_pi32(uint32_t __i1, uint32_t __i0)
+{
+  if (__builtin_constant_p(__i1) && __builtin_constant_p(__i0)) {
+    uint64_t val = ((uint64_t)__i1 << 32) |
+                   ((uint64_t)__i0 <<  0);
+
+    return *(__m64 *)&val;
+  } else if (__i1 == __i0) {
+    uint64_t imm = _MM_SHUFFLE(1, 0, 1, 0);
+    __m64 ret;
+
+    asm("pshufh %0, %1, %2\n\t"
+        : "=f" (ret)
+        : "f" (*(__m32 *)&__i1), "f" (*(__m64 *)&imm)
+       );
+
+    return ret;
+  } else {
+    uint64_t val = ((uint64_t)__i1 << 32) |
+                   ((uint64_t)__i0 <<  0);
+
+    return *(__m64 *)&val;
+  }
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_set1_pi8(uint8_t __b0)
+{
+  __m64 ret;
+
+  asm("sll    $8, %1, 8\n\t"
+      "or     %1, %1, $8\n\t"
+      "mtc1   %1, %0\n\t"
+      "mtc1   $0, $f0\n\t"
+      "pshufh %0, %0, $f0\n\t"
+      : "=f" (ret)
+      : "r" (__b0)
+      : "$8", "$f0"
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_set1_pi16(uint16_t __h0)
+{
+  __m64 ret;
+
+  asm("mtc1   %1, %0\n\t"
+      "mtc1   $0, $f0\n\t"
+      "pshufh %0, %0, $f0\n\t"
+      : "=f" (ret)
+      : "r" (__h0)
+      : "$8", "$f0"
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_set1_pi32(unsigned __i0)
+{
+  return _mm_set_pi32(__i0, __i0);
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_setr_pi8(uint8_t __h0, uint8_t __h1, uint8_t __h2, uint8_t __h3,
+             uint8_t __h4, uint8_t __h5, uint8_t __h6, uint8_t __h7)
+{
+  return _mm_set_pi8(__h7, __h6, __h5, __h4,
+                     __h3, __h2, __h1, __h0);
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_setr_pi16(uint16_t __w0, uint16_t __w1, uint16_t __w2, uint16_t __w3)
+{
+  return _mm_set_pi16(__w3, __w2, __w1, __w0);
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_setr_pi32(uint32_t __i0, uint32_t __i1)
+{
+  return _mm_set_pi32(__i1, __i0);
+}
+
+
+/********** Arithmetic Operations **********/
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_add_pi8(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("paddb %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_add_pi16(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("paddh %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_add_pi32(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("paddw %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_add_si64(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("paddd %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_adds_pi8(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("paddsb %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_adds_pi16(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("paddsh %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_adds_pu8(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("paddusb %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_adds_pu16(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("paddush %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_avg_pu8(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("pavgb %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_avg_pu16(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("pavgh %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_madd_pi16(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("pmaddhw %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_max_pi16(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("pmaxsh %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_max_pu8(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("pmaxub %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_min_pi16(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("pminsh %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_min_pu8(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("pminub %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline int FUNCTION_ATTRIBS
+_mm_movemask_pi8(__m64 __m1)
+{
+  int ret;
+
+  asm("pmovmskb %0, %1\n\t"
+      : "=r" (ret)
+      : "y" (__m1)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_mulhi_pi16(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("pmulhh %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_mulhi_pu16(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("pmulhuh %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_mullo_pi16(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("pmullh %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_mul_pu32(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("pmuluw %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_sad_pu8(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("psadbh %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_asub_pu8(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("pasubub %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_biadd_pu8(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("biadd %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_sub_pi8(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("psubb %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_sub_pi16(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("psubh %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_sub_pi32(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("psubw %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_sub_si64(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("psubd %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_subs_pi8(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("psubsb %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_subs_pi16(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("psubsh %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_subs_pu8(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("psubusb %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_subs_pu16(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("psubush %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+
+/********** Logical Operations **********/
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_and_si64(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("and %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_andnot_si64(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("andn %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_or_si32(__m32 __m1, __m32 __m2)
+{
+  __m32 ret;
+
+  asm("or %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_or_si64(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("or %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_xor_si64(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("xor %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+
+/********** Shift Operations **********/
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_slli_pi16(__m64 __m, int64_t __count)
+{
+  __m64 ret;
+
+  asm("psllh  %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m), "f" (*(__m64 *)&__count)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_slli_pi32(__m64 __m, int64_t __count)
+{
+  __m64 ret;
+
+  asm("psllw %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m), "f" (*(__m64 *)&__count)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_slli_si64(__m64 __m, int64_t __count)
+{
+  __m64 ret;
+
+  asm("dsll  %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m), "f" (*(__m64 *)&__count)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_srli_pi16(__m64 __m, int64_t __count)
+{
+  __m64 ret;
+
+  asm("psrlh %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m), "f" (*(__m64 *)&__count)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_srli_pi32(__m64 __m, int64_t __count)
+{
+  __m64 ret;
+
+  asm("psrlw %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m), "f" (*(__m64 *)&__count)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_srli_si64(__m64 __m, int64_t __count)
+{
+  __m64 ret;
+
+  asm("dsrl  %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m), "f" (*(__m64 *)&__count)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_srai_pi16(__m64 __m, int64_t __count)
+{
+  __m64 ret;
+
+  asm("psrah %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m), "f" (*(__m64 *)&__count)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_srai_pi32(__m64 __m, int64_t __count)
+{
+  __m64 ret;
+
+  asm("psraw %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m), "f" (*(__m64 *)&__count)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_srai_si64(__m64 __m, int64_t __count)
+{
+  __m64 ret;
+
+  asm("dsra %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m), "f" (*(__m64 *)&__count)
+     );
+
+  return ret;
+}
+
+
+/********** Conversion Intrinsics **********/
+
+extern __inline __m64 FUNCTION_ATTRIBS
+to_m64(uint64_t x)
+{
+  return *(__m64 *)&x;
+}
+
+extern __inline uint64_t FUNCTION_ATTRIBS
+to_uint64(__m64 x)
+{
+  return *(uint64_t *)&x;
+}
+
+
+/********** Comparison Intrinsics **********/
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_cmpeq_pi8(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("pcmpeqb %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_cmpeq_pi16(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("pcmpeqh %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_cmpeq_pi32(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("pcmpeqw %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_cmpgt_pi8(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("pcmpgtb %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_cmpgt_pi16(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("pcmpgth %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_cmpgt_pi32(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("pcmpgtw %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_cmplt_pi8(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("pcmpltb %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_cmplt_pi16(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("pcmplth %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_cmplt_pi32(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("pcmpltw %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+
+/********** Miscellaneous Operations **********/
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_packs_pi16(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("packsshb %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_packs_pi32(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("packsswh %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_packs_pi32_f(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("packsswh %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_packs_pu16(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("packushb %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_extract_pi16(__m64 __m, int64_t __pos)
+{
+  __m64 ret;
+
+  asm("pextrh %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m), "f" (*(__m64 *)&__pos)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_insert_pi16(__m64 __m1, __m64 __m2, int64_t __pos)
+{
+  __m64 ret;
+
+  switch (__pos) {
+  case 0:
+
+    asm("pinsrh_0 %0, %1, %2\n\t"
+        : "=f" (ret)
+        : "f" (__m1), "f" (__m2), "i" (__pos)
+       );
+
+    break;
+
+  case 1:
+
+    asm("pinsrh_1 %0, %1, %2\n\t"
+        : "=f" (ret)
+        : "f" (__m1), "f" (__m2), "i" (__pos)
+       );
+
+    break;
+  case 2:
+
+    asm("pinsrh_2 %0, %1, %2\n\t"
+        : "=f" (ret)
+        : "f" (__m1), "f" (__m2), "i" (__pos)
+       );
+
+    break;
+
+  case 3:
+
+    asm("pinsrh_3 %0, %1, %2\n\t"
+        : "=f" (ret)
+        : "f" (__m1), "f" (__m2), "i" (__pos)
+       );
+
+    break;
+  }
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_shuffle_pi16(__m64 __m, int64_t __n)
+{
+  __m64 ret;
+
+  asm("pshufh %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m), "f" (*(__m64 *)&__n)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_unpackhi_pi8(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("punpckhbh %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_unpackhi_pi8_f(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("punpckhbh %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_unpackhi_pi16(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("punpckhhw %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_unpackhi_pi16_f(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("punpckhhw %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_unpackhi_pi32(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("punpckhwd %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_unpacklo_pi8(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("punpcklbh %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+/* Since punpcklbh cares about the high 32-bits, we use the __m64 datatype,
+   which preserves the data. */
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_unpacklo_pi8_f64(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("punpcklbh %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+/* Since punpcklbh doesn't care about the high 32-bits, we use the __m32,
+   datatype, which allows load8888 to use 32-bit loads. */
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_unpacklo_pi8_f(__m32 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("punpcklbh %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_unpacklo_pi16(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("punpcklhw %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_unpacklo_pi16_f(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("punpcklhw %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_unpacklo_pi32(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("punpcklwd %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_unpacklo_pi32_f(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("punpcklwd %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline void FUNCTION_ATTRIBS
+_mm_store_pi32(__m32 *dest, __m64 src)
+{
+  src = _mm_packs_pu16(src, _mm_setzero_si64());
+
+  asm("swc1 %1, %0\n\t"
+      : "=m" (*dest)
+      : "f" (src)
+      : "memory"
+     );
+}
+
+extern __inline void FUNCTION_ATTRIBS
+_mm_store_si64(__m64 *dest, __m64 src)
+{
+  asm("sdc1 %1, %0 \n\t"
+      : "=m" (*dest)
+      : "f" (src)
+      : "memory"
+     );
+}
+
+extern __inline void FUNCTION_ATTRIBS
+_mm_storeu_si64(__m64 *dest, __m64 src)
+{
+  asm("gssdlc1 %1, 7(%0) \n\t"
+      "gssdrc1 %1, 0(%0) \n\t"
+      :
+      : "r" (dest), "f" (src)
+      : "memory"
+     );
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_load_si32(const __m32 *src)
+{
+  __m32 ret;
+
+  asm("lwc1 %0, %1\n\t"
+      : "=f" (ret)
+      : "m" (*src)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_load_si64(const __m64 *src)
+{
+  __m64 ret;
+
+  asm("ldc1 %0, %1\n\t"
+      : "=f" (ret)
+      : "m" (*src)
+      : "memory"
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_loadu_si64(const __m64 *src)
+{
+  __m64 ret;
+
+  asm("gsldlc1 %0,  7(%1)\n\t"
+      "gsldrc1 %0,  0(%1)\n\t"
+      : "=f" (ret)
+      : "r" (src)
+      : "memory"
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_loadlo_pi8(const uint32_t *src)
+{
+  return _mm_unpacklo_pi8_f(*(__m32 *)src, _mm_setzero_si64());
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_loadlo_pi8_f(__m64 src)
+{
+  return _mm_unpacklo_pi8_f64(src, _mm_setzero_si64());
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_loadhi_pi8_f(__m64 src)
+{
+  return _mm_unpackhi_pi8_f(src, _mm_setzero_si64());
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_loadlo_pi16(__m64 src)
+{
+  return _mm_unpacklo_pi16(src, _mm_setzero_si64());
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_loadlo_pi16_f(__m64 src)
+{
+  return _mm_unpacklo_pi16_f(_mm_setzero_si64(), src);
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_loadhi_pi16(__m64 src)
+{
+  return _mm_unpackhi_pi16(src, _mm_setzero_si64());
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_loadhi_pi16_f(__m64 src)
+{
+  return _mm_unpackhi_pi16_f(_mm_setzero_si64(), src);
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_expand_alpha(__m64 pixel)
+{
+  return _mm_shuffle_pi16(pixel, _MM_SHUFFLE(3, 3, 3, 3));
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_expand_alpha_rev(__m64 pixel)
+{
+  return _mm_shuffle_pi16(pixel, _MM_SHUFFLE(0, 0, 0, 0));
+}
+
+#endif  /* __LOONGSON_MMINTRIN_H__ */
diff --git a/media/libjpeg/simd/nasm/jcolsamp.inc b/media/libjpeg/simd/nasm/jcolsamp.inc
new file mode 100644
index 0000000000..6f6d7f29d1
--- /dev/null
+++ b/media/libjpeg/simd/nasm/jcolsamp.inc
@@ -0,0 +1,135 @@
+;
+; jcolsamp.inc - private declarations for color conversion & up/downsampling
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2015, Intel Corporation.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+
+; --------------------------------------------------------------------------
+
+; pseudo-resisters to make ordering of RGB configurable
+;
+%if RGB_RED == 0
+%define mmA  mm0
+%define mmB  mm1
+%define xmmA  xmm0
+%define xmmB  xmm1
+%define ymmA  ymm0
+%define ymmB  ymm1
+%elif RGB_GREEN == 0
+%define mmA  mm2
+%define mmB  mm3
+%define xmmA  xmm2
+%define xmmB  xmm3
+%define ymmA  ymm2
+%define ymmB  ymm3
+%elif RGB_BLUE == 0
+%define mmA  mm4
+%define mmB  mm5
+%define xmmA  xmm4
+%define xmmB  xmm5
+%define ymmA  ymm4
+%define ymmB  ymm5
+%else
+%define mmA  mm6
+%define mmB  mm7
+%define xmmA  xmm6
+%define xmmB  xmm7
+%define ymmA  ymm6
+%define ymmB  ymm7
+%endif
+
+%if RGB_RED == 1
+%define mmC  mm0
+%define mmD  mm1
+%define xmmC  xmm0
+%define xmmD  xmm1
+%define ymmC  ymm0
+%define ymmD  ymm1
+%elif RGB_GREEN == 1
+%define mmC  mm2
+%define mmD  mm3
+%define xmmC  xmm2
+%define xmmD  xmm3
+%define ymmC  ymm2
+%define ymmD  ymm3
+%elif RGB_BLUE == 1
+%define mmC  mm4
+%define mmD  mm5
+%define xmmC  xmm4
+%define xmmD  xmm5
+%define ymmC  ymm4
+%define ymmD  ymm5
+%else
+%define mmC  mm6
+%define mmD  mm7
+%define xmmC  xmm6
+%define xmmD  xmm7
+%define ymmC  ymm6
+%define ymmD  ymm7
+%endif
+
+%if RGB_RED == 2
+%define mmE  mm0
+%define mmF  mm1
+%define xmmE  xmm0
+%define xmmF  xmm1
+%define ymmE  ymm0
+%define ymmF  ymm1
+%elif RGB_GREEN == 2
+%define mmE  mm2
+%define mmF  mm3
+%define xmmE  xmm2
+%define xmmF  xmm3
+%define ymmE  ymm2
+%define ymmF  ymm3
+%elif RGB_BLUE == 2
+%define mmE  mm4
+%define mmF  mm5
+%define xmmE  xmm4
+%define xmmF  xmm5
+%define ymmE  ymm4
+%define ymmF  ymm5
+%else
+%define mmE  mm6
+%define mmF  mm7
+%define xmmE  xmm6
+%define xmmF  xmm7
+%define ymmE  ymm6
+%define ymmF  ymm7
+%endif
+
+%if RGB_RED == 3
+%define mmG  mm0
+%define mmH  mm1
+%define xmmG  xmm0
+%define xmmH  xmm1
+%define ymmG  ymm0
+%define ymmH  ymm1
+%elif RGB_GREEN == 3
+%define mmG  mm2
+%define mmH  mm3
+%define xmmG  xmm2
+%define xmmH  xmm3
+%define ymmG  ymm2
+%define ymmH  ymm3
+%elif RGB_BLUE == 3
+%define mmG  mm4
+%define mmH  mm5
+%define xmmG  xmm4
+%define xmmH  xmm5
+%define ymmG  ymm4
+%define ymmH  ymm5
+%else
+%define mmG  mm6
+%define mmH  mm7
+%define xmmG  xmm6
+%define xmmH  xmm7
+%define ymmG  ymm6
+%define ymmH  ymm7
+%endif
+
+; --------------------------------------------------------------------------
diff --git a/media/libjpeg/simd/nasm/jdct.inc b/media/libjpeg/simd/nasm/jdct.inc
new file mode 100644
index 0000000000..9192f66f0c
--- /dev/null
+++ b/media/libjpeg/simd/nasm/jdct.inc
@@ -0,0 +1,31 @@
+;
+; jdct.inc - private declarations for forward & reverse DCT subsystems
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2018, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+
+; Each IDCT routine is responsible for range-limiting its results and
+; converting them to unsigned form (0..MAXJSAMPLE).  The raw outputs could
+; be quite far out of range if the input data is corrupt, so a bulletproof
+; range-limiting step is required.  We use a mask-and-table-lookup method
+; to do the combined operations quickly.
+;
+%define RANGE_MASK  (MAXJSAMPLE * 4 + 3)  ; 2 bits wider than legal samples
+
+%define ROW(n, b, s)  ((b) + (n) * (s))
+%define COL(n, b, s)  ((b) + (n) * (s) * DCTSIZE)
+
+%define DWBLOCK(m, n, b, s) \
+  ((b) + (m) * DCTSIZE * (s) + (n) * SIZEOF_DWORD)
+%define MMBLOCK(m, n, b, s) \
+  ((b) + (m) * DCTSIZE * (s) + (n) * SIZEOF_MMWORD)
+%define XMMBLOCK(m, n, b, s) \
+  ((b) + (m) * DCTSIZE * (s) + (n) * SIZEOF_XMMWORD)
+%define YMMBLOCK(m, n, b, s) \
+  ((b) + (m) * DCTSIZE * (s) + (n) * SIZEOF_YMMWORD)
+
+; --------------------------------------------------------------------------
diff --git a/media/libjpeg/simd/nasm/jsimdcfg.inc b/media/libjpeg/simd/nasm/jsimdcfg.inc
new file mode 100644
index 0000000000..667024a5f9
--- /dev/null
+++ b/media/libjpeg/simd/nasm/jsimdcfg.inc
@@ -0,0 +1,93 @@
+;
+; Automatically generated include file from jsimdcfg.inc.h
+;
+;
+; -- jpeglib.h
+;
+%define DCTSIZE 8
+%define DCTSIZE2 64
+;
+; -- jmorecfg.h
+;
+%define RGB_RED 0
+%define RGB_GREEN 1
+%define RGB_BLUE 2
+%define RGB_PIXELSIZE 3
+%define EXT_RGB_RED 0
+%define EXT_RGB_GREEN 1
+%define EXT_RGB_BLUE 2
+%define EXT_RGB_PIXELSIZE 3
+%define EXT_RGBX_RED 0
+%define EXT_RGBX_GREEN 1
+%define EXT_RGBX_BLUE 2
+%define EXT_RGBX_PIXELSIZE 4
+%define EXT_BGR_RED 2
+%define EXT_BGR_GREEN 1
+%define EXT_BGR_BLUE 0
+%define EXT_BGR_PIXELSIZE 3
+%define EXT_BGRX_RED 2
+%define EXT_BGRX_GREEN 1
+%define EXT_BGRX_BLUE 0
+%define EXT_BGRX_PIXELSIZE 4
+%define EXT_XBGR_RED 3
+%define EXT_XBGR_GREEN 2
+%define EXT_XBGR_BLUE 1
+%define EXT_XBGR_PIXELSIZE 4
+%define EXT_XRGB_RED 1
+%define EXT_XRGB_GREEN 2
+%define EXT_XRGB_BLUE 3
+%define EXT_XRGB_PIXELSIZE 4
+%define RGBX_FILLER_0XFF 1
+; Representation of a single sample (pixel element value).
+; On this SIMD implementation, this must be 'unsigned char'.
+;
+%define JSAMPLE byte ; unsigned char
+%define SIZEOF_JSAMPLE SIZEOF_BYTE ; sizeof(JSAMPLE)
+%define CENTERJSAMPLE 128
+; Representation of a DCT frequency coefficient.
+; On this SIMD implementation, this must be 'short'.
+;
+%define JCOEF word ; short
+%define SIZEOF_JCOEF SIZEOF_WORD ; sizeof(JCOEF)
+; Datatype used for image dimensions.
+; On this SIMD implementation, this must be 'unsigned int'.
+;
+%define JDIMENSION dword ; unsigned int
+%define SIZEOF_JDIMENSION SIZEOF_DWORD ; sizeof(JDIMENSION)
+%define JSAMPROW POINTER ; JSAMPLE * (jpeglib.h)
+%define JSAMPARRAY POINTER ; JSAMPROW * (jpeglib.h)
+%define JSAMPIMAGE POINTER ; JSAMPARRAY * (jpeglib.h)
+%define JCOEFPTR POINTER ; JCOEF * (jpeglib.h)
+%define SIZEOF_JSAMPROW SIZEOF_POINTER ; sizeof(JSAMPROW)
+%define SIZEOF_JSAMPARRAY SIZEOF_POINTER ; sizeof(JSAMPARRAY)
+%define SIZEOF_JSAMPIMAGE SIZEOF_POINTER ; sizeof(JSAMPIMAGE)
+%define SIZEOF_JCOEFPTR SIZEOF_POINTER ; sizeof(JCOEFPTR)
+;
+; -- jdct.h
+;
+; A forward DCT routine is given a pointer to a work area of type DCTELEM[];
+; the DCT is to be performed in-place in that buffer.
+; To maximize parallelism, Type DCTELEM is changed to short (originally, int).
+;
+%define DCTELEM word ; short
+%define SIZEOF_DCTELEM SIZEOF_WORD ; sizeof(DCTELEM)
+%define float FP32 ; float
+%define SIZEOF_FAST_FLOAT SIZEOF_FP32 ; sizeof(float)
+; To maximize parallelism, Type short is changed to short.
+;
+%define ISLOW_MULT_TYPE word ; must be short
+%define SIZEOF_ISLOW_MULT_TYPE SIZEOF_WORD ; sizeof(ISLOW_MULT_TYPE)
+%define IFAST_MULT_TYPE word ; must be short
+%define SIZEOF_IFAST_MULT_TYPE SIZEOF_WORD ; sizeof(IFAST_MULT_TYPE)
+%define IFAST_SCALE_BITS 2 ; fractional bits in scale factors
+%define FLOAT_MULT_TYPE FP32 ; must be float
+%define SIZEOF_FLOAT_MULT_TYPE SIZEOF_FP32 ; sizeof(FLOAT_MULT_TYPE)
+;
+; -- jsimd.h
+;
+%define JSIMD_NONE 0x00
+%define JSIMD_MMX 0x01
+%define JSIMD_3DNOW 0x02
+%define JSIMD_SSE 0x04
+%define JSIMD_SSE2 0x08
+%define JSIMD_AVX2 0x80
diff --git a/media/libjpeg/simd/nasm/jsimdcfg.inc.h b/media/libjpeg/simd/nasm/jsimdcfg.inc.h
new file mode 100644
index 0000000000..bf2a45ad50
--- /dev/null
+++ b/media/libjpeg/simd/nasm/jsimdcfg.inc.h
@@ -0,0 +1,133 @@
+/*
+ * This file generates the include file for the assembly
+ * implementations by abusing the C preprocessor.
+ *
+ * Note: Some things are manually defined as they need to
+ * be mapped to NASM types.
+ */
+
+;
+; Automatically generated include file from jsimdcfg.inc.h
+;
+
+#define JPEG_INTERNALS
+
+#include "../jpeglib.h"
+#include "../jconfig.h"
+#include "../jmorecfg.h"
+#include "jsimd.h"
+
+;
+; -- jpeglib.h
+;
+
+%define _cpp_protection_DCTSIZE   DCTSIZE
+%define _cpp_protection_DCTSIZE2  DCTSIZE2
+
+;
+; -- jmorecfg.h
+;
+
+%define _cpp_protection_RGB_RED             RGB_RED
+%define _cpp_protection_RGB_GREEN           RGB_GREEN
+%define _cpp_protection_RGB_BLUE            RGB_BLUE
+%define _cpp_protection_RGB_PIXELSIZE       RGB_PIXELSIZE
+
+%define _cpp_protection_EXT_RGB_RED         EXT_RGB_RED
+%define _cpp_protection_EXT_RGB_GREEN       EXT_RGB_GREEN
+%define _cpp_protection_EXT_RGB_BLUE        EXT_RGB_BLUE
+%define _cpp_protection_EXT_RGB_PIXELSIZE   EXT_RGB_PIXELSIZE
+
+%define _cpp_protection_EXT_RGBX_RED        EXT_RGBX_RED
+%define _cpp_protection_EXT_RGBX_GREEN      EXT_RGBX_GREEN
+%define _cpp_protection_EXT_RGBX_BLUE       EXT_RGBX_BLUE
+%define _cpp_protection_EXT_RGBX_PIXELSIZE  EXT_RGBX_PIXELSIZE
+
+%define _cpp_protection_EXT_BGR_RED         EXT_BGR_RED
+%define _cpp_protection_EXT_BGR_GREEN       EXT_BGR_GREEN
+%define _cpp_protection_EXT_BGR_BLUE        EXT_BGR_BLUE
+%define _cpp_protection_EXT_BGR_PIXELSIZE   EXT_BGR_PIXELSIZE
+
+%define _cpp_protection_EXT_BGRX_RED        EXT_BGRX_RED
+%define _cpp_protection_EXT_BGRX_GREEN      EXT_BGRX_GREEN
+%define _cpp_protection_EXT_BGRX_BLUE       EXT_BGRX_BLUE
+%define _cpp_protection_EXT_BGRX_PIXELSIZE  EXT_BGRX_PIXELSIZE
+
+%define _cpp_protection_EXT_XBGR_RED        EXT_XBGR_RED
+%define _cpp_protection_EXT_XBGR_GREEN      EXT_XBGR_GREEN
+%define _cpp_protection_EXT_XBGR_BLUE       EXT_XBGR_BLUE
+%define _cpp_protection_EXT_XBGR_PIXELSIZE  EXT_XBGR_PIXELSIZE
+
+%define _cpp_protection_EXT_XRGB_RED        EXT_XRGB_RED
+%define _cpp_protection_EXT_XRGB_GREEN      EXT_XRGB_GREEN
+%define _cpp_protection_EXT_XRGB_BLUE       EXT_XRGB_BLUE
+%define _cpp_protection_EXT_XRGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+
+%define RGBX_FILLER_0XFF  1
+
+; Representation of a single sample (pixel element value).
+; On this SIMD implementation, this must be 'unsigned char'.
+;
+
+%define JSAMPLE            byte            ; unsigned char
+%define SIZEOF_JSAMPLE     SIZEOF_BYTE     ; sizeof(JSAMPLE)
+
+%define _cpp_protection_CENTERJSAMPLE  CENTERJSAMPLE
+
+; Representation of a DCT frequency coefficient.
+; On this SIMD implementation, this must be 'short'.
+;
+%define JCOEF              word            ; short
+%define SIZEOF_JCOEF       SIZEOF_WORD     ; sizeof(JCOEF)
+
+; Datatype used for image dimensions.
+; On this SIMD implementation, this must be 'unsigned int'.
+;
+%define JDIMENSION         dword           ; unsigned int
+%define SIZEOF_JDIMENSION  SIZEOF_DWORD    ; sizeof(JDIMENSION)
+
+%define JSAMPROW           POINTER         ; JSAMPLE *     (jpeglib.h)
+%define JSAMPARRAY         POINTER         ; JSAMPROW *    (jpeglib.h)
+%define JSAMPIMAGE         POINTER         ; JSAMPARRAY *  (jpeglib.h)
+%define JCOEFPTR           POINTER         ; JCOEF *       (jpeglib.h)
+%define SIZEOF_JSAMPROW    SIZEOF_POINTER  ; sizeof(JSAMPROW)
+%define SIZEOF_JSAMPARRAY  SIZEOF_POINTER  ; sizeof(JSAMPARRAY)
+%define SIZEOF_JSAMPIMAGE  SIZEOF_POINTER  ; sizeof(JSAMPIMAGE)
+%define SIZEOF_JCOEFPTR    SIZEOF_POINTER  ; sizeof(JCOEFPTR)
+
+;
+; -- jdct.h
+;
+
+; A forward DCT routine is given a pointer to a work area of type DCTELEM[];
+; the DCT is to be performed in-place in that buffer.
+; To maximize parallelism, Type DCTELEM is changed to short (originally, int).
+;
+%define DCTELEM                 word         ; short
+%define SIZEOF_DCTELEM          SIZEOF_WORD  ; sizeof(DCTELEM)
+
+%define FAST_FLOAT              FP32         ; float
+%define SIZEOF_FAST_FLOAT       SIZEOF_FP32  ; sizeof(FAST_FLOAT)
+
+; To maximize parallelism, Type MULTIPLIER is changed to short.
+;
+%define ISLOW_MULT_TYPE         word         ; must be short
+%define SIZEOF_ISLOW_MULT_TYPE  SIZEOF_WORD  ; sizeof(ISLOW_MULT_TYPE)
+
+%define IFAST_MULT_TYPE         word         ; must be short
+%define SIZEOF_IFAST_MULT_TYPE  SIZEOF_WORD  ; sizeof(IFAST_MULT_TYPE)
+%define IFAST_SCALE_BITS        2            ; fractional bits in scale factors
+
+%define FLOAT_MULT_TYPE         FP32         ; must be float
+%define SIZEOF_FLOAT_MULT_TYPE  SIZEOF_FP32  ; sizeof(FLOAT_MULT_TYPE)
+
+;
+; -- jsimd.h
+;
+
+%define _cpp_protection_JSIMD_NONE   JSIMD_NONE
+%define _cpp_protection_JSIMD_MMX    JSIMD_MMX
+%define _cpp_protection_JSIMD_3DNOW  JSIMD_3DNOW
+%define _cpp_protection_JSIMD_SSE    JSIMD_SSE
+%define _cpp_protection_JSIMD_SSE2   JSIMD_SSE2
+%define _cpp_protection_JSIMD_AVX2   JSIMD_AVX2
diff --git a/media/libjpeg/simd/nasm/jsimdext.inc b/media/libjpeg/simd/nasm/jsimdext.inc
new file mode 100644
index 0000000000..e8d50b0349
--- /dev/null
+++ b/media/libjpeg/simd/nasm/jsimdext.inc
@@ -0,0 +1,520 @@
+;
+; jsimdext.inc - common declarations
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2010, 2016, 2018-2019, D. R. Commander.
+; Copyright (C) 2018, Matthieu Darbois.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library - version 1.02
+;
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+;
+; This software is provided 'as-is', without any express or implied
+; warranty.  In no event will the authors be held liable for any damages
+; arising from the use of this software.
+;
+; Permission is granted to anyone to use this software for any purpose,
+; including commercial applications, and to alter it and redistribute it
+; freely, subject to the following restrictions:
+;
+; 1. The origin of this software must not be misrepresented; you must not
+;    claim that you wrote the original software. If you use this software
+;    in a product, an acknowledgment in the product documentation would be
+;    appreciated but is not required.
+; 2. Altered source versions must be plainly marked as such, and must not be
+;    misrepresented as being the original software.
+; 3. This notice may not be removed or altered from any source distribution.
+
+; ==========================================================================
+;  System-dependent configurations
+
+%ifdef WIN32    ; ----(nasm -fwin32 -DWIN32 ...)--------
+; * Microsoft Visual C++
+; * MinGW (Minimalist GNU for Windows)
+; * CygWin
+; * LCC-Win32
+
+; -- segment definition --
+;
+%ifdef __YASM_VER__
+%define SEG_TEXT   .text  align=32
+%define SEG_CONST  .rdata align=32
+%else
+%define SEG_TEXT   .text  align=32 public use32 class=CODE
+%define SEG_CONST  .rdata align=32 public use32 class=CONST
+%endif
+
+%elifdef WIN64  ; ----(nasm -fwin64 -DWIN64 ...)--------
+; * Microsoft Visual C++
+
+; -- segment definition --
+;
+%ifdef __YASM_VER__
+%define SEG_TEXT    .text  align=32
+%define SEG_CONST   .rdata align=32
+%else
+%define SEG_TEXT    .text  align=32 public use64 class=CODE
+%define SEG_CONST   .rdata align=32 public use64 class=CONST
+%endif
+%define EXTN(name)  name                ; foo() -> foo
+
+%elifdef OBJ32  ; ----(nasm -fobj -DOBJ32 ...)----------
+; * Borland C++ (Win32)
+
+; -- segment definition --
+;
+%define SEG_TEXT   _text align=32 public use32 class=CODE
+%define SEG_CONST  _data align=32 public use32 class=DATA
+
+%elifdef ELF    ; ----(nasm -felf[64] -DELF ...)------------
+; * Linux
+; * *BSD family Unix using elf format
+; * Unix System V, including Solaris x86, UnixWare and SCO Unix
+
+; mark stack as non-executable
+section .note.GNU-stack noalloc noexec nowrite progbits
+
+; -- segment definition --
+;
+%ifdef __x86_64__
+%define SEG_TEXT   .text   progbits align=32
+%define SEG_CONST  .rodata progbits align=32
+%else
+%define SEG_TEXT   .text   progbits alloc exec   nowrite align=32
+%define SEG_CONST  .rodata progbits alloc noexec nowrite align=32
+%endif
+
+; To make the code position-independent, append -DPIC to the commandline
+;
+%define GOT_SYMBOL  _GLOBAL_OFFSET_TABLE_  ; ELF supports PIC
+%define EXTN(name)  name                   ; foo() -> foo
+
+%elifdef AOUT   ; ----(nasm -faoutb/aout -DAOUT ...)----
+; * Older Linux using a.out format  (nasm -f aout -DAOUT ...)
+; * *BSD family Unix using a.out format  (nasm -f aoutb -DAOUT ...)
+
+; -- segment definition --
+;
+%define SEG_TEXT   .text
+%define SEG_CONST  .data
+
+; To make the code position-independent, append -DPIC to the commandline
+;
+%define GOT_SYMBOL  __GLOBAL_OFFSET_TABLE_  ; BSD-style a.out supports PIC
+
+%elifdef MACHO  ; ----(nasm -fmacho -DMACHO ...)--------
+; * NeXTstep/OpenStep/Rhapsody/Darwin/MacOS X (Mach-O format)
+
+; -- segment definition --
+;
+%define SEG_TEXT   .text  ;align=32     ; nasm doesn't accept align=32. why?
+%define SEG_CONST  .rodata align=32
+
+; The generation of position-independent code (PIC) is the default on Darwin.
+;
+%define PIC
+%define GOT_SYMBOL  _MACHO_PIC_         ; Mach-O style code-relative addressing
+
+%else           ; ----(Other case)----------------------
+
+; -- segment definition --
+;
+%define SEG_TEXT   .text
+%define SEG_CONST  .data
+
+%endif          ; ----------------------------------------------
+
+; ==========================================================================
+
+; --------------------------------------------------------------------------
+;  Common types
+;
+%ifdef __x86_64__
+%ifnidn __OUTPUT_FORMAT__, elfx32
+%define POINTER         qword           ; general pointer type
+%define SIZEOF_POINTER  SIZEOF_QWORD    ; sizeof(POINTER)
+%define POINTER_BIT     QWORD_BIT       ; sizeof(POINTER)*BYTE_BIT
+%define resp            resq
+%define dp              dq
+%define raxp            rax
+%define rbxp            rbx
+%define rcxp            rcx
+%define rdxp            rdx
+%define rsip            rsi
+%define rdip            rdi
+%define rbpp            rbp
+%define rspp            rsp
+%define r8p             r8
+%define r9p             r9
+%define r10p            r10
+%define r11p            r11
+%define r12p            r12
+%define r13p            r13
+%define r14p            r14
+%define r15p            r15
+%endif
+%endif
+%ifndef raxp
+%define POINTER         dword           ; general pointer type
+%define SIZEOF_POINTER  SIZEOF_DWORD    ; sizeof(POINTER)
+%define POINTER_BIT     DWORD_BIT       ; sizeof(POINTER)*BYTE_BIT
+%define resp            resd
+%define dp              dd
+; x86_64 ILP32 ABI (x32)
+%define raxp            eax
+%define rbxp            ebx
+%define rcxp            ecx
+%define rdxp            edx
+%define rsip            esi
+%define rdip            edi
+%define rbpp            ebp
+%define rspp            esp
+%define r8p             r8d
+%define r9p             r9d
+%define r10p            r10d
+%define r11p            r11d
+%define r12p            r12d
+%define r13p            r13d
+%define r14p            r14d
+%define r15p            r15d
+%endif
+
+%define INT             dword           ; signed integer type
+%define SIZEOF_INT      SIZEOF_DWORD    ; sizeof(INT)
+%define INT_BIT         DWORD_BIT       ; sizeof(INT)*BYTE_BIT
+
+%define FP32            dword           ; IEEE754 single
+%define SIZEOF_FP32     SIZEOF_DWORD    ; sizeof(FP32)
+%define FP32_BIT        DWORD_BIT       ; sizeof(FP32)*BYTE_BIT
+
+%define MMWORD          qword           ; int64  (MMX register)
+%define SIZEOF_MMWORD   SIZEOF_QWORD    ; sizeof(MMWORD)
+%define MMWORD_BIT      QWORD_BIT       ; sizeof(MMWORD)*BYTE_BIT
+
+; NASM is buggy and doesn't properly handle operand sizes for SSE
+; instructions, so for now we have to define XMMWORD as blank.
+%define XMMWORD                         ; int128 (SSE register)
+%define SIZEOF_XMMWORD  SIZEOF_OWORD    ; sizeof(XMMWORD)
+%define XMMWORD_BIT     OWORD_BIT       ; sizeof(XMMWORD)*BYTE_BIT
+
+%define YMMWORD                         ; int256 (AVX register)
+%define SIZEOF_YMMWORD  SIZEOF_YWORD    ; sizeof(YMMWORD)
+%define YMMWORD_BIT     YWORD_BIT       ; sizeof(YMMWORD)*BYTE_BIT
+
+; Similar hacks for when we load a dword or MMWORD into an xmm# register
+%define XMM_DWORD
+%define XMM_MMWORD
+
+%define SIZEOF_BYTE   1                 ; sizeof(byte)
+%define SIZEOF_WORD   2                 ; sizeof(word)
+%define SIZEOF_DWORD  4                 ; sizeof(dword)
+%define SIZEOF_QWORD  8                 ; sizeof(qword)
+%define SIZEOF_OWORD  16                ; sizeof(oword)
+%define SIZEOF_YWORD  32                ; sizeof(yword)
+
+%define BYTE_BIT      8                 ; CHAR_BIT in C
+%define WORD_BIT      16                ; sizeof(word)*BYTE_BIT
+%define DWORD_BIT     32                ; sizeof(dword)*BYTE_BIT
+%define QWORD_BIT     64                ; sizeof(qword)*BYTE_BIT
+%define OWORD_BIT     128               ; sizeof(oword)*BYTE_BIT
+%define YWORD_BIT     256               ; sizeof(yword)*BYTE_BIT
+
+; --------------------------------------------------------------------------
+;  External Symbol Name
+;
+%ifndef EXTN
+%define EXTN(name)  _ %+ name           ; foo() -> _foo
+%endif
+
+; --------------------------------------------------------------------------
+;  Hidden symbols
+;
+%ifdef ELF      ; ----(nasm -felf[64] -DELF ...)--------
+%define GLOBAL_FUNCTION(name)  global EXTN(name):function hidden
+%define GLOBAL_DATA(name)      global EXTN(name):data hidden
+%elifdef MACHO  ; ----(nasm -fmacho -DMACHO ...)--------
+%ifdef __YASM_VER__
+%define GLOBAL_FUNCTION(name)  global EXTN(name):private_extern
+%define GLOBAL_DATA(name)      global EXTN(name):private_extern
+%else
+%if __NASM_VERSION_ID__ >= 0x020E0000
+%define GLOBAL_FUNCTION(name)  global EXTN(name):private_extern
+%define GLOBAL_DATA(name)      global EXTN(name):private_extern
+%endif
+%endif
+%endif
+
+%ifndef GLOBAL_FUNCTION
+%define GLOBAL_FUNCTION(name)  global EXTN(name)
+%endif
+%ifndef GLOBAL_DATA
+%define GLOBAL_DATA(name)      global EXTN(name)
+%endif
+
+; --------------------------------------------------------------------------
+;  Macros for position-independent code (PIC) support
+;
+%ifndef GOT_SYMBOL
+%undef PIC
+%endif
+
+%ifdef PIC  ; -------------------------------------------
+
+%ifidn GOT_SYMBOL, _MACHO_PIC_  ; --------------------
+
+; At present, nasm doesn't seem to support PIC generation for Mach-O.
+; The PIC support code below is a little tricky.
+
+    SECTION     SEG_CONST
+const_base:
+
+%define GOTOFF(got, sym)  (got) + (sym) - const_base
+
+%imacro get_GOT 1
+    ; NOTE: this macro destroys ecx resister.
+    call        %%geteip
+    add         ecx, byte (%%ref - $)
+    jmp         short %%adjust
+%%geteip:
+    mov         ecx, POINTER [esp]
+    ret
+%%adjust:
+    push        ebp
+    xor         ebp, ebp                ; ebp = 0
+%ifidni %1, ebx  ; (%1 == ebx)
+    ; db 0x8D,0x9C + jmp near const_base =
+    ;   lea ebx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,9C,E9,(offset32)
+    db          0x8D, 0x9C              ; 8D,9C
+    jmp         near const_base         ; E9,(const_base-%%ref)
+%%ref:
+%else  ; (%1 != ebx)
+    ; db 0x8D,0x8C + jmp near const_base =
+    ;   lea ecx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,8C,E9,(offset32)
+    db          0x8D, 0x8C              ; 8D,8C
+    jmp         near const_base         ; E9,(const_base-%%ref)
+%%ref:
+    mov         %1, ecx
+%endif  ; (%1 == ebx)
+    pop         ebp
+%endmacro
+
+%else     ; GOT_SYMBOL != _MACHO_PIC_ ----------------
+
+%define GOTOFF(got, sym)  (got) + (sym) wrt ..gotoff
+
+%imacro get_GOT 1
+    extern      GOT_SYMBOL
+    call        %%geteip
+    add         %1, GOT_SYMBOL + $$ - $ wrt ..gotpc
+    jmp         short %%done
+%%geteip:
+    mov         %1, POINTER [esp]
+    ret
+%%done:
+%endmacro
+
+%endif    ; GOT_SYMBOL == _MACHO_PIC_ ----------------
+
+%imacro pushpic 1.nolist
+    push        %1
+%endmacro
+%imacro poppic  1.nolist
+    pop         %1
+%endmacro
+%imacro movpic  2.nolist
+    mov         %1, %2
+%endmacro
+
+%else    ; !PIC -----------------------------------------
+
+%define GOTOFF(got, sym)  (sym)
+
+%imacro get_GOT 1.nolist
+%endmacro
+%imacro pushpic 1.nolist
+%endmacro
+%imacro poppic  1.nolist
+%endmacro
+%imacro movpic  2.nolist
+%endmacro
+
+%endif   ;  PIC -----------------------------------------
+
+; --------------------------------------------------------------------------
+;  Align the next instruction on {2,4,8,16,..}-byte boundary.
+;  ".balign n,,m" in GNU as
+;
+%define MSKLE(x, y)  (~(((y) & 0xFFFF) - ((x) & 0xFFFF)) >> 16)
+%define FILLB(b, n)  (($$-(b)) & ((n)-1))
+
+%imacro alignx 1-2.nolist 0xFFFF
+%%bs: \
+  times MSKLE(FILLB(%%bs, %1), %2) & MSKLE(16, FILLB($, %1)) & FILLB($, %1) \
+        db 0x90                                      ; nop
+  times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 9 \
+        db 0x8D, 0x9C, 0x23, 0x00, 0x00, 0x00, 0x00  ; lea ebx,[ebx+0x00000000]
+  times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 7 \
+        db 0x8D, 0xAC, 0x25, 0x00, 0x00, 0x00, 0x00  ; lea ebp,[ebp+0x00000000]
+  times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 6 \
+        db 0x8D, 0xAD, 0x00, 0x00, 0x00, 0x00        ; lea ebp,[ebp+0x00000000]
+  times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 4 \
+        db 0x8D, 0x6C, 0x25, 0x00                    ; lea ebp,[ebp+0x00]
+  times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 3 \
+        db 0x8D, 0x6D, 0x00                          ; lea ebp,[ebp+0x00]
+  times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 2 \
+        db 0x8B, 0xED                                ; mov ebp,ebp
+  times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 1 \
+        db 0x90                                      ; nop
+%endmacro
+
+; Align the next data on {2,4,8,16,..}-byte boundary.
+;
+%imacro alignz 1.nolist
+    align       %1, db 0                ; filling zeros
+%endmacro
+
+%ifdef __x86_64__
+
+%ifdef WIN64
+
+%imacro collect_args 1
+    sub         rsp, SIZEOF_XMMWORD
+    movaps      XMMWORD [rsp], xmm6
+    sub         rsp, SIZEOF_XMMWORD
+    movaps      XMMWORD [rsp], xmm7
+    mov         r10, rcx
+%if %1 > 1
+    mov         r11, rdx
+%endif
+%if %1 > 2
+    push        r12
+    mov         r12, r8
+%endif
+%if %1 > 3
+    push        r13
+    mov         r13, r9
+%endif
+%if %1 > 4
+    push        r14
+    mov         r14, [rax+48]
+%endif
+%if %1 > 5
+    push        r15
+    mov         r15, [rax+56]
+%endif
+    push        rsi
+    push        rdi
+%endmacro
+
+%imacro uncollect_args 1
+    pop         rdi
+    pop         rsi
+%if %1 > 5
+    pop         r15
+%endif
+%if %1 > 4
+    pop         r14
+%endif
+%if %1 > 3
+    pop         r13
+%endif
+%if %1 > 2
+    pop         r12
+%endif
+    movaps      xmm7, XMMWORD [rsp]
+    add         rsp, SIZEOF_XMMWORD
+    movaps      xmm6, XMMWORD [rsp]
+    add         rsp, SIZEOF_XMMWORD
+%endmacro
+
+%imacro push_xmm 1
+    sub         rsp, %1 * SIZEOF_XMMWORD
+    movaps      XMMWORD [rsp+0*SIZEOF_XMMWORD], xmm8
+%if %1 > 1
+    movaps      XMMWORD [rsp+1*SIZEOF_XMMWORD], xmm9
+%endif
+%if %1 > 2
+    movaps      XMMWORD [rsp+2*SIZEOF_XMMWORD], xmm10
+%endif
+%if %1 > 3
+    movaps      XMMWORD [rsp+3*SIZEOF_XMMWORD], xmm11
+%endif
+%endmacro
+
+%imacro pop_xmm 1
+    movaps      xmm8, XMMWORD [rsp+0*SIZEOF_XMMWORD]
+%if %1 > 1
+    movaps      xmm9, XMMWORD [rsp+1*SIZEOF_XMMWORD]
+%endif
+%if %1 > 2
+    movaps      xmm10, XMMWORD [rsp+2*SIZEOF_XMMWORD]
+%endif
+%if %1 > 3
+    movaps      xmm11, XMMWORD [rsp+3*SIZEOF_XMMWORD]
+%endif
+    add         rsp, %1 * SIZEOF_XMMWORD
+%endmacro
+
+%else
+
+%imacro collect_args 1
+    push        r10
+    mov         r10, rdi
+%if %1 > 1
+    push        r11
+    mov         r11, rsi
+%endif
+%if %1 > 2
+    push        r12
+    mov         r12, rdx
+%endif
+%if %1 > 3
+    push        r13
+    mov         r13, rcx
+%endif
+%if %1 > 4
+    push        r14
+    mov         r14, r8
+%endif
+%if %1 > 5
+    push        r15
+    mov         r15, r9
+%endif
+%endmacro
+
+%imacro uncollect_args 1
+%if %1 > 5
+    pop         r15
+%endif
+%if %1 > 4
+    pop         r14
+%endif
+%if %1 > 3
+    pop         r13
+%endif
+%if %1 > 2
+    pop         r12
+%endif
+%if %1 > 1
+    pop         r11
+%endif
+    pop         r10
+%endmacro
+
+%imacro push_xmm 1
+%endmacro
+
+%imacro pop_xmm 1
+%endmacro
+
+%endif
+
+%endif
+
+; --------------------------------------------------------------------------
+;  Defines picked up from the C headers
+;
+%include "jsimdcfg.inc"
+
+; --------------------------------------------------------------------------
diff --git a/media/libjpeg/simd/powerpc/jccolext-altivec.c b/media/libjpeg/simd/powerpc/jccolext-altivec.c
new file mode 100644
index 0000000000..170f90ff80
--- /dev/null
+++ b/media/libjpeg/simd/powerpc/jccolext-altivec.c
@@ -0,0 +1,269 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014-2015, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2014, Jay Foad.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jccolor-altivec.c */
+
+
+void jsimd_rgb_ycc_convert_altivec(JDIMENSION img_width, JSAMPARRAY input_buf,
+                                   JSAMPIMAGE output_buf,
+                                   JDIMENSION output_row, int num_rows)
+{
+  JSAMPROW inptr, outptr0, outptr1, outptr2;
+  int pitch = img_width * RGB_PIXELSIZE, num_cols;
+#if __BIG_ENDIAN__
+  int offset;
+#endif
+  unsigned char __attribute__((aligned(16))) tmpbuf[RGB_PIXELSIZE * 16];
+
+  __vector unsigned char rgb0, rgb1 = { 0 }, rgb2 = { 0 },
+    rgbg0, rgbg1, rgbg2, rgbg3, y, cb, cr;
+#if __BIG_ENDIAN__ || RGB_PIXELSIZE == 4
+  __vector unsigned char rgb3 = { 0 };
+#endif
+#if __BIG_ENDIAN__ && RGB_PIXELSIZE == 4
+  __vector unsigned char rgb4 = { 0 };
+#endif
+  __vector short rg0, rg1, rg2, rg3, bg0, bg1, bg2, bg3;
+  __vector unsigned short yl, yh, crl, crh, cbl, cbh;
+  __vector int y0, y1, y2, y3, cr0, cr1, cr2, cr3, cb0, cb1, cb2, cb3;
+
+  /* Constants */
+  __vector short pw_f0299_f0337 = { __4X2(F_0_299, F_0_337) },
+    pw_f0114_f0250 = { __4X2(F_0_114, F_0_250) },
+    pw_mf016_mf033 = { __4X2(-F_0_168, -F_0_331) },
+    pw_mf008_mf041 = { __4X2(-F_0_081, -F_0_418) };
+  __vector unsigned short pw_f050_f000 = { __4X2(F_0_500, 0) };
+  __vector int pd_onehalf = { __4X(ONE_HALF) },
+    pd_onehalfm1_cj = { __4X(ONE_HALF - 1 + (CENTERJSAMPLE << SCALEBITS)) };
+  __vector unsigned char pb_zero = { __16X(0) },
+#if __BIG_ENDIAN__
+    shift_pack_index =
+      {  0,  1,  4,  5,  8,  9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
+#else
+    shift_pack_index =
+      {  2,  3,  6,  7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
+#endif
+
+  while (--num_rows >= 0) {
+    inptr = *input_buf++;
+    outptr0 = output_buf[0][output_row];
+    outptr1 = output_buf[1][output_row];
+    outptr2 = output_buf[2][output_row];
+    output_row++;
+
+    for (num_cols = pitch; num_cols > 0;
+         num_cols -= RGB_PIXELSIZE * 16, inptr += RGB_PIXELSIZE * 16,
+         outptr0 += 16, outptr1 += 16, outptr2 += 16) {
+
+#if __BIG_ENDIAN__
+      /* Load 16 pixels == 48 or 64 bytes */
+      offset = (size_t)inptr & 15;
+      if (offset) {
+        __vector unsigned char unaligned_shift_index;
+        int bytes = num_cols + offset;
+
+        if (bytes < (RGB_PIXELSIZE + 1) * 16 && (bytes & 15)) {
+          /* Slow path to prevent buffer overread.  Since there is no way to
+           * read a partial AltiVec register, overread would occur on the last
+           * chunk of the last image row if the right edge is not on a 16-byte
+           * boundary.  It could also occur on other rows if the bytes per row
+           * is low enough.  Since we can't determine whether we're on the last
+           * image row, we have to assume every row is the last.
+           */
+          memcpy(tmpbuf, inptr, min(num_cols, RGB_PIXELSIZE * 16));
+          rgb0 = vec_ld(0, tmpbuf);
+          rgb1 = vec_ld(16, tmpbuf);
+          rgb2 = vec_ld(32, tmpbuf);
+#if RGB_PIXELSIZE == 4
+          rgb3 = vec_ld(48, tmpbuf);
+#endif
+        } else {
+          /* Fast path */
+          rgb0 = vec_ld(0, inptr);
+          if (bytes > 16)
+            rgb1 = vec_ld(16, inptr);
+          if (bytes > 32)
+            rgb2 = vec_ld(32, inptr);
+          if (bytes > 48)
+            rgb3 = vec_ld(48, inptr);
+#if RGB_PIXELSIZE == 4
+          if (bytes > 64)
+            rgb4 = vec_ld(64, inptr);
+#endif
+          unaligned_shift_index = vec_lvsl(0, inptr);
+          rgb0 = vec_perm(rgb0, rgb1, unaligned_shift_index);
+          rgb1 = vec_perm(rgb1, rgb2, unaligned_shift_index);
+          rgb2 = vec_perm(rgb2, rgb3, unaligned_shift_index);
+#if RGB_PIXELSIZE == 4
+          rgb3 = vec_perm(rgb3, rgb4, unaligned_shift_index);
+#endif
+        }
+      } else {
+#endif /* __BIG_ENDIAN__ */
+        if (num_cols < RGB_PIXELSIZE * 16 && (num_cols & 15)) {
+          /* Slow path */
+          memcpy(tmpbuf, inptr, min(num_cols, RGB_PIXELSIZE * 16));
+          rgb0 = VEC_LD(0, tmpbuf);
+          rgb1 = VEC_LD(16, tmpbuf);
+          rgb2 = VEC_LD(32, tmpbuf);
+#if RGB_PIXELSIZE == 4
+          rgb3 = VEC_LD(48, tmpbuf);
+#endif
+        } else {
+          /* Fast path */
+          rgb0 = VEC_LD(0, inptr);
+          if (num_cols > 16)
+            rgb1 = VEC_LD(16, inptr);
+          if (num_cols > 32)
+            rgb2 = VEC_LD(32, inptr);
+#if RGB_PIXELSIZE == 4
+          if (num_cols > 48)
+            rgb3 = VEC_LD(48, inptr);
+#endif
+        }
+#if __BIG_ENDIAN__
+      }
+#endif
+
+#if RGB_PIXELSIZE == 3
+      /* rgb0 = R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 R4 G4 B4 R5
+       * rgb1 = G5 B5 R6 G6 B6 R7 G7 B7 R8 G8 B8 R9 G9 B9 Ra Ga
+       * rgb2 = Ba Rb Gb Bb Rc Gc Bc Rd Gd Bd Re Ge Be Rf Gf Bf
+       *
+       * rgbg0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 G0 B1 G1 B2 G2 B3 G3
+       * rgbg1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 G4 B5 G5 B6 G6 B7 G7
+       * rgbg2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 G8 B9 G9 Ba Ga Bb Gb
+       * rgbg3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Gc Bd Gd Be Ge Bf Gf
+       */
+      rgbg0 = vec_perm(rgb0, rgb0, (__vector unsigned char)RGBG_INDEX0);
+      rgbg1 = vec_perm(rgb0, rgb1, (__vector unsigned char)RGBG_INDEX1);
+      rgbg2 = vec_perm(rgb1, rgb2, (__vector unsigned char)RGBG_INDEX2);
+      rgbg3 = vec_perm(rgb2, rgb2, (__vector unsigned char)RGBG_INDEX3);
+#else
+      /* rgb0 = R0 G0 B0 X0 R1 G1 B1 X1 R2 G2 B2 X2 R3 G3 B3 X3
+       * rgb1 = R4 G4 B4 X4 R5 G5 B5 X5 R6 G6 B6 X6 R7 G7 B7 X7
+       * rgb2 = R8 G8 B8 X8 R9 G9 B9 X9 Ra Ga Ba Xa Rb Gb Bb Xb
+       * rgb3 = Rc Gc Bc Xc Rd Gd Bd Xd Re Ge Be Xe Rf Gf Bf Xf
+       *
+       * rgbg0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 G0 B1 G1 B2 G2 B3 G3
+       * rgbg1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 G4 B5 G5 B6 G6 B7 G7
+       * rgbg2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 G8 B9 G9 Ba Ga Bb Gb
+       * rgbg3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Gc Bd Gd Be Ge Bf Gf
+       */
+      rgbg0 = vec_perm(rgb0, rgb0, (__vector unsigned char)RGBG_INDEX);
+      rgbg1 = vec_perm(rgb1, rgb1, (__vector unsigned char)RGBG_INDEX);
+      rgbg2 = vec_perm(rgb2, rgb2, (__vector unsigned char)RGBG_INDEX);
+      rgbg3 = vec_perm(rgb3, rgb3, (__vector unsigned char)RGBG_INDEX);
+#endif
+
+      /* rg0 = R0 G0 R1 G1 R2 G2 R3 G3
+       * bg0 = B0 G0 B1 G1 B2 G2 B3 G3
+       * ...
+       *
+       * NOTE: We have to use vec_merge*() here because vec_unpack*() doesn't
+       * support unsigned vectors.
+       */
+      rg0 = (__vector signed short)VEC_UNPACKHU(rgbg0);
+      bg0 = (__vector signed short)VEC_UNPACKLU(rgbg0);
+      rg1 = (__vector signed short)VEC_UNPACKHU(rgbg1);
+      bg1 = (__vector signed short)VEC_UNPACKLU(rgbg1);
+      rg2 = (__vector signed short)VEC_UNPACKHU(rgbg2);
+      bg2 = (__vector signed short)VEC_UNPACKLU(rgbg2);
+      rg3 = (__vector signed short)VEC_UNPACKHU(rgbg3);
+      bg3 = (__vector signed short)VEC_UNPACKLU(rgbg3);
+
+      /* (Original)
+       * Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
+       * Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+       * Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+       *
+       * (This implementation)
+       * Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+       * Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+       * Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+       */
+
+      /* Calculate Y values */
+
+      y0 = vec_msums(rg0, pw_f0299_f0337, pd_onehalf);
+      y1 = vec_msums(rg1, pw_f0299_f0337, pd_onehalf);
+      y2 = vec_msums(rg2, pw_f0299_f0337, pd_onehalf);
+      y3 = vec_msums(rg3, pw_f0299_f0337, pd_onehalf);
+      y0 = vec_msums(bg0, pw_f0114_f0250, y0);
+      y1 = vec_msums(bg1, pw_f0114_f0250, y1);
+      y2 = vec_msums(bg2, pw_f0114_f0250, y2);
+      y3 = vec_msums(bg3, pw_f0114_f0250, y3);
+      /* Clever way to avoid 4 shifts + 2 packs.  This packs the high word from
+       * each dword into a new 16-bit vector, which is the equivalent of
+       * descaling the 32-bit results (right-shifting by 16 bits) and then
+       * packing them.
+       */
+      yl = vec_perm((__vector unsigned short)y0, (__vector unsigned short)y1,
+                    shift_pack_index);
+      yh = vec_perm((__vector unsigned short)y2, (__vector unsigned short)y3,
+                    shift_pack_index);
+      y = vec_pack(yl, yh);
+      vec_st(y, 0, outptr0);
+
+      /* Calculate Cb values */
+      cb0 = vec_msums(rg0, pw_mf016_mf033, pd_onehalfm1_cj);
+      cb1 = vec_msums(rg1, pw_mf016_mf033, pd_onehalfm1_cj);
+      cb2 = vec_msums(rg2, pw_mf016_mf033, pd_onehalfm1_cj);
+      cb3 = vec_msums(rg3, pw_mf016_mf033, pd_onehalfm1_cj);
+      cb0 = (__vector int)vec_msum((__vector unsigned short)bg0, pw_f050_f000,
+                                   (__vector unsigned int)cb0);
+      cb1 = (__vector int)vec_msum((__vector unsigned short)bg1, pw_f050_f000,
+                                   (__vector unsigned int)cb1);
+      cb2 = (__vector int)vec_msum((__vector unsigned short)bg2, pw_f050_f000,
+                                   (__vector unsigned int)cb2);
+      cb3 = (__vector int)vec_msum((__vector unsigned short)bg3, pw_f050_f000,
+                                   (__vector unsigned int)cb3);
+      cbl = vec_perm((__vector unsigned short)cb0,
+                     (__vector unsigned short)cb1, shift_pack_index);
+      cbh = vec_perm((__vector unsigned short)cb2,
+                     (__vector unsigned short)cb3, shift_pack_index);
+      cb = vec_pack(cbl, cbh);
+      vec_st(cb, 0, outptr1);
+
+      /* Calculate Cr values */
+      cr0 = vec_msums(bg0, pw_mf008_mf041, pd_onehalfm1_cj);
+      cr1 = vec_msums(bg1, pw_mf008_mf041, pd_onehalfm1_cj);
+      cr2 = vec_msums(bg2, pw_mf008_mf041, pd_onehalfm1_cj);
+      cr3 = vec_msums(bg3, pw_mf008_mf041, pd_onehalfm1_cj);
+      cr0 = (__vector int)vec_msum((__vector unsigned short)rg0, pw_f050_f000,
+                                   (__vector unsigned int)cr0);
+      cr1 = (__vector int)vec_msum((__vector unsigned short)rg1, pw_f050_f000,
+                                   (__vector unsigned int)cr1);
+      cr2 = (__vector int)vec_msum((__vector unsigned short)rg2, pw_f050_f000,
+                                   (__vector unsigned int)cr2);
+      cr3 = (__vector int)vec_msum((__vector unsigned short)rg3, pw_f050_f000,
+                                   (__vector unsigned int)cr3);
+      crl = vec_perm((__vector unsigned short)cr0,
+                     (__vector unsigned short)cr1, shift_pack_index);
+      crh = vec_perm((__vector unsigned short)cr2,
+                     (__vector unsigned short)cr3, shift_pack_index);
+      cr = vec_pack(crl, crh);
+      vec_st(cr, 0, outptr2);
+    }
+  }
+}
diff --git a/media/libjpeg/simd/powerpc/jccolor-altivec.c b/media/libjpeg/simd/powerpc/jccolor-altivec.c
new file mode 100644
index 0000000000..d670dbcda3
--- /dev/null
+++ b/media/libjpeg/simd/powerpc/jccolor-altivec.c
@@ -0,0 +1,116 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014, D. R. Commander.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* RGB --> YCC CONVERSION */
+
+#include "jsimd_altivec.h"
+
+
+#define F_0_081  5329                 /* FIX(0.08131) */
+#define F_0_114  7471                 /* FIX(0.11400) */
+#define F_0_168  11059                /* FIX(0.16874) */
+#define F_0_250  16384                /* FIX(0.25000) */
+#define F_0_299  19595                /* FIX(0.29900) */
+#define F_0_331  21709                /* FIX(0.33126) */
+#define F_0_418  27439                /* FIX(0.41869) */
+#define F_0_500  32768                /* FIX(0.50000) */
+#define F_0_587  38470                /* FIX(0.58700) */
+#define F_0_337  (F_0_587 - F_0_250)  /* FIX(0.58700) - FIX(0.25000) */
+
+#define SCALEBITS  16
+#define ONE_HALF  (1 << (SCALEBITS - 1))
+
+
+#define RGBG_INDEX0 \
+  {  0,  1,  3,  4,  6,  7,  9, 10,  2,  1,  5,  4,  8,  7, 11, 10 }
+#define RGBG_INDEX1 \
+  { 12, 13, 15, 16, 18, 19, 21, 22, 14, 13, 17, 16, 20, 19, 23, 22 }
+#define RGBG_INDEX2 \
+  {  8,  9, 11, 12, 14, 15, 17, 18, 10,  9, 13, 12, 16, 15, 19, 18 }
+#define RGBG_INDEX3 \
+  {  4,  5,  7,  8, 10, 11, 13, 14,  6,  5,  9,  8, 12, 11, 15, 14 }
+#include "jccolext-altivec.c"
+#undef RGB_PIXELSIZE
+
+#define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+#define jsimd_rgb_ycc_convert_altivec  jsimd_extrgb_ycc_convert_altivec
+#include "jccolext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX0
+#undef RGBG_INDEX1
+#undef RGBG_INDEX2
+#undef RGBG_INDEX3
+#undef jsimd_rgb_ycc_convert_altivec
+
+#define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+#define RGBG_INDEX \
+  {  0,  1,  4,  5,  8,  9, 12, 13,  2,  1,  6,  5, 10,  9, 14, 13 }
+#define jsimd_rgb_ycc_convert_altivec  jsimd_extrgbx_ycc_convert_altivec
+#include "jccolext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX
+#undef jsimd_rgb_ycc_convert_altivec
+
+#define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+#define RGBG_INDEX0 \
+  {  2,  1,  5,  4,  8,  7, 11, 10,  0,  1,  3,  4,  6,  7,  9, 10 }
+#define RGBG_INDEX1 \
+  { 14, 13, 17, 16, 20, 19, 23, 22, 12, 13, 15, 16, 18, 19, 21, 22 }
+#define RGBG_INDEX2 \
+  { 10,  9, 13, 12, 16, 15, 19, 18,  8,  9, 11, 12, 14, 15, 17, 18 }
+#define RGBG_INDEX3 \
+  {  6,  5,  9,  8, 12, 11, 15, 14,  4,  5,  7,  8, 10, 11, 13, 14 }
+#define jsimd_rgb_ycc_convert_altivec  jsimd_extbgr_ycc_convert_altivec
+#include "jccolext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX0
+#undef RGBG_INDEX1
+#undef RGBG_INDEX2
+#undef RGBG_INDEX3
+#undef jsimd_rgb_ycc_convert_altivec
+
+#define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+#define RGBG_INDEX \
+  {  2,  1,  6,  5, 10,  9, 14, 13,  0,  1,  4,  5,  8,  9, 12, 13 }
+#define jsimd_rgb_ycc_convert_altivec  jsimd_extbgrx_ycc_convert_altivec
+#include "jccolext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX
+#undef jsimd_rgb_ycc_convert_altivec
+
+#define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+#define RGBG_INDEX \
+  {  3,  2,  7,  6, 11, 10, 15, 14,  1,  2,  5,  6,  9, 10, 13, 14 }
+#define jsimd_rgb_ycc_convert_altivec  jsimd_extxbgr_ycc_convert_altivec
+#include "jccolext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX
+#undef jsimd_rgb_ycc_convert_altivec
+
+#define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+#define RGBG_INDEX \
+  {  1,  2,  5,  6,  9, 10, 13, 14,  3,  2,  7,  6, 11, 10, 15, 14 }
+#define jsimd_rgb_ycc_convert_altivec  jsimd_extxrgb_ycc_convert_altivec
+#include "jccolext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX
+#undef jsimd_rgb_ycc_convert_altivec
diff --git a/media/libjpeg/simd/powerpc/jcgray-altivec.c b/media/libjpeg/simd/powerpc/jcgray-altivec.c
new file mode 100644
index 0000000000..a11a7e7021
--- /dev/null
+++ b/media/libjpeg/simd/powerpc/jcgray-altivec.c
@@ -0,0 +1,111 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014, D. R. Commander.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* RGB --> GRAYSCALE CONVERSION */
+
+#include "jsimd_altivec.h"
+
+
+#define F_0_114  7471                 /* FIX(0.11400) */
+#define F_0_250  16384                /* FIX(0.25000) */
+#define F_0_299  19595                /* FIX(0.29900) */
+#define F_0_587  38470                /* FIX(0.58700) */
+#define F_0_337  (F_0_587 - F_0_250)  /* FIX(0.58700) - FIX(0.25000) */
+
+#define SCALEBITS  16
+#define ONE_HALF  (1 << (SCALEBITS - 1))
+
+
+#define RGBG_INDEX0 \
+  {  0,  1,  3,  4,  6,  7,  9, 10,  2,  1,  5,  4,  8,  7, 11, 10 }
+#define RGBG_INDEX1 \
+  { 12, 13, 15, 16, 18, 19, 21, 22, 14, 13, 17, 16, 20, 19, 23, 22 }
+#define RGBG_INDEX2 \
+  {  8,  9, 11, 12, 14, 15, 17, 18, 10,  9, 13, 12, 16, 15, 19, 18 }
+#define RGBG_INDEX3 \
+  {  4,  5,  7,  8, 10, 11, 13, 14,  6,  5,  9,  8, 12, 11, 15, 14 }
+#include "jcgryext-altivec.c"
+#undef RGB_PIXELSIZE
+
+#define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+#define jsimd_rgb_gray_convert_altivec  jsimd_extrgb_gray_convert_altivec
+#include "jcgryext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX0
+#undef RGBG_INDEX1
+#undef RGBG_INDEX2
+#undef RGBG_INDEX3
+#undef jsimd_rgb_gray_convert_altivec
+
+#define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+#define RGBG_INDEX \
+  {  0,  1,  4,  5,  8,  9, 12, 13,  2,  1,  6,  5, 10,  9, 14, 13 }
+#define jsimd_rgb_gray_convert_altivec  jsimd_extrgbx_gray_convert_altivec
+#include "jcgryext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX
+#undef jsimd_rgb_gray_convert_altivec
+
+#define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+#define RGBG_INDEX0 \
+  {  2,  1,  5,  4,  8,  7, 11, 10,  0,  1,  3,  4,  6,  7,  9, 10 }
+#define RGBG_INDEX1 \
+  { 14, 13, 17, 16, 20, 19, 23, 22, 12, 13, 15, 16, 18, 19, 21, 22 }
+#define RGBG_INDEX2 \
+  { 10,  9, 13, 12, 16, 15, 19, 18,  8,  9, 11, 12, 14, 15, 17, 18 }
+#define RGBG_INDEX3 \
+  {  6,  5,  9,  8, 12, 11, 15, 14,  4,  5,  7,  8, 10, 11, 13, 14 }
+#define jsimd_rgb_gray_convert_altivec  jsimd_extbgr_gray_convert_altivec
+#include "jcgryext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX0
+#undef RGBG_INDEX1
+#undef RGBG_INDEX2
+#undef RGBG_INDEX3
+#undef jsimd_rgb_gray_convert_altivec
+
+#define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+#define RGBG_INDEX \
+  {  2,  1,  6,  5, 10,  9, 14, 13,  0,  1,  4,  5,  8,  9, 12, 13 }
+#define jsimd_rgb_gray_convert_altivec  jsimd_extbgrx_gray_convert_altivec
+#include "jcgryext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX
+#undef jsimd_rgb_gray_convert_altivec
+
+#define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+#define RGBG_INDEX \
+  {  3,  2,  7,  6, 11, 10, 15, 14,  1,  2,  5,  6,  9, 10, 13, 14 }
+#define jsimd_rgb_gray_convert_altivec  jsimd_extxbgr_gray_convert_altivec
+#include "jcgryext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX
+#undef jsimd_rgb_gray_convert_altivec
+
+#define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+#define RGBG_INDEX \
+  {  1,  2,  5,  6,  9, 10, 13, 14,  3,  2,  7,  6, 11, 10, 15, 14 }
+#define jsimd_rgb_gray_convert_altivec  jsimd_extxrgb_gray_convert_altivec
+#include "jcgryext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX
+#undef jsimd_rgb_gray_convert_altivec
diff --git a/media/libjpeg/simd/powerpc/jcgryext-altivec.c b/media/libjpeg/simd/powerpc/jcgryext-altivec.c
new file mode 100644
index 0000000000..b280cbbded
--- /dev/null
+++ b/media/libjpeg/simd/powerpc/jcgryext-altivec.c
@@ -0,0 +1,228 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014-2015, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2014, Jay Foad.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jcgray-altivec.c */
+
+
+void jsimd_rgb_gray_convert_altivec(JDIMENSION img_width, JSAMPARRAY input_buf,
+                                    JSAMPIMAGE output_buf,
+                                    JDIMENSION output_row, int num_rows)
+{
+  JSAMPROW inptr, outptr;
+  int pitch = img_width * RGB_PIXELSIZE, num_cols;
+#if __BIG_ENDIAN__
+  int offset;
+  unsigned char __attribute__((aligned(16))) tmpbuf[RGB_PIXELSIZE * 16];
+#endif
+
+  __vector unsigned char rgb0, rgb1 = { 0 }, rgb2 = { 0 },
+    rgbg0, rgbg1, rgbg2, rgbg3, y;
+#if __BIG_ENDIAN__ || RGB_PIXELSIZE == 4
+  __vector unsigned char rgb3 = { 0 };
+#endif
+#if __BIG_ENDIAN__ && RGB_PIXELSIZE == 4
+  __vector unsigned char rgb4 = { 0 };
+#endif
+  __vector short rg0, rg1, rg2, rg3, bg0, bg1, bg2, bg3;
+  __vector unsigned short yl, yh;
+  __vector int y0, y1, y2, y3;
+
+  /* Constants */
+  __vector short pw_f0299_f0337 = { __4X2(F_0_299, F_0_337) },
+    pw_f0114_f0250 = { __4X2(F_0_114, F_0_250) };
+  __vector int pd_onehalf = { __4X(ONE_HALF) };
+  __vector unsigned char pb_zero = { __16X(0) },
+#if __BIG_ENDIAN__
+    shift_pack_index =
+      { 0, 1, 4, 5,  8,  9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
+#else
+    shift_pack_index =
+      { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
+#endif
+
+  while (--num_rows >= 0) {
+    inptr = *input_buf++;
+    outptr = output_buf[0][output_row];
+    output_row++;
+
+    for (num_cols = pitch; num_cols > 0;
+         num_cols -= RGB_PIXELSIZE * 16, inptr += RGB_PIXELSIZE * 16,
+         outptr += 16) {
+
+#if __BIG_ENDIAN__
+      /* Load 16 pixels == 48 or 64 bytes */
+      offset = (size_t)inptr & 15;
+      if (offset) {
+        __vector unsigned char unaligned_shift_index;
+        int bytes = num_cols + offset;
+
+        if (bytes < (RGB_PIXELSIZE + 1) * 16 && (bytes & 15)) {
+          /* Slow path to prevent buffer overread.  Since there is no way to
+           * read a partial AltiVec register, overread would occur on the last
+           * chunk of the last image row if the right edge is not on a 16-byte
+           * boundary.  It could also occur on other rows if the bytes per row
+           * is low enough.  Since we can't determine whether we're on the last
+           * image row, we have to assume every row is the last.
+           */
+          memcpy(tmpbuf, inptr, min(num_cols, RGB_PIXELSIZE * 16));
+          rgb0 = vec_ld(0, tmpbuf);
+          rgb1 = vec_ld(16, tmpbuf);
+          rgb2 = vec_ld(32, tmpbuf);
+#if RGB_PIXELSIZE == 4
+          rgb3 = vec_ld(48, tmpbuf);
+#endif
+        } else {
+          /* Fast path */
+          rgb0 = vec_ld(0, inptr);
+          if (bytes > 16)
+            rgb1 = vec_ld(16, inptr);
+          if (bytes > 32)
+            rgb2 = vec_ld(32, inptr);
+          if (bytes > 48)
+            rgb3 = vec_ld(48, inptr);
+#if RGB_PIXELSIZE == 4
+          if (bytes > 64)
+            rgb4 = vec_ld(64, inptr);
+#endif
+          unaligned_shift_index = vec_lvsl(0, inptr);
+          rgb0 = vec_perm(rgb0, rgb1, unaligned_shift_index);
+          rgb1 = vec_perm(rgb1, rgb2, unaligned_shift_index);
+          rgb2 = vec_perm(rgb2, rgb3, unaligned_shift_index);
+#if RGB_PIXELSIZE == 4
+          rgb3 = vec_perm(rgb3, rgb4, unaligned_shift_index);
+#endif
+        }
+      } else {
+        if (num_cols < RGB_PIXELSIZE * 16 && (num_cols & 15)) {
+          /* Slow path */
+          memcpy(tmpbuf, inptr, min(num_cols, RGB_PIXELSIZE * 16));
+          rgb0 = vec_ld(0, tmpbuf);
+          rgb1 = vec_ld(16, tmpbuf);
+          rgb2 = vec_ld(32, tmpbuf);
+#if RGB_PIXELSIZE == 4
+          rgb3 = vec_ld(48, tmpbuf);
+#endif
+        } else {
+          /* Fast path */
+          rgb0 = vec_ld(0, inptr);
+          if (num_cols > 16)
+            rgb1 = vec_ld(16, inptr);
+          if (num_cols > 32)
+            rgb2 = vec_ld(32, inptr);
+#if RGB_PIXELSIZE == 4
+          if (num_cols > 48)
+            rgb3 = vec_ld(48, inptr);
+#endif
+        }
+      }
+#else
+      /* Little endian */
+      rgb0 = vec_vsx_ld(0, inptr);
+      if (num_cols > 16)
+        rgb1 = vec_vsx_ld(16, inptr);
+      if (num_cols > 32)
+        rgb2 = vec_vsx_ld(32, inptr);
+#if RGB_PIXELSIZE == 4
+      if (num_cols > 48)
+        rgb3 = vec_vsx_ld(48, inptr);
+#endif
+#endif
+
+#if RGB_PIXELSIZE == 3
+      /* rgb0 = R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 R4 G4 B4 R5
+       * rgb1 = G5 B5 R6 G6 B6 R7 G7 B7 R8 G8 B8 R9 G9 B9 Ra Ga
+       * rgb2 = Ba Rb Gb Bb Rc Gc Bc Rd Gd Bd Re Ge Be Rf Gf Bf
+       *
+       * rgbg0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 G0 B1 G1 B2 G2 B3 G3
+       * rgbg1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 G4 B5 G5 B6 G6 B7 G7
+       * rgbg2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 G8 B9 G9 Ba Ga Bb Gb
+       * rgbg3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Gc Bd Gd Be Ge Bf Gf
+       */
+      rgbg0 = vec_perm(rgb0, rgb0, (__vector unsigned char)RGBG_INDEX0);
+      rgbg1 = vec_perm(rgb0, rgb1, (__vector unsigned char)RGBG_INDEX1);
+      rgbg2 = vec_perm(rgb1, rgb2, (__vector unsigned char)RGBG_INDEX2);
+      rgbg3 = vec_perm(rgb2, rgb2, (__vector unsigned char)RGBG_INDEX3);
+#else
+      /* rgb0 = R0 G0 B0 X0 R1 G1 B1 X1 R2 G2 B2 X2 R3 G3 B3 X3
+       * rgb1 = R4 G4 B4 X4 R5 G5 B5 X5 R6 G6 B6 X6 R7 G7 B7 X7
+       * rgb2 = R8 G8 B8 X8 R9 G9 B9 X9 Ra Ga Ba Xa Rb Gb Bb Xb
+       * rgb3 = Rc Gc Bc Xc Rd Gd Bd Xd Re Ge Be Xe Rf Gf Bf Xf
+       *
+       * rgbg0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 G0 B1 G1 B2 G2 B3 G3
+       * rgbg1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 G4 B5 G5 B6 G6 B7 G7
+       * rgbg2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 G8 B9 G9 Ba Ga Bb Gb
+       * rgbg3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Gc Bd Gd Be Ge Bf Gf
+       */
+      rgbg0 = vec_perm(rgb0, rgb0, (__vector unsigned char)RGBG_INDEX);
+      rgbg1 = vec_perm(rgb1, rgb1, (__vector unsigned char)RGBG_INDEX);
+      rgbg2 = vec_perm(rgb2, rgb2, (__vector unsigned char)RGBG_INDEX);
+      rgbg3 = vec_perm(rgb3, rgb3, (__vector unsigned char)RGBG_INDEX);
+#endif
+
+      /* rg0 = R0 G0 R1 G1 R2 G2 R3 G3
+       * bg0 = B0 G0 B1 G1 B2 G2 B3 G3
+       * ...
+       *
+       * NOTE: We have to use vec_merge*() here because vec_unpack*() doesn't
+       * support unsigned vectors.
+       */
+      rg0 = (__vector signed short)VEC_UNPACKHU(rgbg0);
+      bg0 = (__vector signed short)VEC_UNPACKLU(rgbg0);
+      rg1 = (__vector signed short)VEC_UNPACKHU(rgbg1);
+      bg1 = (__vector signed short)VEC_UNPACKLU(rgbg1);
+      rg2 = (__vector signed short)VEC_UNPACKHU(rgbg2);
+      bg2 = (__vector signed short)VEC_UNPACKLU(rgbg2);
+      rg3 = (__vector signed short)VEC_UNPACKHU(rgbg3);
+      bg3 = (__vector signed short)VEC_UNPACKLU(rgbg3);
+
+      /* (Original)
+       * Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
+       *
+       * (This implementation)
+       * Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+       */
+
+      /* Calculate Y values */
+
+      y0 = vec_msums(rg0, pw_f0299_f0337, pd_onehalf);
+      y1 = vec_msums(rg1, pw_f0299_f0337, pd_onehalf);
+      y2 = vec_msums(rg2, pw_f0299_f0337, pd_onehalf);
+      y3 = vec_msums(rg3, pw_f0299_f0337, pd_onehalf);
+      y0 = vec_msums(bg0, pw_f0114_f0250, y0);
+      y1 = vec_msums(bg1, pw_f0114_f0250, y1);
+      y2 = vec_msums(bg2, pw_f0114_f0250, y2);
+      y3 = vec_msums(bg3, pw_f0114_f0250, y3);
+      /* Clever way to avoid 4 shifts + 2 packs.  This packs the high word from
+       * each dword into a new 16-bit vector, which is the equivalent of
+       * descaling the 32-bit results (right-shifting by 16 bits) and then
+       * packing them.
+       */
+      yl = vec_perm((__vector unsigned short)y0, (__vector unsigned short)y1,
+                    shift_pack_index);
+      yh = vec_perm((__vector unsigned short)y2, (__vector unsigned short)y3,
+                    shift_pack_index);
+      y = vec_pack(yl, yh);
+      vec_st(y, 0, outptr);
+    }
+  }
+}
diff --git a/media/libjpeg/simd/powerpc/jcsample-altivec.c b/media/libjpeg/simd/powerpc/jcsample-altivec.c
new file mode 100644
index 0000000000..6e25b8db90
--- /dev/null
+++ b/media/libjpeg/simd/powerpc/jcsample-altivec.c
@@ -0,0 +1,159 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2015, D. R. Commander.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* CHROMA DOWNSAMPLING */
+
+#include "jsimd_altivec.h"
+#include "jcsample.h"
+
+
+void jsimd_h2v1_downsample_altivec(JDIMENSION image_width,
+                                   int max_v_samp_factor,
+                                   JDIMENSION v_samp_factor,
+                                   JDIMENSION width_in_blocks,
+                                   JSAMPARRAY input_data,
+                                   JSAMPARRAY output_data)
+{
+  int outrow, outcol;
+  JDIMENSION output_cols = width_in_blocks * DCTSIZE;
+  JSAMPROW inptr, outptr;
+
+  __vector unsigned char this0, next0, out;
+  __vector unsigned short this0e, this0o, next0e, next0o, outl, outh;
+
+  /* Constants */
+  __vector unsigned short pw_bias = { __4X2(0, 1) },
+    pw_one = { __8X(1) };
+  __vector unsigned char even_odd_index =
+    {  0,  2,  4,  6,  8, 10, 12, 14,  1,  3,  5,  7,  9, 11, 13, 15 },
+    pb_zero = { __16X(0) };
+
+  expand_right_edge(input_data, max_v_samp_factor, image_width,
+                    output_cols * 2);
+
+  for (outrow = 0; outrow < v_samp_factor; outrow++) {
+    outptr = output_data[outrow];
+    inptr = input_data[outrow];
+
+    for (outcol = output_cols; outcol > 0;
+         outcol -= 16, inptr += 32, outptr += 16) {
+
+      this0 = vec_ld(0, inptr);
+      this0 = vec_perm(this0, this0, even_odd_index);
+      this0e = (__vector unsigned short)VEC_UNPACKHU(this0);
+      this0o = (__vector unsigned short)VEC_UNPACKLU(this0);
+      outl = vec_add(this0e, this0o);
+      outl = vec_add(outl, pw_bias);
+      outl = vec_sr(outl, pw_one);
+
+      if (outcol > 8) {
+        next0 = vec_ld(16, inptr);
+        next0 = vec_perm(next0, next0, even_odd_index);
+        next0e = (__vector unsigned short)VEC_UNPACKHU(next0);
+        next0o = (__vector unsigned short)VEC_UNPACKLU(next0);
+        outh = vec_add(next0e, next0o);
+        outh = vec_add(outh, pw_bias);
+        outh = vec_sr(outh, pw_one);
+      } else
+        outh = vec_splat_u16(0);
+
+      out = vec_pack(outl, outh);
+      vec_st(out, 0, outptr);
+    }
+  }
+}
+
+
+void
+jsimd_h2v2_downsample_altivec(JDIMENSION image_width, int max_v_samp_factor,
+                              JDIMENSION v_samp_factor,
+                              JDIMENSION width_in_blocks,
+                              JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  int inrow, outrow, outcol;
+  JDIMENSION output_cols = width_in_blocks * DCTSIZE;
+  JSAMPROW inptr0, inptr1, outptr;
+
+  __vector unsigned char this0, next0, this1, next1, out;
+  __vector unsigned short this0e, this0o, next0e, next0o, this1e, this1o,
+    next1e, next1o, out0l, out0h, out1l, out1h, outl, outh;
+
+  /* Constants */
+  __vector unsigned short pw_bias = { __4X2(1, 2) },
+    pw_two = { __8X(2) };
+  __vector unsigned char even_odd_index =
+    {  0,  2,  4,  6,  8, 10, 12, 14,  1,  3,  5,  7,  9, 11, 13, 15 },
+    pb_zero = { __16X(0) };
+
+  expand_right_edge(input_data, max_v_samp_factor, image_width,
+                    output_cols * 2);
+
+  for (inrow = 0, outrow = 0; outrow < v_samp_factor;
+       inrow += 2, outrow++) {
+
+    inptr0 = input_data[inrow];
+    inptr1 = input_data[inrow + 1];
+    outptr = output_data[outrow];
+
+    for (outcol = output_cols; outcol > 0;
+         outcol -= 16, inptr0 += 32, inptr1 += 32, outptr += 16) {
+
+      this0 = vec_ld(0, inptr0);
+      this0 = vec_perm(this0, this0, even_odd_index);
+      this0e = (__vector unsigned short)VEC_UNPACKHU(this0);
+      this0o = (__vector unsigned short)VEC_UNPACKLU(this0);
+      out0l = vec_add(this0e, this0o);
+
+      this1 = vec_ld(0, inptr1);
+      this1 = vec_perm(this1, this1, even_odd_index);
+      this1e = (__vector unsigned short)VEC_UNPACKHU(this1);
+      this1o = (__vector unsigned short)VEC_UNPACKLU(this1);
+      out1l = vec_add(this1e, this1o);
+
+      outl = vec_add(out0l, out1l);
+      outl = vec_add(outl, pw_bias);
+      outl = vec_sr(outl, pw_two);
+
+      if (outcol > 8) {
+        next0 = vec_ld(16, inptr0);
+        next0 = vec_perm(next0, next0, even_odd_index);
+        next0e = (__vector unsigned short)VEC_UNPACKHU(next0);
+        next0o = (__vector unsigned short)VEC_UNPACKLU(next0);
+        out0h = vec_add(next0e, next0o);
+
+        next1 = vec_ld(16, inptr1);
+        next1 = vec_perm(next1, next1, even_odd_index);
+        next1e = (__vector unsigned short)VEC_UNPACKHU(next1);
+        next1o = (__vector unsigned short)VEC_UNPACKLU(next1);
+        out1h = vec_add(next1e, next1o);
+
+        outh = vec_add(out0h, out1h);
+        outh = vec_add(outh, pw_bias);
+        outh = vec_sr(outh, pw_two);
+      } else
+        outh = vec_splat_u16(0);
+
+      out = vec_pack(outl, outh);
+      vec_st(out, 0, outptr);
+    }
+  }
+}
diff --git a/media/libjpeg/simd/powerpc/jcsample.h b/media/libjpeg/simd/powerpc/jcsample.h
new file mode 100644
index 0000000000..bd07fcc4ed
--- /dev/null
+++ b/media/libjpeg/simd/powerpc/jcsample.h
@@ -0,0 +1,28 @@
+/*
+ * jcsample.h
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1991-1996, Thomas G. Lane.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ */
+
+LOCAL(void)
+expand_right_edge(JSAMPARRAY image_data, int num_rows, JDIMENSION input_cols,
+                  JDIMENSION output_cols)
+{
+  register JSAMPROW ptr;
+  register JSAMPLE pixval;
+  register int count;
+  int row;
+  int numcols = (int)(output_cols - input_cols);
+
+  if (numcols > 0) {
+    for (row = 0; row < num_rows; row++) {
+      ptr = image_data[row] + input_cols;
+      pixval = ptr[-1];
+      for (count = numcols; count > 0; count--)
+        *ptr++ = pixval;
+    }
+  }
+}
diff --git a/media/libjpeg/simd/powerpc/jdcolext-altivec.c b/media/libjpeg/simd/powerpc/jdcolext-altivec.c
new file mode 100644
index 0000000000..68d52bd8a2
--- /dev/null
+++ b/media/libjpeg/simd/powerpc/jdcolext-altivec.c
@@ -0,0 +1,276 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2015, D. R. Commander.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jdcolor-altivec.c */
+
+
+void jsimd_ycc_rgb_convert_altivec(JDIMENSION out_width, JSAMPIMAGE input_buf,
+                                   JDIMENSION input_row, JSAMPARRAY output_buf,
+                                   int num_rows)
+{
+  JSAMPROW outptr, inptr0, inptr1, inptr2;
+  int pitch = out_width * RGB_PIXELSIZE, num_cols;
+#if __BIG_ENDIAN__
+  int offset;
+#endif
+  unsigned char __attribute__((aligned(16))) tmpbuf[RGB_PIXELSIZE * 16];
+
+  __vector unsigned char rgb0, rgb1, rgb2, rgbx0, rgbx1, rgbx2, rgbx3,
+    y, cb, cr;
+#if __BIG_ENDIAN__
+  __vector unsigned char edgel, edgeh, edges, out0, out1, out2, out3;
+#if RGB_PIXELSIZE == 4
+  __vector unsigned char out4;
+#endif
+#endif
+#if RGB_PIXELSIZE == 4
+  __vector unsigned char rgb3;
+#endif
+  __vector short rg0, rg1, rg2, rg3, bx0, bx1, bx2, bx3, yl, yh, cbl, cbh,
+    crl, crh, rl, rh, gl, gh, bl, bh, g0w, g1w, g2w, g3w;
+  __vector int g0, g1, g2, g3;
+
+  /* Constants
+   * NOTE: The >> 1 is to compensate for the fact that vec_madds() returns 17
+   * high-order bits, not 16.
+   */
+  __vector short pw_f0402 = { __8X(F_0_402 >> 1) },
+    pw_mf0228 = { __8X(-F_0_228 >> 1) },
+    pw_mf0344_f0285 = { __4X2(-F_0_344, F_0_285) },
+    pw_one = { __8X(1) }, pw_255 = { __8X(255) },
+    pw_cj = { __8X(CENTERJSAMPLE) };
+  __vector int pd_onehalf = { __4X(ONE_HALF) };
+  __vector unsigned char pb_zero = { __16X(0) },
+#if __BIG_ENDIAN__
+    shift_pack_index =
+      {  0,  1,  4,  5,  8,  9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
+#else
+    shift_pack_index =
+      {  2,  3,  6,  7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
+#endif
+
+  while (--num_rows >= 0) {
+    inptr0 = input_buf[0][input_row];
+    inptr1 = input_buf[1][input_row];
+    inptr2 = input_buf[2][input_row];
+    input_row++;
+    outptr = *output_buf++;
+
+    for (num_cols = pitch; num_cols > 0;
+         num_cols -= RGB_PIXELSIZE * 16, outptr += RGB_PIXELSIZE * 16,
+         inptr0 += 16, inptr1 += 16, inptr2 += 16) {
+
+      y = vec_ld(0, inptr0);
+      /* NOTE: We have to use vec_merge*() here because vec_unpack*() doesn't
+       * support unsigned vectors.
+       */
+      yl = (__vector signed short)VEC_UNPACKHU(y);
+      yh = (__vector signed short)VEC_UNPACKLU(y);
+
+      cb = vec_ld(0, inptr1);
+      cbl = (__vector signed short)VEC_UNPACKHU(cb);
+      cbh = (__vector signed short)VEC_UNPACKLU(cb);
+      cbl = vec_sub(cbl, pw_cj);
+      cbh = vec_sub(cbh, pw_cj);
+
+      cr = vec_ld(0, inptr2);
+      crl = (__vector signed short)VEC_UNPACKHU(cr);
+      crh = (__vector signed short)VEC_UNPACKLU(cr);
+      crl = vec_sub(crl, pw_cj);
+      crh = vec_sub(crh, pw_cj);
+
+      /* (Original)
+       * R = Y                + 1.40200 * Cr
+       * G = Y - 0.34414 * Cb - 0.71414 * Cr
+       * B = Y + 1.77200 * Cb
+       *
+       * (This implementation)
+       * R = Y                + 0.40200 * Cr + Cr
+       * G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+       * B = Y - 0.22800 * Cb + Cb + Cb
+       */
+      bl = vec_add(cbl, cbl);
+      bh = vec_add(cbh, cbh);
+      bl = vec_madds(bl, pw_mf0228, pw_one);
+      bh = vec_madds(bh, pw_mf0228, pw_one);
+      bl = vec_sra(bl, (__vector unsigned short)pw_one);
+      bh = vec_sra(bh, (__vector unsigned short)pw_one);
+      bl = vec_add(bl, cbl);
+      bh = vec_add(bh, cbh);
+      bl = vec_add(bl, cbl);
+      bh = vec_add(bh, cbh);
+      bl = vec_add(bl, yl);
+      bh = vec_add(bh, yh);
+
+      rl = vec_add(crl, crl);
+      rh = vec_add(crh, crh);
+      rl = vec_madds(rl, pw_f0402, pw_one);
+      rh = vec_madds(rh, pw_f0402, pw_one);
+      rl = vec_sra(rl, (__vector unsigned short)pw_one);
+      rh = vec_sra(rh, (__vector unsigned short)pw_one);
+      rl = vec_add(rl, crl);
+      rh = vec_add(rh, crh);
+      rl = vec_add(rl, yl);
+      rh = vec_add(rh, yh);
+
+      g0w = vec_mergeh(cbl, crl);
+      g1w = vec_mergel(cbl, crl);
+      g0 = vec_msums(g0w, pw_mf0344_f0285, pd_onehalf);
+      g1 = vec_msums(g1w, pw_mf0344_f0285, pd_onehalf);
+      g2w = vec_mergeh(cbh, crh);
+      g3w = vec_mergel(cbh, crh);
+      g2 = vec_msums(g2w, pw_mf0344_f0285, pd_onehalf);
+      g3 = vec_msums(g3w, pw_mf0344_f0285, pd_onehalf);
+      /* Clever way to avoid 4 shifts + 2 packs.  This packs the high word from
+       * each dword into a new 16-bit vector, which is the equivalent of
+       * descaling the 32-bit results (right-shifting by 16 bits) and then
+       * packing them.
+       */
+      gl = vec_perm((__vector short)g0, (__vector short)g1, shift_pack_index);
+      gh = vec_perm((__vector short)g2, (__vector short)g3, shift_pack_index);
+      gl = vec_sub(gl, crl);
+      gh = vec_sub(gh, crh);
+      gl = vec_add(gl, yl);
+      gh = vec_add(gh, yh);
+
+      rg0 = vec_mergeh(rl, gl);
+      bx0 = vec_mergeh(bl, pw_255);
+      rg1 = vec_mergel(rl, gl);
+      bx1 = vec_mergel(bl, pw_255);
+      rg2 = vec_mergeh(rh, gh);
+      bx2 = vec_mergeh(bh, pw_255);
+      rg3 = vec_mergel(rh, gh);
+      bx3 = vec_mergel(bh, pw_255);
+
+      rgbx0 = vec_packsu(rg0, bx0);
+      rgbx1 = vec_packsu(rg1, bx1);
+      rgbx2 = vec_packsu(rg2, bx2);
+      rgbx3 = vec_packsu(rg3, bx3);
+
+#if RGB_PIXELSIZE == 3
+      /* rgbx0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 X0 B1 X1 B2 X2 B3 X3
+       * rgbx1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 X4 B5 X5 B6 X6 B7 X7
+       * rgbx2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 X8 B9 X9 Ba Xa Bb Xb
+       * rgbx3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Xc Bd Xd Be Xe Bf Xf
+       *
+       * rgb0 = R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 R4 G4 B4 R5
+       * rgb1 = G5 B5 R6 G6 B6 R7 G7 B7 R8 G8 B8 R9 G9 B9 Ra Ga
+       * rgb2 = Ba Rb Gb Bb Rc Gc Bc Rd Gd Bd Re Ge Be Rf Gf Bf
+       */
+      rgb0 = vec_perm(rgbx0, rgbx1, (__vector unsigned char)RGB_INDEX0);
+      rgb1 = vec_perm(rgbx1, rgbx2, (__vector unsigned char)RGB_INDEX1);
+      rgb2 = vec_perm(rgbx2, rgbx3, (__vector unsigned char)RGB_INDEX2);
+#else
+      /* rgbx0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 X0 B1 X1 B2 X2 B3 X3
+       * rgbx1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 X4 B5 X5 B6 X6 B7 X7
+       * rgbx2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 X8 B9 X9 Ba Xa Bb Xb
+       * rgbx3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Xc Bd Xd Be Xe Bf Xf
+       *
+       * rgb0 = R0 G0 B0 X0 R1 G1 B1 X1 R2 G2 B2 X2 R3 G3 B3 X3
+       * rgb1 = R4 G4 B4 X4 R5 G5 B5 X5 R6 G6 B6 X6 R7 G7 B7 X7
+       * rgb2 = R8 G8 B8 X8 R9 G9 B9 X9 Ra Ga Ba Xa Rb Gb Bb Xb
+       * rgb3 = Rc Gc Bc Xc Rd Gd Bd Xd Re Ge Be Xe Rf Gf Bf Xf
+       */
+      rgb0 = vec_perm(rgbx0, rgbx0, (__vector unsigned char)RGB_INDEX);
+      rgb1 = vec_perm(rgbx1, rgbx1, (__vector unsigned char)RGB_INDEX);
+      rgb2 = vec_perm(rgbx2, rgbx2, (__vector unsigned char)RGB_INDEX);
+      rgb3 = vec_perm(rgbx3, rgbx3, (__vector unsigned char)RGB_INDEX);
+#endif
+
+#if __BIG_ENDIAN__
+      offset = (size_t)outptr & 15;
+      if (offset) {
+        __vector unsigned char unaligned_shift_index;
+        int bytes = num_cols + offset;
+
+        if (bytes < (RGB_PIXELSIZE + 1) * 16 && (bytes & 15)) {
+          /* Slow path to prevent buffer overwrite.  Since there is no way to
+           * write a partial AltiVec register, overwrite would occur on the
+           * last chunk of the last image row if the right edge is not on a
+           * 16-byte boundary.  It could also occur on other rows if the bytes
+           * per row is low enough.  Since we can't determine whether we're on
+           * the last image row, we have to assume every row is the last.
+           */
+          vec_st(rgb0, 0, tmpbuf);
+          vec_st(rgb1, 16, tmpbuf);
+          vec_st(rgb2, 32, tmpbuf);
+#if RGB_PIXELSIZE == 4
+          vec_st(rgb3, 48, tmpbuf);
+#endif
+          memcpy(outptr, tmpbuf, min(num_cols, RGB_PIXELSIZE * 16));
+        } else {
+          /* Fast path */
+          unaligned_shift_index = vec_lvsl(0, outptr);
+          edgel = vec_ld(0, outptr);
+          edgeh = vec_ld(min(num_cols - 1, RGB_PIXELSIZE * 16), outptr);
+          edges = vec_perm(edgeh, edgel, unaligned_shift_index);
+          unaligned_shift_index = vec_lvsr(0, outptr);
+          out0 = vec_perm(edges, rgb0, unaligned_shift_index);
+          out1 = vec_perm(rgb0, rgb1, unaligned_shift_index);
+          out2 = vec_perm(rgb1, rgb2, unaligned_shift_index);
+#if RGB_PIXELSIZE == 4
+          out3 = vec_perm(rgb2, rgb3, unaligned_shift_index);
+          out4 = vec_perm(rgb3, edges, unaligned_shift_index);
+#else
+          out3 = vec_perm(rgb2, edges, unaligned_shift_index);
+#endif
+          vec_st(out0, 0, outptr);
+          if (bytes > 16)
+            vec_st(out1, 16, outptr);
+          if (bytes > 32)
+            vec_st(out2, 32, outptr);
+          if (bytes > 48)
+            vec_st(out3, 48, outptr);
+#if RGB_PIXELSIZE == 4
+          if (bytes > 64)
+            vec_st(out4, 64, outptr);
+#endif
+        }
+      } else {
+#endif /* __BIG_ENDIAN__ */
+        if (num_cols < RGB_PIXELSIZE * 16 && (num_cols & 15)) {
+          /* Slow path */
+          VEC_ST(rgb0, 0, tmpbuf);
+          VEC_ST(rgb1, 16, tmpbuf);
+          VEC_ST(rgb2, 32, tmpbuf);
+#if RGB_PIXELSIZE == 4
+          VEC_ST(rgb3, 48, tmpbuf);
+#endif
+          memcpy(outptr, tmpbuf, min(num_cols, RGB_PIXELSIZE * 16));
+        } else {
+          /* Fast path */
+          VEC_ST(rgb0, 0, outptr);
+          if (num_cols > 16)
+            VEC_ST(rgb1, 16, outptr);
+          if (num_cols > 32)
+            VEC_ST(rgb2, 32, outptr);
+#if RGB_PIXELSIZE == 4
+          if (num_cols > 48)
+            VEC_ST(rgb3, 48, outptr);
+#endif
+        }
+#if __BIG_ENDIAN__
+      }
+#endif
+    }
+  }
+}
diff --git a/media/libjpeg/simd/powerpc/jdcolor-altivec.c b/media/libjpeg/simd/powerpc/jdcolor-altivec.c
new file mode 100644
index 0000000000..eb35b67176
--- /dev/null
+++ b/media/libjpeg/simd/powerpc/jdcolor-altivec.c
@@ -0,0 +1,106 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2015, D. R. Commander.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* YCC --> RGB CONVERSION */
+
+#include "jsimd_altivec.h"
+
+
+#define F_0_344  22554              /* FIX(0.34414) */
+#define F_0_714  46802              /* FIX(0.71414) */
+#define F_1_402  91881              /* FIX(1.40200) */
+#define F_1_772  116130             /* FIX(1.77200) */
+#define F_0_402  (F_1_402 - 65536)  /* FIX(1.40200) - FIX(1) */
+#define F_0_285  (65536 - F_0_714)  /* FIX(1) - FIX(0.71414) */
+#define F_0_228  (131072 - F_1_772) /* FIX(2) - FIX(1.77200) */
+
+#define SCALEBITS  16
+#define ONE_HALF  (1 << (SCALEBITS - 1))
+
+#define RGB_INDEX0 \
+  {  0,  1,  8,  2,  3, 10,  4,  5, 12,  6,  7, 14, 16, 17, 24, 18 }
+#define RGB_INDEX1 \
+  {  3, 10,  4,  5, 12,  6,  7, 14, 16, 17, 24, 18, 19, 26, 20, 21 }
+#define RGB_INDEX2 \
+  { 12,  6,  7, 14, 16, 17, 24, 18, 19, 26, 20, 21, 28, 22, 23, 30 }
+#include "jdcolext-altivec.c"
+#undef RGB_PIXELSIZE
+
+#define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+#define jsimd_ycc_rgb_convert_altivec  jsimd_ycc_extrgb_convert_altivec
+#include "jdcolext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX0
+#undef RGB_INDEX1
+#undef RGB_INDEX2
+#undef jsimd_ycc_rgb_convert_altivec
+
+#define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+#define RGB_INDEX \
+  {  0,  1,  8,  9,  2,  3, 10, 11,  4,  5, 12, 13,  6,  7, 14, 15 }
+#define jsimd_ycc_rgb_convert_altivec  jsimd_ycc_extrgbx_convert_altivec
+#include "jdcolext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX
+#undef jsimd_ycc_rgb_convert_altivec
+
+#define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+#define RGB_INDEX0 \
+  {  8,  1,  0, 10,  3,  2, 12,  5,  4, 14,  7,  6, 24, 17, 16, 26 }
+#define RGB_INDEX1 \
+  {  3,  2, 12,  5,  4, 14,  7,  6, 24, 17, 16, 26, 19, 18, 28, 21 }
+#define RGB_INDEX2 \
+  {  4, 14,  7,  6, 24, 17, 16, 26, 19, 18, 28, 21, 20, 30, 23, 22 }
+#define jsimd_ycc_rgb_convert_altivec  jsimd_ycc_extbgr_convert_altivec
+#include "jdcolext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX0
+#undef RGB_INDEX1
+#undef RGB_INDEX2
+#undef jsimd_ycc_rgb_convert_altivec
+
+#define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+#define RGB_INDEX \
+  {  8,  1,  0,  9, 10,  3,  2, 11, 12,  5,  4, 13, 14,  7,  6, 15 }
+#define jsimd_ycc_rgb_convert_altivec  jsimd_ycc_extbgrx_convert_altivec
+#include "jdcolext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX
+#undef jsimd_ycc_rgb_convert_altivec
+
+#define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+#define RGB_INDEX \
+  {  9,  8,  1,  0, 11, 10,  3,  2, 13, 12,  5,  4, 15, 14,  7,  6 }
+#define jsimd_ycc_rgb_convert_altivec  jsimd_ycc_extxbgr_convert_altivec
+#include "jdcolext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX
+#undef jsimd_ycc_rgb_convert_altivec
+
+#define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+#define RGB_INDEX \
+  {  9,  0,  1,  8, 11,  2,  3, 10, 13,  4,  5, 12, 15,  6,  7, 14 }
+#define jsimd_ycc_rgb_convert_altivec  jsimd_ycc_extxrgb_convert_altivec
+#include "jdcolext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX
+#undef jsimd_ycc_rgb_convert_altivec
diff --git a/media/libjpeg/simd/powerpc/jdmerge-altivec.c b/media/libjpeg/simd/powerpc/jdmerge-altivec.c
new file mode 100644
index 0000000000..79c577f141
--- /dev/null
+++ b/media/libjpeg/simd/powerpc/jdmerge-altivec.c
@@ -0,0 +1,130 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2015, D. R. Commander.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* MERGED YCC --> RGB CONVERSION AND UPSAMPLING */
+
+#include "jsimd_altivec.h"
+
+
+#define F_0_344  22554              /* FIX(0.34414) */
+#define F_0_714  46802              /* FIX(0.71414) */
+#define F_1_402  91881              /* FIX(1.40200) */
+#define F_1_772  116130             /* FIX(1.77200) */
+#define F_0_402  (F_1_402 - 65536)  /* FIX(1.40200) - FIX(1) */
+#define F_0_285  (65536 - F_0_714)  /* FIX(1) - FIX(0.71414) */
+#define F_0_228  (131072 - F_1_772) /* FIX(2) - FIX(1.77200) */
+
+#define SCALEBITS  16
+#define ONE_HALF  (1 << (SCALEBITS - 1))
+
+#define RGB_INDEX0 \
+  {  0,  1,  8,  2,  3, 10,  4,  5, 12,  6,  7, 14, 16, 17, 24, 18 }
+#define RGB_INDEX1 \
+  {  3, 10,  4,  5, 12,  6,  7, 14, 16, 17, 24, 18, 19, 26, 20, 21 }
+#define RGB_INDEX2 \
+  { 12,  6,  7, 14, 16, 17, 24, 18, 19, 26, 20, 21, 28, 22, 23, 30 }
+#include "jdmrgext-altivec.c"
+#undef RGB_PIXELSIZE
+
+#define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_altivec \
+  jsimd_h2v1_extrgb_merged_upsample_altivec
+#define jsimd_h2v2_merged_upsample_altivec \
+  jsimd_h2v2_extrgb_merged_upsample_altivec
+#include "jdmrgext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX0
+#undef RGB_INDEX1
+#undef RGB_INDEX2
+#undef jsimd_h2v1_merged_upsample_altivec
+#undef jsimd_h2v2_merged_upsample_altivec
+
+#define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+#define RGB_INDEX \
+  {  0,  1,  8,  9,  2,  3, 10, 11,  4,  5, 12, 13,  6,  7, 14, 15 }
+#define jsimd_h2v1_merged_upsample_altivec \
+  jsimd_h2v1_extrgbx_merged_upsample_altivec
+#define jsimd_h2v2_merged_upsample_altivec \
+  jsimd_h2v2_extrgbx_merged_upsample_altivec
+#include "jdmrgext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX
+#undef jsimd_h2v1_merged_upsample_altivec
+#undef jsimd_h2v2_merged_upsample_altivec
+
+#define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+#define RGB_INDEX0 \
+  {  8,  1,  0, 10,  3,  2, 12,  5,  4, 14,  7,  6, 24, 17, 16, 26 }
+#define RGB_INDEX1 \
+  {  3,  2, 12,  5,  4, 14,  7,  6, 24, 17, 16, 26, 19, 18, 28, 21 }
+#define RGB_INDEX2 \
+  {  4, 14,  7,  6, 24, 17, 16, 26, 19, 18, 28, 21, 20, 30, 23, 22 }
+#define jsimd_h2v1_merged_upsample_altivec \
+  jsimd_h2v1_extbgr_merged_upsample_altivec
+#define jsimd_h2v2_merged_upsample_altivec \
+  jsimd_h2v2_extbgr_merged_upsample_altivec
+#include "jdmrgext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX0
+#undef RGB_INDEX1
+#undef RGB_INDEX2
+#undef jsimd_h2v1_merged_upsample_altivec
+#undef jsimd_h2v2_merged_upsample_altivec
+
+#define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+#define RGB_INDEX \
+  {  8,  1,  0,  9, 10,  3,  2, 11, 12,  5,  4, 13, 14,  7,  6, 15 }
+#define jsimd_h2v1_merged_upsample_altivec \
+  jsimd_h2v1_extbgrx_merged_upsample_altivec
+#define jsimd_h2v2_merged_upsample_altivec \
+  jsimd_h2v2_extbgrx_merged_upsample_altivec
+#include "jdmrgext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX
+#undef jsimd_h2v1_merged_upsample_altivec
+#undef jsimd_h2v2_merged_upsample_altivec
+
+#define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+#define RGB_INDEX \
+  {  9,  8,  1,  0, 11, 10,  3,  2, 13, 12,  5,  4, 15, 14,  7,  6 }
+#define jsimd_h2v1_merged_upsample_altivec \
+  jsimd_h2v1_extxbgr_merged_upsample_altivec
+#define jsimd_h2v2_merged_upsample_altivec \
+  jsimd_h2v2_extxbgr_merged_upsample_altivec
+#include "jdmrgext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX
+#undef jsimd_h2v1_merged_upsample_altivec
+#undef jsimd_h2v2_merged_upsample_altivec
+
+#define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+#define RGB_INDEX \
+  {  9,  0,  1,  8, 11,  2,  3, 10, 13,  4,  5, 12, 15,  6,  7, 14 }
+#define jsimd_h2v1_merged_upsample_altivec \
+  jsimd_h2v1_extxrgb_merged_upsample_altivec
+#define jsimd_h2v2_merged_upsample_altivec \
+  jsimd_h2v2_extxrgb_merged_upsample_altivec
+#include "jdmrgext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX
+#undef jsimd_h2v1_merged_upsample_altivec
+#undef jsimd_h2v2_merged_upsample_altivec
diff --git a/media/libjpeg/simd/powerpc/jdmrgext-altivec.c b/media/libjpeg/simd/powerpc/jdmrgext-altivec.c
new file mode 100644
index 0000000000..40f02c33ea
--- /dev/null
+++ b/media/libjpeg/simd/powerpc/jdmrgext-altivec.c
@@ -0,0 +1,329 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2015, D. R. Commander.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jdmerge-altivec.c */
+
+
+void jsimd_h2v1_merged_upsample_altivec(JDIMENSION output_width,
+                                        JSAMPIMAGE input_buf,
+                                        JDIMENSION in_row_group_ctr,
+                                        JSAMPARRAY output_buf)
+{
+  JSAMPROW outptr, inptr0, inptr1, inptr2;
+  int pitch = output_width * RGB_PIXELSIZE, num_cols, yloop;
+#if __BIG_ENDIAN__
+  int offset;
+#endif
+  unsigned char __attribute__((aligned(16))) tmpbuf[RGB_PIXELSIZE * 16];
+
+  __vector unsigned char rgb0, rgb1, rgb2, rgbx0, rgbx1, rgbx2, rgbx3,
+    y, cb, cr;
+#if __BIG_ENDIAN__
+  __vector unsigned char edgel, edgeh, edges, out0, out1, out2, out3;
+#if RGB_PIXELSIZE == 4
+  __vector unsigned char out4;
+#endif
+#endif
+#if RGB_PIXELSIZE == 4
+  __vector unsigned char rgb3;
+#endif
+  __vector short rg0, rg1, rg2, rg3, bx0, bx1, bx2, bx3, ye, yo, cbl, cbh,
+    crl, crh, r_yl, r_yh, g_yl, g_yh, b_yl, b_yh, g_y0w, g_y1w, g_y2w, g_y3w,
+    rl, rh, gl, gh, bl, bh, re, ro, ge, go, be, bo;
+  __vector int g_y0, g_y1, g_y2, g_y3;
+
+  /* Constants
+   * NOTE: The >> 1 is to compensate for the fact that vec_madds() returns 17
+   * high-order bits, not 16.
+   */
+  __vector short pw_f0402 = { __8X(F_0_402 >> 1) },
+    pw_mf0228 = { __8X(-F_0_228 >> 1) },
+    pw_mf0344_f0285 = { __4X2(-F_0_344, F_0_285) },
+    pw_one = { __8X(1) }, pw_255 = { __8X(255) },
+    pw_cj = { __8X(CENTERJSAMPLE) };
+  __vector int pd_onehalf = { __4X(ONE_HALF) };
+  __vector unsigned char pb_zero = { __16X(0) },
+#if __BIG_ENDIAN__
+    shift_pack_index =
+      {  0,  1,  4,  5,  8,  9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 },
+    even_index =
+      {  0, 16,  0, 18,  0, 20,  0, 22,  0, 24,  0, 26,  0, 28,  0, 30 },
+    odd_index =
+      {  0, 17,  0, 19,  0, 21,  0, 23,  0, 25,  0, 27,  0, 29,  0, 31 };
+#else
+    shift_pack_index =
+      {  2,  3,  6,  7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 },
+    even_index =
+      { 16,  0, 18,  0, 20,  0, 22,  0, 24,  0, 26,  0, 28,  0, 30,  0 },
+    odd_index =
+      { 17,  0, 19,  0, 21,  0, 23,  0, 25,  0, 27,  0, 29,  0, 31,  0 };
+#endif
+
+  inptr0 = input_buf[0][in_row_group_ctr];
+  inptr1 = input_buf[1][in_row_group_ctr];
+  inptr2 = input_buf[2][in_row_group_ctr];
+  outptr = output_buf[0];
+
+  for (num_cols = pitch; num_cols > 0; inptr1 += 16, inptr2 += 16) {
+
+    cb = vec_ld(0, inptr1);
+    /* NOTE: We have to use vec_merge*() here because vec_unpack*() doesn't
+     * support unsigned vectors.
+     */
+    cbl = (__vector signed short)VEC_UNPACKHU(cb);
+    cbh = (__vector signed short)VEC_UNPACKLU(cb);
+    cbl = vec_sub(cbl, pw_cj);
+    cbh = vec_sub(cbh, pw_cj);
+
+    cr = vec_ld(0, inptr2);
+    crl = (__vector signed short)VEC_UNPACKHU(cr);
+    crh = (__vector signed short)VEC_UNPACKLU(cr);
+    crl = vec_sub(crl, pw_cj);
+    crh = vec_sub(crh, pw_cj);
+
+    /* (Original)
+     * R = Y                + 1.40200 * Cr
+     * G = Y - 0.34414 * Cb - 0.71414 * Cr
+     * B = Y + 1.77200 * Cb
+     *
+     * (This implementation)
+     * R = Y                + 0.40200 * Cr + Cr
+     * G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+     * B = Y - 0.22800 * Cb + Cb + Cb
+     */
+    b_yl = vec_add(cbl, cbl);
+    b_yh = vec_add(cbh, cbh);
+    b_yl = vec_madds(b_yl, pw_mf0228, pw_one);
+    b_yh = vec_madds(b_yh, pw_mf0228, pw_one);
+    b_yl = vec_sra(b_yl, (__vector unsigned short)pw_one);
+    b_yh = vec_sra(b_yh, (__vector unsigned short)pw_one);
+    b_yl = vec_add(b_yl, cbl);
+    b_yh = vec_add(b_yh, cbh);
+    b_yl = vec_add(b_yl, cbl);
+    b_yh = vec_add(b_yh, cbh);
+
+    r_yl = vec_add(crl, crl);
+    r_yh = vec_add(crh, crh);
+    r_yl = vec_madds(r_yl, pw_f0402, pw_one);
+    r_yh = vec_madds(r_yh, pw_f0402, pw_one);
+    r_yl = vec_sra(r_yl, (__vector unsigned short)pw_one);
+    r_yh = vec_sra(r_yh, (__vector unsigned short)pw_one);
+    r_yl = vec_add(r_yl, crl);
+    r_yh = vec_add(r_yh, crh);
+
+    g_y0w = vec_mergeh(cbl, crl);
+    g_y1w = vec_mergel(cbl, crl);
+    g_y0 = vec_msums(g_y0w, pw_mf0344_f0285, pd_onehalf);
+    g_y1 = vec_msums(g_y1w, pw_mf0344_f0285, pd_onehalf);
+    g_y2w = vec_mergeh(cbh, crh);
+    g_y3w = vec_mergel(cbh, crh);
+    g_y2 = vec_msums(g_y2w, pw_mf0344_f0285, pd_onehalf);
+    g_y3 = vec_msums(g_y3w, pw_mf0344_f0285, pd_onehalf);
+    /* Clever way to avoid 4 shifts + 2 packs.  This packs the high word from
+     * each dword into a new 16-bit vector, which is the equivalent of
+     * descaling the 32-bit results (right-shifting by 16 bits) and then
+     * packing them.
+     */
+    g_yl = vec_perm((__vector short)g_y0, (__vector short)g_y1,
+                    shift_pack_index);
+    g_yh = vec_perm((__vector short)g_y2, (__vector short)g_y3,
+                    shift_pack_index);
+    g_yl = vec_sub(g_yl, crl);
+    g_yh = vec_sub(g_yh, crh);
+
+    for (yloop = 0; yloop < 2 && num_cols > 0; yloop++,
+         num_cols -= RGB_PIXELSIZE * 16,
+         outptr += RGB_PIXELSIZE * 16, inptr0 += 16) {
+
+      y = vec_ld(0, inptr0);
+      ye = (__vector signed short)vec_perm(pb_zero, y, even_index);
+      yo = (__vector signed short)vec_perm(pb_zero, y, odd_index);
+
+      if (yloop == 0) {
+        be = vec_add(b_yl, ye);
+        bo = vec_add(b_yl, yo);
+        re = vec_add(r_yl, ye);
+        ro = vec_add(r_yl, yo);
+        ge = vec_add(g_yl, ye);
+        go = vec_add(g_yl, yo);
+      } else {
+        be = vec_add(b_yh, ye);
+        bo = vec_add(b_yh, yo);
+        re = vec_add(r_yh, ye);
+        ro = vec_add(r_yh, yo);
+        ge = vec_add(g_yh, ye);
+        go = vec_add(g_yh, yo);
+      }
+
+      rl = vec_mergeh(re, ro);
+      rh = vec_mergel(re, ro);
+      gl = vec_mergeh(ge, go);
+      gh = vec_mergel(ge, go);
+      bl = vec_mergeh(be, bo);
+      bh = vec_mergel(be, bo);
+
+      rg0 = vec_mergeh(rl, gl);
+      bx0 = vec_mergeh(bl, pw_255);
+      rg1 = vec_mergel(rl, gl);
+      bx1 = vec_mergel(bl, pw_255);
+      rg2 = vec_mergeh(rh, gh);
+      bx2 = vec_mergeh(bh, pw_255);
+      rg3 = vec_mergel(rh, gh);
+      bx3 = vec_mergel(bh, pw_255);
+
+      rgbx0 = vec_packsu(rg0, bx0);
+      rgbx1 = vec_packsu(rg1, bx1);
+      rgbx2 = vec_packsu(rg2, bx2);
+      rgbx3 = vec_packsu(rg3, bx3);
+
+#if RGB_PIXELSIZE == 3
+      /* rgbx0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 X0 B1 X1 B2 X2 B3 X3
+       * rgbx1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 X4 B5 X5 B6 X6 B7 X7
+       * rgbx2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 X8 B9 X9 Ba Xa Bb Xb
+       * rgbx3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Xc Bd Xd Be Xe Bf Xf
+       *
+       * rgb0 = R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 R4 G4 B4 R5
+       * rgb1 = G5 B5 R6 G6 B6 R7 G7 B7 R8 G8 B8 R9 G9 B9 Ra Ga
+       * rgb2 = Ba Rb Gb Bb Rc Gc Bc Rd Gd Bd Re Ge Be Rf Gf Bf
+       */
+      rgb0 = vec_perm(rgbx0, rgbx1, (__vector unsigned char)RGB_INDEX0);
+      rgb1 = vec_perm(rgbx1, rgbx2, (__vector unsigned char)RGB_INDEX1);
+      rgb2 = vec_perm(rgbx2, rgbx3, (__vector unsigned char)RGB_INDEX2);
+#else
+      /* rgbx0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 X0 B1 X1 B2 X2 B3 X3
+       * rgbx1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 X4 B5 X5 B6 X6 B7 X7
+       * rgbx2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 X8 B9 X9 Ba Xa Bb Xb
+       * rgbx3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Xc Bd Xd Be Xe Bf Xf
+       *
+       * rgb0 = R0 G0 B0 X0 R1 G1 B1 X1 R2 G2 B2 X2 R3 G3 B3 X3
+       * rgb1 = R4 G4 B4 X4 R5 G5 B5 X5 R6 G6 B6 X6 R7 G7 B7 X7
+       * rgb2 = R8 G8 B8 X8 R9 G9 B9 X9 Ra Ga Ba Xa Rb Gb Bb Xb
+       * rgb3 = Rc Gc Bc Xc Rd Gd Bd Xd Re Ge Be Xe Rf Gf Bf Xf
+       */
+      rgb0 = vec_perm(rgbx0, rgbx0, (__vector unsigned char)RGB_INDEX);
+      rgb1 = vec_perm(rgbx1, rgbx1, (__vector unsigned char)RGB_INDEX);
+      rgb2 = vec_perm(rgbx2, rgbx2, (__vector unsigned char)RGB_INDEX);
+      rgb3 = vec_perm(rgbx3, rgbx3, (__vector unsigned char)RGB_INDEX);
+#endif
+
+#if __BIG_ENDIAN__
+      offset = (size_t)outptr & 15;
+      if (offset) {
+        __vector unsigned char unaligned_shift_index;
+        int bytes = num_cols + offset;
+
+        if (bytes < (RGB_PIXELSIZE + 1) * 16 && (bytes & 15)) {
+          /* Slow path to prevent buffer overwrite.  Since there is no way to
+           * write a partial AltiVec register, overwrite would occur on the
+           * last chunk of the last image row if the right edge is not on a
+           * 16-byte boundary.  It could also occur on other rows if the bytes
+           * per row is low enough.  Since we can't determine whether we're on
+           * the last image row, we have to assume every row is the last.
+           */
+          vec_st(rgb0, 0, tmpbuf);
+          vec_st(rgb1, 16, tmpbuf);
+          vec_st(rgb2, 32, tmpbuf);
+#if RGB_PIXELSIZE == 4
+          vec_st(rgb3, 48, tmpbuf);
+#endif
+          memcpy(outptr, tmpbuf, min(num_cols, RGB_PIXELSIZE * 16));
+        } else {
+          /* Fast path */
+          unaligned_shift_index = vec_lvsl(0, outptr);
+          edgel = vec_ld(0, outptr);
+          edgeh = vec_ld(min(num_cols - 1, RGB_PIXELSIZE * 16), outptr);
+          edges = vec_perm(edgeh, edgel, unaligned_shift_index);
+          unaligned_shift_index = vec_lvsr(0, outptr);
+          out0 = vec_perm(edges, rgb0, unaligned_shift_index);
+          out1 = vec_perm(rgb0, rgb1, unaligned_shift_index);
+          out2 = vec_perm(rgb1, rgb2, unaligned_shift_index);
+#if RGB_PIXELSIZE == 4
+          out3 = vec_perm(rgb2, rgb3, unaligned_shift_index);
+          out4 = vec_perm(rgb3, edges, unaligned_shift_index);
+#else
+          out3 = vec_perm(rgb2, edges, unaligned_shift_index);
+#endif
+          vec_st(out0, 0, outptr);
+          if (bytes > 16)
+            vec_st(out1, 16, outptr);
+          if (bytes > 32)
+            vec_st(out2, 32, outptr);
+          if (bytes > 48)
+            vec_st(out3, 48, outptr);
+#if RGB_PIXELSIZE == 4
+          if (bytes > 64)
+            vec_st(out4, 64, outptr);
+#endif
+        }
+      } else {
+#endif /* __BIG_ENDIAN__ */
+        if (num_cols < RGB_PIXELSIZE * 16 && (num_cols & 15)) {
+          /* Slow path */
+          VEC_ST(rgb0, 0, tmpbuf);
+          VEC_ST(rgb1, 16, tmpbuf);
+          VEC_ST(rgb2, 32, tmpbuf);
+#if RGB_PIXELSIZE == 4
+          VEC_ST(rgb3, 48, tmpbuf);
+#endif
+          memcpy(outptr, tmpbuf, min(num_cols, RGB_PIXELSIZE * 16));
+        } else {
+          /* Fast path */
+          VEC_ST(rgb0, 0, outptr);
+          if (num_cols > 16)
+            VEC_ST(rgb1, 16, outptr);
+          if (num_cols > 32)
+            VEC_ST(rgb2, 32, outptr);
+#if RGB_PIXELSIZE == 4
+          if (num_cols > 48)
+            VEC_ST(rgb3, 48, outptr);
+#endif
+        }
+#if __BIG_ENDIAN__
+      }
+#endif
+    }
+  }
+}
+
+
+void jsimd_h2v2_merged_upsample_altivec(JDIMENSION output_width,
+                                        JSAMPIMAGE input_buf,
+                                        JDIMENSION in_row_group_ctr,
+                                        JSAMPARRAY output_buf)
+{
+  JSAMPROW inptr, outptr;
+
+  inptr = input_buf[0][in_row_group_ctr];
+  outptr = output_buf[0];
+
+  input_buf[0][in_row_group_ctr] = input_buf[0][in_row_group_ctr * 2];
+  jsimd_h2v1_merged_upsample_altivec(output_width, input_buf, in_row_group_ctr,
+                                     output_buf);
+
+  input_buf[0][in_row_group_ctr] = input_buf[0][in_row_group_ctr * 2 + 1];
+  output_buf[0] = output_buf[1];
+  jsimd_h2v1_merged_upsample_altivec(output_width, input_buf, in_row_group_ctr,
+                                     output_buf);
+
+  input_buf[0][in_row_group_ctr] = inptr;
+  output_buf[0] = outptr;
+}
diff --git a/media/libjpeg/simd/powerpc/jdsample-altivec.c b/media/libjpeg/simd/powerpc/jdsample-altivec.c
new file mode 100644
index 0000000000..04df0cf108
--- /dev/null
+++ b/media/libjpeg/simd/powerpc/jdsample-altivec.c
@@ -0,0 +1,400 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2015, D. R. Commander.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* CHROMA UPSAMPLING */
+
+#include "jsimd_altivec.h"
+
+
+void jsimd_h2v1_fancy_upsample_altivec(int max_v_samp_factor,
+                                       JDIMENSION downsampled_width,
+                                       JSAMPARRAY input_data,
+                                       JSAMPARRAY *output_data_ptr)
+{
+  JSAMPARRAY output_data = *output_data_ptr;
+  JSAMPROW inptr, outptr;
+  int inrow, incol;
+
+  __vector unsigned char this0, last0, p_last0, next0 = { 0 }, p_next0,
+    out;
+  __vector short this0e, this0o, this0l, this0h, last0l, last0h,
+    next0l, next0h, outle, outhe, outlo, outho;
+
+  /* Constants */
+  __vector unsigned char pb_zero = { __16X(0) }, pb_three = { __16X(3) },
+    last_index_col0 =
+      {  0,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14 },
+    last_index =
+      { 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30 },
+    next_index =
+      {  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16 },
+    next_index_lastcol =
+      {  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 15 },
+#if __BIG_ENDIAN__
+    merge_pack_index =
+      {  1, 17,  3, 19,  5, 21,  7, 23,  9, 25, 11, 27, 13, 29, 15, 31 };
+#else
+    merge_pack_index =
+      {  0, 16,  2, 18,  4, 20,  6, 22,  8, 24, 10, 26, 12, 28, 14, 30 };
+#endif
+  __vector short pw_one = { __8X(1) }, pw_two = { __8X(2) };
+
+  for (inrow = 0; inrow < max_v_samp_factor; inrow++) {
+    inptr = input_data[inrow];
+    outptr = output_data[inrow];
+
+    if (downsampled_width & 15)
+      inptr[downsampled_width] = inptr[downsampled_width - 1];
+
+    this0 = vec_ld(0, inptr);
+    p_last0 = vec_perm(this0, this0, last_index_col0);
+    last0 = this0;
+
+    for (incol = downsampled_width; incol > 0;
+         incol -= 16, inptr += 16, outptr += 32) {
+
+      if (downsampled_width - incol > 0) {
+        p_last0 = vec_perm(last0, this0, last_index);
+        last0 = this0;
+      }
+
+      if (incol <= 16)
+        p_next0 = vec_perm(this0, this0, next_index_lastcol);
+      else {
+        next0 = vec_ld(16, inptr);
+        p_next0 = vec_perm(this0, next0, next_index);
+      }
+
+      this0e = (__vector short)vec_mule(this0, pb_three);
+      this0o = (__vector short)vec_mulo(this0, pb_three);
+      this0l = vec_mergeh(this0e, this0o);
+      this0h = vec_mergel(this0e, this0o);
+
+      last0l = (__vector short)VEC_UNPACKHU(p_last0);
+      last0h = (__vector short)VEC_UNPACKLU(p_last0);
+      last0l = vec_add(last0l, pw_one);
+
+      next0l = (__vector short)VEC_UNPACKHU(p_next0);
+      next0h = (__vector short)VEC_UNPACKLU(p_next0);
+      next0l = vec_add(next0l, pw_two);
+
+      outle = vec_add(this0l, last0l);
+      outlo = vec_add(this0l, next0l);
+      outle = vec_sr(outle, (__vector unsigned short)pw_two);
+      outlo = vec_sr(outlo, (__vector unsigned short)pw_two);
+
+      out = vec_perm((__vector unsigned char)outle,
+                     (__vector unsigned char)outlo, merge_pack_index);
+      vec_st(out, 0, outptr);
+
+      if (incol > 8) {
+        last0h = vec_add(last0h, pw_one);
+        next0h = vec_add(next0h, pw_two);
+
+        outhe = vec_add(this0h, last0h);
+        outho = vec_add(this0h, next0h);
+        outhe = vec_sr(outhe, (__vector unsigned short)pw_two);
+        outho = vec_sr(outho, (__vector unsigned short)pw_two);
+
+        out = vec_perm((__vector unsigned char)outhe,
+                       (__vector unsigned char)outho, merge_pack_index);
+        vec_st(out, 16, outptr);
+      }
+
+      this0 = next0;
+    }
+  }
+}
+
+
+void jsimd_h2v2_fancy_upsample_altivec(int max_v_samp_factor,
+                                       JDIMENSION downsampled_width,
+                                       JSAMPARRAY input_data,
+                                       JSAMPARRAY *output_data_ptr)
+{
+  JSAMPARRAY output_data = *output_data_ptr;
+  JSAMPROW inptr_1, inptr0, inptr1, outptr0, outptr1;
+  int inrow, outrow, incol;
+
+  __vector unsigned char this_1, this0, this1, out;
+  __vector short this_1l, this_1h, this0l, this0h, this1l, this1h,
+    lastcolsum_1h, lastcolsum1h,
+    p_lastcolsum_1l, p_lastcolsum_1h, p_lastcolsum1l, p_lastcolsum1h,
+    thiscolsum_1l, thiscolsum_1h, thiscolsum1l, thiscolsum1h,
+    nextcolsum_1l = { 0 }, nextcolsum_1h = { 0 },
+    nextcolsum1l = { 0 }, nextcolsum1h = { 0 },
+    p_nextcolsum_1l, p_nextcolsum_1h, p_nextcolsum1l, p_nextcolsum1h,
+    tmpl, tmph, outle, outhe, outlo, outho;
+
+  /* Constants */
+  __vector unsigned char pb_zero = { __16X(0) },
+    last_index_col0 =
+      {  0,  1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13 },
+    last_index =
+      { 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29 },
+    next_index =
+      {  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17 },
+    next_index_lastcol =
+      {  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 14, 15 },
+#if __BIG_ENDIAN__
+    merge_pack_index =
+      {  1, 17,  3, 19,  5, 21,  7, 23,  9, 25, 11, 27, 13, 29, 15, 31 };
+#else
+    merge_pack_index =
+      {  0, 16,  2, 18,  4, 20,  6, 22,  8, 24, 10, 26, 12, 28, 14, 30 };
+#endif
+  __vector short pw_zero = { __8X(0) }, pw_three = { __8X(3) },
+    pw_seven = { __8X(7) }, pw_eight = { __8X(8) };
+  __vector unsigned short pw_four = { __8X(4) };
+
+  for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) {
+
+    inptr_1 = input_data[inrow - 1];
+    inptr0 = input_data[inrow];
+    inptr1 = input_data[inrow + 1];
+    outptr0 = output_data[outrow++];
+    outptr1 = output_data[outrow++];
+
+    if (downsampled_width & 15) {
+      inptr_1[downsampled_width] = inptr_1[downsampled_width - 1];
+      inptr0[downsampled_width] = inptr0[downsampled_width - 1];
+      inptr1[downsampled_width] = inptr1[downsampled_width - 1];
+    }
+
+    this0 = vec_ld(0, inptr0);
+    this0l = (__vector short)VEC_UNPACKHU(this0);
+    this0h = (__vector short)VEC_UNPACKLU(this0);
+    this0l = vec_mladd(this0l, pw_three, pw_zero);
+    this0h = vec_mladd(this0h, pw_three, pw_zero);
+
+    this_1 = vec_ld(0, inptr_1);
+    this_1l = (__vector short)VEC_UNPACKHU(this_1);
+    this_1h = (__vector short)VEC_UNPACKLU(this_1);
+    thiscolsum_1l = vec_add(this0l, this_1l);
+    thiscolsum_1h = vec_add(this0h, this_1h);
+    lastcolsum_1h = thiscolsum_1h;
+    p_lastcolsum_1l = vec_perm(thiscolsum_1l, thiscolsum_1l, last_index_col0);
+    p_lastcolsum_1h = vec_perm(thiscolsum_1l, thiscolsum_1h, last_index);
+
+    this1 = vec_ld(0, inptr1);
+    this1l = (__vector short)VEC_UNPACKHU(this1);
+    this1h = (__vector short)VEC_UNPACKLU(this1);
+    thiscolsum1l = vec_add(this0l, this1l);
+    thiscolsum1h = vec_add(this0h, this1h);
+    lastcolsum1h = thiscolsum1h;
+    p_lastcolsum1l = vec_perm(thiscolsum1l, thiscolsum1l, last_index_col0);
+    p_lastcolsum1h = vec_perm(thiscolsum1l, thiscolsum1h, last_index);
+
+    for (incol = downsampled_width; incol > 0;
+         incol -= 16, inptr_1 += 16, inptr0 += 16, inptr1 += 16,
+         outptr0 += 32, outptr1 += 32) {
+
+      if (downsampled_width - incol > 0) {
+        p_lastcolsum_1l = vec_perm(lastcolsum_1h, thiscolsum_1l, last_index);
+        p_lastcolsum_1h = vec_perm(thiscolsum_1l, thiscolsum_1h, last_index);
+        p_lastcolsum1l = vec_perm(lastcolsum1h, thiscolsum1l, last_index);
+        p_lastcolsum1h = vec_perm(thiscolsum1l, thiscolsum1h, last_index);
+        lastcolsum_1h = thiscolsum_1h;  lastcolsum1h = thiscolsum1h;
+      }
+
+      if (incol <= 16) {
+        p_nextcolsum_1l = vec_perm(thiscolsum_1l, thiscolsum_1h, next_index);
+        p_nextcolsum_1h = vec_perm(thiscolsum_1h, thiscolsum_1h,
+                                   next_index_lastcol);
+        p_nextcolsum1l = vec_perm(thiscolsum1l, thiscolsum1h, next_index);
+        p_nextcolsum1h = vec_perm(thiscolsum1h, thiscolsum1h,
+                                  next_index_lastcol);
+      } else {
+        this0 = vec_ld(16, inptr0);
+        this0l = (__vector short)VEC_UNPACKHU(this0);
+        this0h = (__vector short)VEC_UNPACKLU(this0);
+        this0l = vec_mladd(this0l, pw_three, pw_zero);
+        this0h = vec_mladd(this0h, pw_three, pw_zero);
+
+        this_1 = vec_ld(16, inptr_1);
+        this_1l = (__vector short)VEC_UNPACKHU(this_1);
+        this_1h = (__vector short)VEC_UNPACKLU(this_1);
+        nextcolsum_1l = vec_add(this0l, this_1l);
+        nextcolsum_1h = vec_add(this0h, this_1h);
+        p_nextcolsum_1l = vec_perm(thiscolsum_1l, thiscolsum_1h, next_index);
+        p_nextcolsum_1h = vec_perm(thiscolsum_1h, nextcolsum_1l, next_index);
+
+        this1 = vec_ld(16, inptr1);
+        this1l = (__vector short)VEC_UNPACKHU(this1);
+        this1h = (__vector short)VEC_UNPACKLU(this1);
+        nextcolsum1l = vec_add(this0l, this1l);
+        nextcolsum1h = vec_add(this0h, this1h);
+        p_nextcolsum1l = vec_perm(thiscolsum1l, thiscolsum1h, next_index);
+        p_nextcolsum1h = vec_perm(thiscolsum1h, nextcolsum1l, next_index);
+      }
+
+      /* Process the upper row */
+
+      tmpl = vec_mladd(thiscolsum_1l, pw_three, pw_zero);
+      outle = vec_add(tmpl, p_lastcolsum_1l);
+      outle = vec_add(outle, pw_eight);
+      outle = vec_sr(outle, pw_four);
+
+      outlo = vec_add(tmpl, p_nextcolsum_1l);
+      outlo = vec_add(outlo, pw_seven);
+      outlo = vec_sr(outlo, pw_four);
+
+      out = vec_perm((__vector unsigned char)outle,
+                     (__vector unsigned char)outlo, merge_pack_index);
+      vec_st(out, 0, outptr0);
+
+      if (incol > 8) {
+        tmph = vec_mladd(thiscolsum_1h, pw_three, pw_zero);
+        outhe = vec_add(tmph, p_lastcolsum_1h);
+        outhe = vec_add(outhe, pw_eight);
+        outhe = vec_sr(outhe, pw_four);
+
+        outho = vec_add(tmph, p_nextcolsum_1h);
+        outho = vec_add(outho, pw_seven);
+        outho = vec_sr(outho, pw_four);
+
+        out = vec_perm((__vector unsigned char)outhe,
+                       (__vector unsigned char)outho, merge_pack_index);
+        vec_st(out, 16, outptr0);
+      }
+
+      /* Process the lower row */
+
+      tmpl = vec_mladd(thiscolsum1l, pw_three, pw_zero);
+      outle = vec_add(tmpl, p_lastcolsum1l);
+      outle = vec_add(outle, pw_eight);
+      outle = vec_sr(outle, pw_four);
+
+      outlo = vec_add(tmpl, p_nextcolsum1l);
+      outlo = vec_add(outlo, pw_seven);
+      outlo = vec_sr(outlo, pw_four);
+
+      out = vec_perm((__vector unsigned char)outle,
+                     (__vector unsigned char)outlo, merge_pack_index);
+      vec_st(out, 0, outptr1);
+
+      if (incol > 8) {
+        tmph = vec_mladd(thiscolsum1h, pw_three, pw_zero);
+        outhe = vec_add(tmph, p_lastcolsum1h);
+        outhe = vec_add(outhe, pw_eight);
+        outhe = vec_sr(outhe, pw_four);
+
+        outho = vec_add(tmph, p_nextcolsum1h);
+        outho = vec_add(outho, pw_seven);
+        outho = vec_sr(outho, pw_four);
+
+        out = vec_perm((__vector unsigned char)outhe,
+                       (__vector unsigned char)outho, merge_pack_index);
+        vec_st(out, 16, outptr1);
+      }
+
+      thiscolsum_1l = nextcolsum_1l;  thiscolsum_1h = nextcolsum_1h;
+      thiscolsum1l = nextcolsum1l;  thiscolsum1h = nextcolsum1h;
+    }
+  }
+}
+
+
+/* These are rarely used (mainly just for decompressing YCCK images) */
+
+void jsimd_h2v1_upsample_altivec(int max_v_samp_factor,
+                                 JDIMENSION output_width,
+                                 JSAMPARRAY input_data,
+                                 JSAMPARRAY *output_data_ptr)
+{
+  JSAMPARRAY output_data = *output_data_ptr;
+  JSAMPROW inptr, outptr;
+  int inrow, incol;
+
+  __vector unsigned char in, inl, inh;
+
+  for (inrow = 0; inrow < max_v_samp_factor; inrow++) {
+    inptr = input_data[inrow];
+    outptr = output_data[inrow];
+
+    for (incol = (output_width + 31) & (~31); incol > 0;
+         incol -= 64, inptr += 32, outptr += 64) {
+
+      in = vec_ld(0, inptr);
+      inl = vec_mergeh(in, in);
+      inh = vec_mergel(in, in);
+
+      vec_st(inl, 0, outptr);
+      vec_st(inh, 16, outptr);
+
+      if (incol > 32) {
+        in = vec_ld(16, inptr);
+        inl = vec_mergeh(in, in);
+        inh = vec_mergel(in, in);
+
+        vec_st(inl, 32, outptr);
+        vec_st(inh, 48, outptr);
+      }
+    }
+  }
+}
+
+
+void jsimd_h2v2_upsample_altivec(int max_v_samp_factor,
+                                 JDIMENSION output_width,
+                                 JSAMPARRAY input_data,
+                                 JSAMPARRAY *output_data_ptr)
+{
+  JSAMPARRAY output_data = *output_data_ptr;
+  JSAMPROW inptr, outptr0, outptr1;
+  int inrow, outrow, incol;
+
+  __vector unsigned char in, inl, inh;
+
+  for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) {
+
+    inptr = input_data[inrow];
+    outptr0 = output_data[outrow++];
+    outptr1 = output_data[outrow++];
+
+    for (incol = (output_width + 31) & (~31); incol > 0;
+         incol -= 64, inptr += 32, outptr0 += 64, outptr1 += 64) {
+
+      in = vec_ld(0, inptr);
+      inl = vec_mergeh(in, in);
+      inh = vec_mergel(in, in);
+
+      vec_st(inl, 0, outptr0);
+      vec_st(inl, 0, outptr1);
+
+      vec_st(inh, 16, outptr0);
+      vec_st(inh, 16, outptr1);
+
+      if (incol > 32) {
+        in = vec_ld(16, inptr);
+        inl = vec_mergeh(in, in);
+        inh = vec_mergel(in, in);
+
+        vec_st(inl, 32, outptr0);
+        vec_st(inl, 32, outptr1);
+
+        vec_st(inh, 48, outptr0);
+        vec_st(inh, 48, outptr1);
+      }
+    }
+  }
+}
diff --git a/media/libjpeg/simd/powerpc/jfdctfst-altivec.c b/media/libjpeg/simd/powerpc/jfdctfst-altivec.c
new file mode 100644
index 0000000000..ad9af81e0c
--- /dev/null
+++ b/media/libjpeg/simd/powerpc/jfdctfst-altivec.c
@@ -0,0 +1,154 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014, D. R. Commander.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* FAST INTEGER FORWARD DCT
+ *
+ * This is similar to the SSE2 implementation, except that we left-shift the
+ * constants by 1 less bit (the -1 in CONST_SHIFT.)  This is because
+ * vec_madds(arg1, arg2, arg3) generates the 16-bit saturated sum of:
+ *   the elements in arg3 + the most significant 17 bits of
+ *     (the elements in arg1 * the elements in arg2).
+ */
+
+#include "jsimd_altivec.h"
+
+
+#define F_0_382  98   /* FIX(0.382683433) */
+#define F_0_541  139  /* FIX(0.541196100) */
+#define F_0_707  181  /* FIX(0.707106781) */
+#define F_1_306  334  /* FIX(1.306562965) */
+
+#define CONST_BITS  8
+#define PRE_MULTIPLY_SCALE_BITS  2
+#define CONST_SHIFT  (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS - 1)
+
+
+#define DO_FDCT() { \
+  /* Even part */ \
+  \
+  tmp10 = vec_add(tmp0, tmp3); \
+  tmp13 = vec_sub(tmp0, tmp3); \
+  tmp11 = vec_add(tmp1, tmp2); \
+  tmp12 = vec_sub(tmp1, tmp2); \
+  \
+  out0  = vec_add(tmp10, tmp11); \
+  out4  = vec_sub(tmp10, tmp11); \
+  \
+  z1 = vec_add(tmp12, tmp13); \
+  z1 = vec_sl(z1, pre_multiply_scale_bits); \
+  z1 = vec_madds(z1, pw_0707, pw_zero); \
+  \
+  out2 = vec_add(tmp13, z1); \
+  out6 = vec_sub(tmp13, z1); \
+  \
+  /* Odd part */ \
+  \
+  tmp10 = vec_add(tmp4, tmp5); \
+  tmp11 = vec_add(tmp5, tmp6); \
+  tmp12 = vec_add(tmp6, tmp7); \
+  \
+  tmp10 = vec_sl(tmp10, pre_multiply_scale_bits); \
+  tmp12 = vec_sl(tmp12, pre_multiply_scale_bits); \
+  z5 = vec_sub(tmp10, tmp12); \
+  z5 = vec_madds(z5, pw_0382, pw_zero); \
+  \
+  z2 = vec_madds(tmp10, pw_0541, z5); \
+  z4 = vec_madds(tmp12, pw_1306, z5); \
+  \
+  tmp11 = vec_sl(tmp11, pre_multiply_scale_bits); \
+  z3 = vec_madds(tmp11, pw_0707, pw_zero); \
+  \
+  z11 = vec_add(tmp7, z3); \
+  z13 = vec_sub(tmp7, z3); \
+  \
+  out5 = vec_add(z13, z2); \
+  out3 = vec_sub(z13, z2); \
+  out1 = vec_add(z11, z4); \
+  out7 = vec_sub(z11, z4); \
+}
+
+
+void jsimd_fdct_ifast_altivec(DCTELEM *data)
+{
+  __vector short row0, row1, row2, row3, row4, row5, row6, row7,
+    col0, col1, col2, col3, col4, col5, col6, col7,
+    tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13,
+    z1, z2, z3, z4, z5, z11, z13,
+    out0, out1, out2, out3, out4, out5, out6, out7;
+
+  /* Constants */
+  __vector short pw_zero = { __8X(0) },
+    pw_0382 = { __8X(F_0_382 << CONST_SHIFT) },
+    pw_0541 = { __8X(F_0_541 << CONST_SHIFT) },
+    pw_0707 = { __8X(F_0_707 << CONST_SHIFT) },
+    pw_1306 = { __8X(F_1_306 << CONST_SHIFT) };
+  __vector unsigned short
+    pre_multiply_scale_bits = { __8X(PRE_MULTIPLY_SCALE_BITS) };
+
+  /* Pass 1: process rows */
+
+  row0 = vec_ld(0, data);
+  row1 = vec_ld(16, data);
+  row2 = vec_ld(32, data);
+  row3 = vec_ld(48, data);
+  row4 = vec_ld(64, data);
+  row5 = vec_ld(80, data);
+  row6 = vec_ld(96, data);
+  row7 = vec_ld(112, data);
+
+  TRANSPOSE(row, col);
+
+  tmp0 = vec_add(col0, col7);
+  tmp7 = vec_sub(col0, col7);
+  tmp1 = vec_add(col1, col6);
+  tmp6 = vec_sub(col1, col6);
+  tmp2 = vec_add(col2, col5);
+  tmp5 = vec_sub(col2, col5);
+  tmp3 = vec_add(col3, col4);
+  tmp4 = vec_sub(col3, col4);
+
+  DO_FDCT();
+
+  /* Pass 2: process columns */
+
+  TRANSPOSE(out, row);
+
+  tmp0 = vec_add(row0, row7);
+  tmp7 = vec_sub(row0, row7);
+  tmp1 = vec_add(row1, row6);
+  tmp6 = vec_sub(row1, row6);
+  tmp2 = vec_add(row2, row5);
+  tmp5 = vec_sub(row2, row5);
+  tmp3 = vec_add(row3, row4);
+  tmp4 = vec_sub(row3, row4);
+
+  DO_FDCT();
+
+  vec_st(out0, 0, data);
+  vec_st(out1, 16, data);
+  vec_st(out2, 32, data);
+  vec_st(out3, 48, data);
+  vec_st(out4, 64, data);
+  vec_st(out5, 80, data);
+  vec_st(out6, 96, data);
+  vec_st(out7, 112, data);
+}
diff --git a/media/libjpeg/simd/powerpc/jfdctint-altivec.c b/media/libjpeg/simd/powerpc/jfdctint-altivec.c
new file mode 100644
index 0000000000..3d4f017103
--- /dev/null
+++ b/media/libjpeg/simd/powerpc/jfdctint-altivec.c
@@ -0,0 +1,258 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014, 2020, D. R. Commander.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* ACCURATE INTEGER FORWARD DCT */
+
+#include "jsimd_altivec.h"
+
+
+#define F_0_298  2446   /* FIX(0.298631336) */
+#define F_0_390  3196   /* FIX(0.390180644) */
+#define F_0_541  4433   /* FIX(0.541196100) */
+#define F_0_765  6270   /* FIX(0.765366865) */
+#define F_0_899  7373   /* FIX(0.899976223) */
+#define F_1_175  9633   /* FIX(1.175875602) */
+#define F_1_501  12299  /* FIX(1.501321110) */
+#define F_1_847  15137  /* FIX(1.847759065) */
+#define F_1_961  16069  /* FIX(1.961570560) */
+#define F_2_053  16819  /* FIX(2.053119869) */
+#define F_2_562  20995  /* FIX(2.562915447) */
+#define F_3_072  25172  /* FIX(3.072711026) */
+
+#define CONST_BITS  13
+#define PASS1_BITS  2
+#define DESCALE_P1  (CONST_BITS - PASS1_BITS)
+#define DESCALE_P2  (CONST_BITS + PASS1_BITS)
+
+
+#define DO_FDCT_COMMON(PASS) { \
+  /* (Original) \
+   * z1 = (tmp12 + tmp13) * 0.541196100; \
+   * data2 = z1 + tmp13 * 0.765366865; \
+   * data6 = z1 + tmp12 * -1.847759065; \
+   * \
+   * (This implementation) \
+   * data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100; \
+   * data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065); \
+   */ \
+  \
+  tmp1312l = vec_mergeh(tmp13, tmp12); \
+  tmp1312h = vec_mergel(tmp13, tmp12); \
+  \
+  out2l = vec_msums(tmp1312l, pw_f130_f054, pd_descale_p##PASS); \
+  out2h = vec_msums(tmp1312h, pw_f130_f054, pd_descale_p##PASS); \
+  out6l = vec_msums(tmp1312l, pw_f054_mf130, pd_descale_p##PASS); \
+  out6h = vec_msums(tmp1312h, pw_f054_mf130, pd_descale_p##PASS); \
+  \
+  out2l = vec_sra(out2l, descale_p##PASS); \
+  out2h = vec_sra(out2h, descale_p##PASS); \
+  out6l = vec_sra(out6l, descale_p##PASS); \
+  out6h = vec_sra(out6h, descale_p##PASS); \
+  \
+  out2 = vec_pack(out2l, out2h); \
+  out6 = vec_pack(out6l, out6h); \
+  \
+  /* Odd part */ \
+  \
+  z3 = vec_add(tmp4, tmp6); \
+  z4 = vec_add(tmp5, tmp7); \
+  \
+  /* (Original) \
+   * z5 = (z3 + z4) * 1.175875602; \
+   * z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644; \
+   * z3 += z5;  z4 += z5; \
+   * \
+   * (This implementation) \
+   * z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; \
+   * z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); \
+   */ \
+  \
+  z34l = vec_mergeh(z3, z4); \
+  z34h = vec_mergel(z3, z4); \
+  \
+  z3l = vec_msums(z34l, pw_mf078_f117, pd_descale_p##PASS); \
+  z3h = vec_msums(z34h, pw_mf078_f117, pd_descale_p##PASS); \
+  z4l = vec_msums(z34l, pw_f117_f078, pd_descale_p##PASS); \
+  z4h = vec_msums(z34h, pw_f117_f078, pd_descale_p##PASS); \
+  \
+  /* (Original) \
+   * z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6; \
+   * tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869; \
+   * tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110; \
+   * z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447; \
+   * data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4; \
+   * data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4; \
+   * \
+   * (This implementation) \
+   * tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223; \
+   * tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447; \
+   * tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447); \
+   * tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223); \
+   * data7 = tmp4 + z3;  data5 = tmp5 + z4; \
+   * data3 = tmp6 + z3;  data1 = tmp7 + z4; \
+   */ \
+  \
+  tmp47l = vec_mergeh(tmp4, tmp7); \
+  tmp47h = vec_mergel(tmp4, tmp7); \
+  \
+  out7l = vec_msums(tmp47l, pw_mf060_mf089, z3l); \
+  out7h = vec_msums(tmp47h, pw_mf060_mf089, z3h); \
+  out1l = vec_msums(tmp47l, pw_mf089_f060, z4l); \
+  out1h = vec_msums(tmp47h, pw_mf089_f060, z4h); \
+  \
+  out7l = vec_sra(out7l, descale_p##PASS); \
+  out7h = vec_sra(out7h, descale_p##PASS); \
+  out1l = vec_sra(out1l, descale_p##PASS); \
+  out1h = vec_sra(out1h, descale_p##PASS); \
+  \
+  out7 = vec_pack(out7l, out7h); \
+  out1 = vec_pack(out1l, out1h); \
+  \
+  tmp56l = vec_mergeh(tmp5, tmp6); \
+  tmp56h = vec_mergel(tmp5, tmp6); \
+  \
+  out5l = vec_msums(tmp56l, pw_mf050_mf256, z4l); \
+  out5h = vec_msums(tmp56h, pw_mf050_mf256, z4h); \
+  out3l = vec_msums(tmp56l, pw_mf256_f050, z3l); \
+  out3h = vec_msums(tmp56h, pw_mf256_f050, z3h); \
+  \
+  out5l = vec_sra(out5l, descale_p##PASS); \
+  out5h = vec_sra(out5h, descale_p##PASS); \
+  out3l = vec_sra(out3l, descale_p##PASS); \
+  out3h = vec_sra(out3h, descale_p##PASS); \
+  \
+  out5 = vec_pack(out5l, out5h); \
+  out3 = vec_pack(out3l, out3h); \
+}
+
+#define DO_FDCT_PASS1() { \
+  /* Even part */ \
+  \
+  tmp10 = vec_add(tmp0, tmp3); \
+  tmp13 = vec_sub(tmp0, tmp3); \
+  tmp11 = vec_add(tmp1, tmp2); \
+  tmp12 = vec_sub(tmp1, tmp2); \
+  \
+  out0  = vec_add(tmp10, tmp11); \
+  out0  = vec_sl(out0, pass1_bits); \
+  out4  = vec_sub(tmp10, tmp11); \
+  out4  = vec_sl(out4, pass1_bits); \
+  \
+  DO_FDCT_COMMON(1); \
+}
+
+#define DO_FDCT_PASS2() { \
+  /* Even part */ \
+  \
+  tmp10 = vec_add(tmp0, tmp3); \
+  tmp13 = vec_sub(tmp0, tmp3); \
+  tmp11 = vec_add(tmp1, tmp2); \
+  tmp12 = vec_sub(tmp1, tmp2); \
+  \
+  out0  = vec_add(tmp10, tmp11); \
+  out0  = vec_add(out0, pw_descale_p2x); \
+  out0  = vec_sra(out0, pass1_bits); \
+  out4  = vec_sub(tmp10, tmp11); \
+  out4  = vec_add(out4, pw_descale_p2x); \
+  out4  = vec_sra(out4, pass1_bits); \
+  \
+  DO_FDCT_COMMON(2); \
+}
+
+
+void jsimd_fdct_islow_altivec(DCTELEM *data)
+{
+  __vector short row0, row1, row2, row3, row4, row5, row6, row7,
+    col0, col1, col2, col3, col4, col5, col6, col7,
+    tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13,
+    tmp47l, tmp47h, tmp56l, tmp56h, tmp1312l, tmp1312h,
+    z3, z4, z34l, z34h,
+    out0, out1, out2, out3, out4, out5, out6, out7;
+  __vector int z3l, z3h, z4l, z4h,
+    out1l, out1h, out2l, out2h, out3l, out3h, out5l, out5h, out6l, out6h,
+    out7l, out7h;
+
+  /* Constants */
+  __vector short
+    pw_f130_f054 = { __4X2(F_0_541 + F_0_765, F_0_541) },
+    pw_f054_mf130 = { __4X2(F_0_541, F_0_541 - F_1_847) },
+    pw_mf078_f117 = { __4X2(F_1_175 - F_1_961, F_1_175) },
+    pw_f117_f078 = { __4X2(F_1_175, F_1_175 - F_0_390) },
+    pw_mf060_mf089 = { __4X2(F_0_298 - F_0_899, -F_0_899) },
+    pw_mf089_f060 = { __4X2(-F_0_899, F_1_501 - F_0_899) },
+    pw_mf050_mf256 = { __4X2(F_2_053 - F_2_562, -F_2_562) },
+    pw_mf256_f050 = { __4X2(-F_2_562, F_3_072 - F_2_562) },
+    pw_descale_p2x = { __8X(1 << (PASS1_BITS - 1)) };
+  __vector unsigned short pass1_bits = { __8X(PASS1_BITS) };
+  __vector int pd_descale_p1 = { __4X(1 << (DESCALE_P1 - 1)) },
+    pd_descale_p2 = { __4X(1 << (DESCALE_P2 - 1)) };
+  __vector unsigned int descale_p1 = { __4X(DESCALE_P1) },
+    descale_p2 = { __4X(DESCALE_P2) };
+
+  /* Pass 1: process rows */
+
+  row0 = vec_ld(0, data);
+  row1 = vec_ld(16, data);
+  row2 = vec_ld(32, data);
+  row3 = vec_ld(48, data);
+  row4 = vec_ld(64, data);
+  row5 = vec_ld(80, data);
+  row6 = vec_ld(96, data);
+  row7 = vec_ld(112, data);
+
+  TRANSPOSE(row, col);
+
+  tmp0 = vec_add(col0, col7);
+  tmp7 = vec_sub(col0, col7);
+  tmp1 = vec_add(col1, col6);
+  tmp6 = vec_sub(col1, col6);
+  tmp2 = vec_add(col2, col5);
+  tmp5 = vec_sub(col2, col5);
+  tmp3 = vec_add(col3, col4);
+  tmp4 = vec_sub(col3, col4);
+
+  DO_FDCT_PASS1();
+
+  /* Pass 2: process columns */
+
+  TRANSPOSE(out, row);
+
+  tmp0 = vec_add(row0, row7);
+  tmp7 = vec_sub(row0, row7);
+  tmp1 = vec_add(row1, row6);
+  tmp6 = vec_sub(row1, row6);
+  tmp2 = vec_add(row2, row5);
+  tmp5 = vec_sub(row2, row5);
+  tmp3 = vec_add(row3, row4);
+  tmp4 = vec_sub(row3, row4);
+
+  DO_FDCT_PASS2();
+
+  vec_st(out0, 0, data);
+  vec_st(out1, 16, data);
+  vec_st(out2, 32, data);
+  vec_st(out3, 48, data);
+  vec_st(out4, 64, data);
+  vec_st(out5, 80, data);
+  vec_st(out6, 96, data);
+  vec_st(out7, 112, data);
+}
diff --git a/media/libjpeg/simd/powerpc/jidctfst-altivec.c b/media/libjpeg/simd/powerpc/jidctfst-altivec.c
new file mode 100644
index 0000000000..456c6c6174
--- /dev/null
+++ b/media/libjpeg/simd/powerpc/jidctfst-altivec.c
@@ -0,0 +1,255 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014-2015, D. R. Commander.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* FAST INTEGER INVERSE DCT
+ *
+ * This is similar to the SSE2 implementation, except that we left-shift the
+ * constants by 1 less bit (the -1 in CONST_SHIFT.)  This is because
+ * vec_madds(arg1, arg2, arg3) generates the 16-bit saturated sum of:
+ *   the elements in arg3 + the most significant 17 bits of
+ *     (the elements in arg1 * the elements in arg2).
+ */
+
+#include "jsimd_altivec.h"
+
+
+#define F_1_082  277              /* FIX(1.082392200) */
+#define F_1_414  362              /* FIX(1.414213562) */
+#define F_1_847  473              /* FIX(1.847759065) */
+#define F_2_613  669              /* FIX(2.613125930) */
+#define F_1_613  (F_2_613 - 256)  /* FIX(2.613125930) - FIX(1) */
+
+#define CONST_BITS  8
+#define PASS1_BITS  2
+#define PRE_MULTIPLY_SCALE_BITS  2
+#define CONST_SHIFT  (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS - 1)
+
+
+#define DO_IDCT(in) { \
+  /* Even part */ \
+  \
+  tmp10 = vec_add(in##0, in##4); \
+  tmp11 = vec_sub(in##0, in##4); \
+  tmp13 = vec_add(in##2, in##6); \
+  \
+  tmp12 = vec_sub(in##2, in##6); \
+  tmp12 = vec_sl(tmp12, pre_multiply_scale_bits); \
+  tmp12 = vec_madds(tmp12, pw_F1414, pw_zero); \
+  tmp12 = vec_sub(tmp12, tmp13); \
+  \
+  tmp0 = vec_add(tmp10, tmp13); \
+  tmp3 = vec_sub(tmp10, tmp13); \
+  tmp1 = vec_add(tmp11, tmp12); \
+  tmp2 = vec_sub(tmp11, tmp12); \
+  \
+  /* Odd part */ \
+  \
+  z13 = vec_add(in##5, in##3); \
+  z10 = vec_sub(in##5, in##3); \
+  z10s = vec_sl(z10, pre_multiply_scale_bits); \
+  z11 = vec_add(in##1, in##7); \
+  z12s = vec_sub(in##1, in##7); \
+  z12s = vec_sl(z12s, pre_multiply_scale_bits); \
+  \
+  tmp11 = vec_sub(z11, z13); \
+  tmp11 = vec_sl(tmp11, pre_multiply_scale_bits); \
+  tmp11 = vec_madds(tmp11, pw_F1414, pw_zero); \
+  \
+  tmp7 = vec_add(z11, z13); \
+  \
+  /* To avoid overflow... \
+   * \
+   * (Original) \
+   * tmp12 = -2.613125930 * z10 + z5; \
+   * \
+   * (This implementation) \
+   * tmp12 = (-1.613125930 - 1) * z10 + z5; \
+   *       = -1.613125930 * z10 - z10 + z5; \
+   */ \
+  \
+  z5 = vec_add(z10s, z12s); \
+  z5 = vec_madds(z5, pw_F1847, pw_zero); \
+  \
+  tmp10 = vec_madds(z12s, pw_F1082, pw_zero); \
+  tmp10 = vec_sub(tmp10, z5); \
+  tmp12 = vec_madds(z10s, pw_MF1613, z5); \
+  tmp12 = vec_sub(tmp12, z10); \
+  \
+  tmp6 = vec_sub(tmp12, tmp7); \
+  tmp5 = vec_sub(tmp11, tmp6); \
+  tmp4 = vec_add(tmp10, tmp5); \
+  \
+  out0 = vec_add(tmp0, tmp7); \
+  out1 = vec_add(tmp1, tmp6); \
+  out2 = vec_add(tmp2, tmp5); \
+  out3 = vec_sub(tmp3, tmp4); \
+  out4 = vec_add(tmp3, tmp4); \
+  out5 = vec_sub(tmp2, tmp5); \
+  out6 = vec_sub(tmp1, tmp6); \
+  out7 = vec_sub(tmp0, tmp7); \
+}
+
+
+void jsimd_idct_ifast_altivec(void *dct_table_, JCOEFPTR coef_block,
+                              JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  short *dct_table = (short *)dct_table_;
+  int *outptr;
+
+  __vector short row0, row1, row2, row3, row4, row5, row6, row7,
+    col0, col1, col2, col3, col4, col5, col6, col7,
+    quant0, quant1, quant2, quant3, quant4, quant5, quant6, quant7,
+    tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13,
+    z5, z10, z10s, z11, z12s, z13,
+    out0, out1, out2, out3, out4, out5, out6, out7;
+  __vector signed char outb;
+
+  /* Constants */
+  __vector short pw_zero = { __8X(0) },
+    pw_F1414 = { __8X(F_1_414 << CONST_SHIFT) },
+    pw_F1847 = { __8X(F_1_847 << CONST_SHIFT) },
+    pw_MF1613 = { __8X(-F_1_613 << CONST_SHIFT) },
+    pw_F1082 = { __8X(F_1_082 << CONST_SHIFT) };
+  __vector unsigned short
+    pre_multiply_scale_bits = { __8X(PRE_MULTIPLY_SCALE_BITS) },
+    pass1_bits3 = { __8X(PASS1_BITS + 3) };
+  __vector signed char pb_centerjsamp = { __16X(CENTERJSAMPLE) };
+
+  /* Pass 1: process columns */
+
+  col0 = vec_ld(0, coef_block);
+  col1 = vec_ld(16, coef_block);
+  col2 = vec_ld(32, coef_block);
+  col3 = vec_ld(48, coef_block);
+  col4 = vec_ld(64, coef_block);
+  col5 = vec_ld(80, coef_block);
+  col6 = vec_ld(96, coef_block);
+  col7 = vec_ld(112, coef_block);
+
+  tmp1 = vec_or(col1, col2);
+  tmp2 = vec_or(col3, col4);
+  tmp1 = vec_or(tmp1, tmp2);
+  tmp3 = vec_or(col5, col6);
+  tmp3 = vec_or(tmp3, col7);
+  tmp1 = vec_or(tmp1, tmp3);
+
+  quant0 = vec_ld(0, dct_table);
+  col0 = vec_mladd(col0, quant0, pw_zero);
+
+  if (vec_all_eq(tmp1, pw_zero)) {
+    /* AC terms all zero */
+
+    row0 = vec_splat(col0, 0);
+    row1 = vec_splat(col0, 1);
+    row2 = vec_splat(col0, 2);
+    row3 = vec_splat(col0, 3);
+    row4 = vec_splat(col0, 4);
+    row5 = vec_splat(col0, 5);
+    row6 = vec_splat(col0, 6);
+    row7 = vec_splat(col0, 7);
+
+  } else {
+
+    quant1 = vec_ld(16, dct_table);
+    quant2 = vec_ld(32, dct_table);
+    quant3 = vec_ld(48, dct_table);
+    quant4 = vec_ld(64, dct_table);
+    quant5 = vec_ld(80, dct_table);
+    quant6 = vec_ld(96, dct_table);
+    quant7 = vec_ld(112, dct_table);
+
+    col1 = vec_mladd(col1, quant1, pw_zero);
+    col2 = vec_mladd(col2, quant2, pw_zero);
+    col3 = vec_mladd(col3, quant3, pw_zero);
+    col4 = vec_mladd(col4, quant4, pw_zero);
+    col5 = vec_mladd(col5, quant5, pw_zero);
+    col6 = vec_mladd(col6, quant6, pw_zero);
+    col7 = vec_mladd(col7, quant7, pw_zero);
+
+    DO_IDCT(col);
+
+    TRANSPOSE(out, row);
+  }
+
+  /* Pass 2: process rows */
+
+  DO_IDCT(row);
+
+  out0 = vec_sra(out0, pass1_bits3);
+  out1 = vec_sra(out1, pass1_bits3);
+  out2 = vec_sra(out2, pass1_bits3);
+  out3 = vec_sra(out3, pass1_bits3);
+  out4 = vec_sra(out4, pass1_bits3);
+  out5 = vec_sra(out5, pass1_bits3);
+  out6 = vec_sra(out6, pass1_bits3);
+  out7 = vec_sra(out7, pass1_bits3);
+
+  TRANSPOSE(out, col);
+
+  outb = vec_packs(col0, col0);
+  outb = vec_add(outb, pb_centerjsamp);
+  outptr = (int *)(output_buf[0] + output_col);
+  vec_ste((__vector int)outb, 0, outptr);
+  vec_ste((__vector int)outb, 4, outptr);
+
+  outb = vec_packs(col1, col1);
+  outb = vec_add(outb, pb_centerjsamp);
+  outptr = (int *)(output_buf[1] + output_col);
+  vec_ste((__vector int)outb, 0, outptr);
+  vec_ste((__vector int)outb, 4, outptr);
+
+  outb = vec_packs(col2, col2);
+  outb = vec_add(outb, pb_centerjsamp);
+  outptr = (int *)(output_buf[2] + output_col);
+  vec_ste((__vector int)outb, 0, outptr);
+  vec_ste((__vector int)outb, 4, outptr);
+
+  outb = vec_packs(col3, col3);
+  outb = vec_add(outb, pb_centerjsamp);
+  outptr = (int *)(output_buf[3] + output_col);
+  vec_ste((__vector int)outb, 0, outptr);
+  vec_ste((__vector int)outb, 4, outptr);
+
+  outb = vec_packs(col4, col4);
+  outb = vec_add(outb, pb_centerjsamp);
+  outptr = (int *)(output_buf[4] + output_col);
+  vec_ste((__vector int)outb, 0, outptr);
+  vec_ste((__vector int)outb, 4, outptr);
+
+  outb = vec_packs(col5, col5);
+  outb = vec_add(outb, pb_centerjsamp);
+  outptr = (int *)(output_buf[5] + output_col);
+  vec_ste((__vector int)outb, 0, outptr);
+  vec_ste((__vector int)outb, 4, outptr);
+
+  outb = vec_packs(col6, col6);
+  outb = vec_add(outb, pb_centerjsamp);
+  outptr = (int *)(output_buf[6] + output_col);
+  vec_ste((__vector int)outb, 0, outptr);
+  vec_ste((__vector int)outb, 4, outptr);
+
+  outb = vec_packs(col7, col7);
+  outb = vec_add(outb, pb_centerjsamp);
+  outptr = (int *)(output_buf[7] + output_col);
+  vec_ste((__vector int)outb, 0, outptr);
+  vec_ste((__vector int)outb, 4, outptr);
+}
diff --git a/media/libjpeg/simd/powerpc/jidctint-altivec.c b/media/libjpeg/simd/powerpc/jidctint-altivec.c
new file mode 100644
index 0000000000..60e619f11d
--- /dev/null
+++ b/media/libjpeg/simd/powerpc/jidctint-altivec.c
@@ -0,0 +1,357 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014-2015, 2020, D. R. Commander.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* ACCURATE INTEGER INVERSE DCT */
+
+#include "jsimd_altivec.h"
+
+
+#define F_0_298  2446   /* FIX(0.298631336) */
+#define F_0_390  3196   /* FIX(0.390180644) */
+#define F_0_541  4433   /* FIX(0.541196100) */
+#define F_0_765  6270   /* FIX(0.765366865) */
+#define F_0_899  7373   /* FIX(0.899976223) */
+#define F_1_175  9633   /* FIX(1.175875602) */
+#define F_1_501  12299  /* FIX(1.501321110) */
+#define F_1_847  15137  /* FIX(1.847759065) */
+#define F_1_961  16069  /* FIX(1.961570560) */
+#define F_2_053  16819  /* FIX(2.053119869) */
+#define F_2_562  20995  /* FIX(2.562915447) */
+#define F_3_072  25172  /* FIX(3.072711026) */
+
+#define CONST_BITS  13
+#define PASS1_BITS  2
+#define DESCALE_P1  (CONST_BITS - PASS1_BITS)
+#define DESCALE_P2  (CONST_BITS + PASS1_BITS + 3)
+
+
+#define DO_IDCT(in, PASS) { \
+  /* Even part \
+   * \
+   * (Original) \
+   * z1 = (z2 + z3) * 0.541196100; \
+   * tmp2 = z1 + z3 * -1.847759065; \
+   * tmp3 = z1 + z2 * 0.765366865; \
+   * \
+   * (This implementation) \
+   * tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); \
+   * tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; \
+   */ \
+  \
+  in##26l = vec_mergeh(in##2, in##6); \
+  in##26h = vec_mergel(in##2, in##6); \
+  \
+  tmp3l = vec_msums(in##26l, pw_f130_f054, pd_zero); \
+  tmp3h = vec_msums(in##26h, pw_f130_f054, pd_zero); \
+  tmp2l = vec_msums(in##26l, pw_f054_mf130, pd_zero); \
+  tmp2h = vec_msums(in##26h, pw_f054_mf130, pd_zero); \
+  \
+  tmp0 = vec_add(in##0, in##4); \
+  tmp1 = vec_sub(in##0, in##4); \
+  \
+  tmp0l = vec_unpackh(tmp0); \
+  tmp0h = vec_unpackl(tmp0); \
+  tmp0l = vec_sl(tmp0l, const_bits); \
+  tmp0h = vec_sl(tmp0h, const_bits); \
+  tmp0l = vec_add(tmp0l, pd_descale_p##PASS); \
+  tmp0h = vec_add(tmp0h, pd_descale_p##PASS); \
+  \
+  tmp10l = vec_add(tmp0l, tmp3l); \
+  tmp10h = vec_add(tmp0h, tmp3h); \
+  tmp13l = vec_sub(tmp0l, tmp3l); \
+  tmp13h = vec_sub(tmp0h, tmp3h); \
+  \
+  tmp1l = vec_unpackh(tmp1); \
+  tmp1h = vec_unpackl(tmp1); \
+  tmp1l = vec_sl(tmp1l, const_bits); \
+  tmp1h = vec_sl(tmp1h, const_bits); \
+  tmp1l = vec_add(tmp1l, pd_descale_p##PASS); \
+  tmp1h = vec_add(tmp1h, pd_descale_p##PASS); \
+  \
+  tmp11l = vec_add(tmp1l, tmp2l); \
+  tmp11h = vec_add(tmp1h, tmp2h); \
+  tmp12l = vec_sub(tmp1l, tmp2l); \
+  tmp12h = vec_sub(tmp1h, tmp2h); \
+  \
+  /* Odd part */ \
+  \
+  z3 = vec_add(in##3, in##7); \
+  z4 = vec_add(in##1, in##5); \
+  \
+  /* (Original) \
+   * z5 = (z3 + z4) * 1.175875602; \
+   * z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644; \
+   * z3 += z5;  z4 += z5; \
+   * \
+   * (This implementation) \
+   * z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; \
+   * z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); \
+   */ \
+  \
+  z34l = vec_mergeh(z3, z4); \
+  z34h = vec_mergel(z3, z4); \
+  \
+  z3l = vec_msums(z34l, pw_mf078_f117, pd_zero); \
+  z3h = vec_msums(z34h, pw_mf078_f117, pd_zero); \
+  z4l = vec_msums(z34l, pw_f117_f078, pd_zero); \
+  z4h = vec_msums(z34h, pw_f117_f078, pd_zero); \
+  \
+  /* (Original) \
+   * z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2; \
+   * tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869; \
+   * tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110; \
+   * z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447; \
+   * tmp0 += z1 + z3;  tmp1 += z2 + z4; \
+   * tmp2 += z2 + z3;  tmp3 += z1 + z4; \
+   * \
+   * (This implementation) \
+   * tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223; \
+   * tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447; \
+   * tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447); \
+   * tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223); \
+   * tmp0 += z3;  tmp1 += z4; \
+   * tmp2 += z3;  tmp3 += z4; \
+   */ \
+  \
+  in##71l = vec_mergeh(in##7, in##1); \
+  in##71h = vec_mergel(in##7, in##1); \
+  \
+  tmp0l = vec_msums(in##71l, pw_mf060_mf089, z3l); \
+  tmp0h = vec_msums(in##71h, pw_mf060_mf089, z3h); \
+  tmp3l = vec_msums(in##71l, pw_mf089_f060, z4l); \
+  tmp3h = vec_msums(in##71h, pw_mf089_f060, z4h); \
+  \
+  in##53l = vec_mergeh(in##5, in##3); \
+  in##53h = vec_mergel(in##5, in##3); \
+  \
+  tmp1l = vec_msums(in##53l, pw_mf050_mf256, z4l); \
+  tmp1h = vec_msums(in##53h, pw_mf050_mf256, z4h); \
+  tmp2l = vec_msums(in##53l, pw_mf256_f050, z3l); \
+  tmp2h = vec_msums(in##53h, pw_mf256_f050, z3h); \
+  \
+  /* Final output stage */ \
+  \
+  out0l = vec_add(tmp10l, tmp3l); \
+  out0h = vec_add(tmp10h, tmp3h); \
+  out7l = vec_sub(tmp10l, tmp3l); \
+  out7h = vec_sub(tmp10h, tmp3h); \
+  \
+  out0l = vec_sra(out0l, descale_p##PASS); \
+  out0h = vec_sra(out0h, descale_p##PASS); \
+  out7l = vec_sra(out7l, descale_p##PASS); \
+  out7h = vec_sra(out7h, descale_p##PASS); \
+  \
+  out0 = vec_pack(out0l, out0h); \
+  out7 = vec_pack(out7l, out7h); \
+  \
+  out1l = vec_add(tmp11l, tmp2l); \
+  out1h = vec_add(tmp11h, tmp2h); \
+  out6l = vec_sub(tmp11l, tmp2l); \
+  out6h = vec_sub(tmp11h, tmp2h); \
+  \
+  out1l = vec_sra(out1l, descale_p##PASS); \
+  out1h = vec_sra(out1h, descale_p##PASS); \
+  out6l = vec_sra(out6l, descale_p##PASS); \
+  out6h = vec_sra(out6h, descale_p##PASS); \
+  \
+  out1 = vec_pack(out1l, out1h); \
+  out6 = vec_pack(out6l, out6h); \
+  \
+  out2l = vec_add(tmp12l, tmp1l); \
+  out2h = vec_add(tmp12h, tmp1h); \
+  out5l = vec_sub(tmp12l, tmp1l); \
+  out5h = vec_sub(tmp12h, tmp1h); \
+  \
+  out2l = vec_sra(out2l, descale_p##PASS); \
+  out2h = vec_sra(out2h, descale_p##PASS); \
+  out5l = vec_sra(out5l, descale_p##PASS); \
+  out5h = vec_sra(out5h, descale_p##PASS); \
+  \
+  out2 = vec_pack(out2l, out2h); \
+  out5 = vec_pack(out5l, out5h); \
+  \
+  out3l = vec_add(tmp13l, tmp0l); \
+  out3h = vec_add(tmp13h, tmp0h); \
+  out4l = vec_sub(tmp13l, tmp0l); \
+  out4h = vec_sub(tmp13h, tmp0h); \
+  \
+  out3l = vec_sra(out3l, descale_p##PASS); \
+  out3h = vec_sra(out3h, descale_p##PASS); \
+  out4l = vec_sra(out4l, descale_p##PASS); \
+  out4h = vec_sra(out4h, descale_p##PASS); \
+  \
+  out3 = vec_pack(out3l, out3h); \
+  out4 = vec_pack(out4l, out4h); \
+}
+
+
+void jsimd_idct_islow_altivec(void *dct_table_, JCOEFPTR coef_block,
+                              JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  short *dct_table = (short *)dct_table_;
+  int *outptr;
+
+  __vector short row0, row1, row2, row3, row4, row5, row6, row7,
+    col0, col1, col2, col3, col4, col5, col6, col7,
+    quant0, quant1, quant2, quant3, quant4, quant5, quant6, quant7,
+    tmp0, tmp1, tmp2, tmp3, z3, z4,
+    z34l, z34h, col71l, col71h, col26l, col26h, col53l, col53h,
+    row71l, row71h, row26l, row26h, row53l, row53h,
+    out0, out1, out2, out3, out4, out5, out6, out7;
+  __vector int tmp0l, tmp0h, tmp1l, tmp1h, tmp2l, tmp2h, tmp3l, tmp3h,
+    tmp10l, tmp10h, tmp11l, tmp11h, tmp12l, tmp12h, tmp13l, tmp13h,
+    z3l, z3h, z4l, z4h,
+    out0l, out0h, out1l, out1h, out2l, out2h, out3l, out3h, out4l, out4h,
+    out5l, out5h, out6l, out6h, out7l, out7h;
+  __vector signed char outb;
+
+  /* Constants */
+  __vector short pw_zero = { __8X(0) },
+    pw_f130_f054 = { __4X2(F_0_541 + F_0_765, F_0_541) },
+    pw_f054_mf130 = { __4X2(F_0_541, F_0_541 - F_1_847) },
+    pw_mf078_f117 = { __4X2(F_1_175 - F_1_961, F_1_175) },
+    pw_f117_f078 = { __4X2(F_1_175, F_1_175 - F_0_390) },
+    pw_mf060_mf089 = { __4X2(F_0_298 - F_0_899, -F_0_899) },
+    pw_mf089_f060 = { __4X2(-F_0_899, F_1_501 - F_0_899) },
+    pw_mf050_mf256 = { __4X2(F_2_053 - F_2_562, -F_2_562) },
+    pw_mf256_f050 = { __4X2(-F_2_562, F_3_072 - F_2_562) };
+  __vector unsigned short pass1_bits = { __8X(PASS1_BITS) };
+  __vector int pd_zero = { __4X(0) },
+    pd_descale_p1 = { __4X(1 << (DESCALE_P1 - 1)) },
+    pd_descale_p2 = { __4X(1 << (DESCALE_P2 - 1)) };
+  __vector unsigned int descale_p1 = { __4X(DESCALE_P1) },
+    descale_p2 = { __4X(DESCALE_P2) },
+    const_bits = { __4X(CONST_BITS) };
+  __vector signed char pb_centerjsamp = { __16X(CENTERJSAMPLE) };
+
+  /* Pass 1: process columns */
+
+  col0 = vec_ld(0, coef_block);
+  col1 = vec_ld(16, coef_block);
+  col2 = vec_ld(32, coef_block);
+  col3 = vec_ld(48, coef_block);
+  col4 = vec_ld(64, coef_block);
+  col5 = vec_ld(80, coef_block);
+  col6 = vec_ld(96, coef_block);
+  col7 = vec_ld(112, coef_block);
+
+  tmp1 = vec_or(col1, col2);
+  tmp2 = vec_or(col3, col4);
+  tmp1 = vec_or(tmp1, tmp2);
+  tmp3 = vec_or(col5, col6);
+  tmp3 = vec_or(tmp3, col7);
+  tmp1 = vec_or(tmp1, tmp3);
+
+  quant0 = vec_ld(0, dct_table);
+  col0 = vec_mladd(col0, quant0, pw_zero);
+
+  if (vec_all_eq(tmp1, pw_zero)) {
+    /* AC terms all zero */
+
+    col0 = vec_sl(col0, pass1_bits);
+
+    row0 = vec_splat(col0, 0);
+    row1 = vec_splat(col0, 1);
+    row2 = vec_splat(col0, 2);
+    row3 = vec_splat(col0, 3);
+    row4 = vec_splat(col0, 4);
+    row5 = vec_splat(col0, 5);
+    row6 = vec_splat(col0, 6);
+    row7 = vec_splat(col0, 7);
+
+  } else {
+
+    quant1 = vec_ld(16, dct_table);
+    quant2 = vec_ld(32, dct_table);
+    quant3 = vec_ld(48, dct_table);
+    quant4 = vec_ld(64, dct_table);
+    quant5 = vec_ld(80, dct_table);
+    quant6 = vec_ld(96, dct_table);
+    quant7 = vec_ld(112, dct_table);
+
+    col1 = vec_mladd(col1, quant1, pw_zero);
+    col2 = vec_mladd(col2, quant2, pw_zero);
+    col3 = vec_mladd(col3, quant3, pw_zero);
+    col4 = vec_mladd(col4, quant4, pw_zero);
+    col5 = vec_mladd(col5, quant5, pw_zero);
+    col6 = vec_mladd(col6, quant6, pw_zero);
+    col7 = vec_mladd(col7, quant7, pw_zero);
+
+    DO_IDCT(col, 1);
+
+    TRANSPOSE(out, row);
+  }
+
+  /* Pass 2: process rows */
+
+  DO_IDCT(row, 2);
+
+  TRANSPOSE(out, col);
+
+  outb = vec_packs(col0, col0);
+  outb = vec_add(outb, pb_centerjsamp);
+  outptr = (int *)(output_buf[0] + output_col);
+  vec_ste((__vector int)outb, 0, outptr);
+  vec_ste((__vector int)outb, 4, outptr);
+
+  outb = vec_packs(col1, col1);
+  outb = vec_add(outb, pb_centerjsamp);
+  outptr = (int *)(output_buf[1] + output_col);
+  vec_ste((__vector int)outb, 0, outptr);
+  vec_ste((__vector int)outb, 4, outptr);
+
+  outb = vec_packs(col2, col2);
+  outb = vec_add(outb, pb_centerjsamp);
+  outptr = (int *)(output_buf[2] + output_col);
+  vec_ste((__vector int)outb, 0, outptr);
+  vec_ste((__vector int)outb, 4, outptr);
+
+  outb = vec_packs(col3, col3);
+  outb = vec_add(outb, pb_centerjsamp);
+  outptr = (int *)(output_buf[3] + output_col);
+  vec_ste((__vector int)outb, 0, outptr);
+  vec_ste((__vector int)outb, 4, outptr);
+
+  outb = vec_packs(col4, col4);
+  outb = vec_add(outb, pb_centerjsamp);
+  outptr = (int *)(output_buf[4] + output_col);
+  vec_ste((__vector int)outb, 0, outptr);
+  vec_ste((__vector int)outb, 4, outptr);
+
+  outb = vec_packs(col5, col5);
+  outb = vec_add(outb, pb_centerjsamp);
+  outptr = (int *)(output_buf[5] + output_col);
+  vec_ste((__vector int)outb, 0, outptr);
+  vec_ste((__vector int)outb, 4, outptr);
+
+  outb = vec_packs(col6, col6);
+  outb = vec_add(outb, pb_centerjsamp);
+  outptr = (int *)(output_buf[6] + output_col);
+  vec_ste((__vector int)outb, 0, outptr);
+  vec_ste((__vector int)outb, 4, outptr);
+
+  outb = vec_packs(col7, col7);
+  outb = vec_add(outb, pb_centerjsamp);
+  outptr = (int *)(output_buf[7] + output_col);
+  vec_ste((__vector int)outb, 0, outptr);
+  vec_ste((__vector int)outb, 4, outptr);
+}
diff --git a/media/libjpeg/simd/powerpc/jquanti-altivec.c b/media/libjpeg/simd/powerpc/jquanti-altivec.c
new file mode 100644
index 0000000000..7d6e32542b
--- /dev/null
+++ b/media/libjpeg/simd/powerpc/jquanti-altivec.c
@@ -0,0 +1,250 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014-2015, D. R. Commander.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* INTEGER QUANTIZATION AND SAMPLE CONVERSION */
+
+#include "jsimd_altivec.h"
+
+
+/* NOTE: The address will either be aligned or offset by 8 bytes, so we can
+ * always get the data we want by using a single vector load (although we may
+ * have to permute the result.)
+ */
+#if __BIG_ENDIAN__
+
+#define LOAD_ROW(row) { \
+  elemptr = sample_data[row] + start_col; \
+  in##row = vec_ld(0, elemptr); \
+  if ((size_t)elemptr & 15) \
+    in##row = vec_perm(in##row, in##row, vec_lvsl(0, elemptr)); \
+}
+
+#else
+
+#define LOAD_ROW(row) { \
+  elemptr = sample_data[row] + start_col; \
+  in##row = vec_vsx_ld(0, elemptr); \
+}
+
+#endif
+
+
+void jsimd_convsamp_altivec(JSAMPARRAY sample_data, JDIMENSION start_col,
+                            DCTELEM *workspace)
+{
+  JSAMPROW elemptr;
+
+  __vector unsigned char in0, in1, in2, in3, in4, in5, in6, in7;
+  __vector short out0, out1, out2, out3, out4, out5, out6, out7;
+
+  /* Constants */
+  __vector short pw_centerjsamp = { __8X(CENTERJSAMPLE) };
+  __vector unsigned char pb_zero = { __16X(0) };
+
+  LOAD_ROW(0);
+  LOAD_ROW(1);
+  LOAD_ROW(2);
+  LOAD_ROW(3);
+  LOAD_ROW(4);
+  LOAD_ROW(5);
+  LOAD_ROW(6);
+  LOAD_ROW(7);
+
+  out0 = (__vector short)VEC_UNPACKHU(in0);
+  out1 = (__vector short)VEC_UNPACKHU(in1);
+  out2 = (__vector short)VEC_UNPACKHU(in2);
+  out3 = (__vector short)VEC_UNPACKHU(in3);
+  out4 = (__vector short)VEC_UNPACKHU(in4);
+  out5 = (__vector short)VEC_UNPACKHU(in5);
+  out6 = (__vector short)VEC_UNPACKHU(in6);
+  out7 = (__vector short)VEC_UNPACKHU(in7);
+
+  out0 = vec_sub(out0, pw_centerjsamp);
+  out1 = vec_sub(out1, pw_centerjsamp);
+  out2 = vec_sub(out2, pw_centerjsamp);
+  out3 = vec_sub(out3, pw_centerjsamp);
+  out4 = vec_sub(out4, pw_centerjsamp);
+  out5 = vec_sub(out5, pw_centerjsamp);
+  out6 = vec_sub(out6, pw_centerjsamp);
+  out7 = vec_sub(out7, pw_centerjsamp);
+
+  vec_st(out0, 0, workspace);
+  vec_st(out1, 16, workspace);
+  vec_st(out2, 32, workspace);
+  vec_st(out3, 48, workspace);
+  vec_st(out4, 64, workspace);
+  vec_st(out5, 80, workspace);
+  vec_st(out6, 96, workspace);
+  vec_st(out7, 112, workspace);
+}
+
+
+#define WORD_BIT  16
+
+/* There is no AltiVec 16-bit unsigned multiply instruction, hence this.
+   We basically need an unsigned equivalent of vec_madds(). */
+
+#define MULTIPLY(vs0, vs1, out) { \
+  tmpe = vec_mule((__vector unsigned short)vs0, \
+                  (__vector unsigned short)vs1); \
+  tmpo = vec_mulo((__vector unsigned short)vs0, \
+                  (__vector unsigned short)vs1); \
+  out = (__vector short)vec_perm((__vector unsigned short)tmpe, \
+                                 (__vector unsigned short)tmpo, \
+                                 shift_pack_index); \
+}
+
+void jsimd_quantize_altivec(JCOEFPTR coef_block, DCTELEM *divisors,
+                            DCTELEM *workspace)
+{
+  __vector short row0, row1, row2, row3, row4, row5, row6, row7,
+    row0s, row1s, row2s, row3s, row4s, row5s, row6s, row7s,
+    corr0, corr1, corr2, corr3, corr4, corr5, corr6, corr7,
+    recip0, recip1, recip2, recip3, recip4, recip5, recip6, recip7,
+    scale0, scale1, scale2, scale3, scale4, scale5, scale6, scale7;
+  __vector unsigned int tmpe, tmpo;
+
+  /* Constants */
+  __vector unsigned short pw_word_bit_m1 = { __8X(WORD_BIT - 1) };
+#if __BIG_ENDIAN__
+  __vector unsigned char shift_pack_index =
+    {  0,  1, 16, 17,  4,  5, 20, 21,  8,  9, 24, 25, 12, 13, 28, 29 };
+#else
+  __vector unsigned char shift_pack_index =
+    {  2,  3, 18, 19,  6,  7, 22, 23, 10, 11, 26, 27, 14, 15, 30, 31 };
+#endif
+
+  row0 = vec_ld(0, workspace);
+  row1 = vec_ld(16, workspace);
+  row2 = vec_ld(32, workspace);
+  row3 = vec_ld(48, workspace);
+  row4 = vec_ld(64, workspace);
+  row5 = vec_ld(80, workspace);
+  row6 = vec_ld(96, workspace);
+  row7 = vec_ld(112, workspace);
+
+  /* Branch-less absolute value */
+  row0s = vec_sra(row0, pw_word_bit_m1);
+  row1s = vec_sra(row1, pw_word_bit_m1);
+  row2s = vec_sra(row2, pw_word_bit_m1);
+  row3s = vec_sra(row3, pw_word_bit_m1);
+  row4s = vec_sra(row4, pw_word_bit_m1);
+  row5s = vec_sra(row5, pw_word_bit_m1);
+  row6s = vec_sra(row6, pw_word_bit_m1);
+  row7s = vec_sra(row7, pw_word_bit_m1);
+  row0 = vec_xor(row0, row0s);
+  row1 = vec_xor(row1, row1s);
+  row2 = vec_xor(row2, row2s);
+  row3 = vec_xor(row3, row3s);
+  row4 = vec_xor(row4, row4s);
+  row5 = vec_xor(row5, row5s);
+  row6 = vec_xor(row6, row6s);
+  row7 = vec_xor(row7, row7s);
+  row0 = vec_sub(row0, row0s);
+  row1 = vec_sub(row1, row1s);
+  row2 = vec_sub(row2, row2s);
+  row3 = vec_sub(row3, row3s);
+  row4 = vec_sub(row4, row4s);
+  row5 = vec_sub(row5, row5s);
+  row6 = vec_sub(row6, row6s);
+  row7 = vec_sub(row7, row7s);
+
+  corr0 = vec_ld(DCTSIZE2 * 2, divisors);
+  corr1 = vec_ld(DCTSIZE2 * 2 + 16, divisors);
+  corr2 = vec_ld(DCTSIZE2 * 2 + 32, divisors);
+  corr3 = vec_ld(DCTSIZE2 * 2 + 48, divisors);
+  corr4 = vec_ld(DCTSIZE2 * 2 + 64, divisors);
+  corr5 = vec_ld(DCTSIZE2 * 2 + 80, divisors);
+  corr6 = vec_ld(DCTSIZE2 * 2 + 96, divisors);
+  corr7 = vec_ld(DCTSIZE2 * 2 + 112, divisors);
+
+  row0 = vec_add(row0, corr0);
+  row1 = vec_add(row1, corr1);
+  row2 = vec_add(row2, corr2);
+  row3 = vec_add(row3, corr3);
+  row4 = vec_add(row4, corr4);
+  row5 = vec_add(row5, corr5);
+  row6 = vec_add(row6, corr6);
+  row7 = vec_add(row7, corr7);
+
+  recip0 = vec_ld(0, divisors);
+  recip1 = vec_ld(16, divisors);
+  recip2 = vec_ld(32, divisors);
+  recip3 = vec_ld(48, divisors);
+  recip4 = vec_ld(64, divisors);
+  recip5 = vec_ld(80, divisors);
+  recip6 = vec_ld(96, divisors);
+  recip7 = vec_ld(112, divisors);
+
+  MULTIPLY(row0, recip0, row0);
+  MULTIPLY(row1, recip1, row1);
+  MULTIPLY(row2, recip2, row2);
+  MULTIPLY(row3, recip3, row3);
+  MULTIPLY(row4, recip4, row4);
+  MULTIPLY(row5, recip5, row5);
+  MULTIPLY(row6, recip6, row6);
+  MULTIPLY(row7, recip7, row7);
+
+  scale0 = vec_ld(DCTSIZE2 * 4, divisors);
+  scale1 = vec_ld(DCTSIZE2 * 4 + 16, divisors);
+  scale2 = vec_ld(DCTSIZE2 * 4 + 32, divisors);
+  scale3 = vec_ld(DCTSIZE2 * 4 + 48, divisors);
+  scale4 = vec_ld(DCTSIZE2 * 4 + 64, divisors);
+  scale5 = vec_ld(DCTSIZE2 * 4 + 80, divisors);
+  scale6 = vec_ld(DCTSIZE2 * 4 + 96, divisors);
+  scale7 = vec_ld(DCTSIZE2 * 4 + 112, divisors);
+
+  MULTIPLY(row0, scale0, row0);
+  MULTIPLY(row1, scale1, row1);
+  MULTIPLY(row2, scale2, row2);
+  MULTIPLY(row3, scale3, row3);
+  MULTIPLY(row4, scale4, row4);
+  MULTIPLY(row5, scale5, row5);
+  MULTIPLY(row6, scale6, row6);
+  MULTIPLY(row7, scale7, row7);
+
+  row0 = vec_xor(row0, row0s);
+  row1 = vec_xor(row1, row1s);
+  row2 = vec_xor(row2, row2s);
+  row3 = vec_xor(row3, row3s);
+  row4 = vec_xor(row4, row4s);
+  row5 = vec_xor(row5, row5s);
+  row6 = vec_xor(row6, row6s);
+  row7 = vec_xor(row7, row7s);
+  row0 = vec_sub(row0, row0s);
+  row1 = vec_sub(row1, row1s);
+  row2 = vec_sub(row2, row2s);
+  row3 = vec_sub(row3, row3s);
+  row4 = vec_sub(row4, row4s);
+  row5 = vec_sub(row5, row5s);
+  row6 = vec_sub(row6, row6s);
+  row7 = vec_sub(row7, row7s);
+
+  vec_st(row0, 0, coef_block);
+  vec_st(row1, 16, coef_block);
+  vec_st(row2, 32, coef_block);
+  vec_st(row3, 48, coef_block);
+  vec_st(row4, 64, coef_block);
+  vec_st(row5, 80, coef_block);
+  vec_st(row6, 96, coef_block);
+  vec_st(row7, 112, coef_block);
+}
diff --git a/media/libjpeg/simd/powerpc/jsimd.c b/media/libjpeg/simd/powerpc/jsimd.c
new file mode 100644
index 0000000000..461f603633
--- /dev/null
+++ b/media/libjpeg/simd/powerpc/jsimd.c
@@ -0,0 +1,884 @@
+/*
+ * jsimd_powerpc.c
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2009-2011, 2014-2016, 2018, 2022, D. R. Commander.
+ * Copyright (C) 2015-2016, 2018, 2022, Matthieu Darbois.
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library,
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * For conditions of distribution and use, see copyright notice in jsimdext.inc
+ *
+ * This file contains the interface between the "normal" portions
+ * of the library and the SIMD implementations when running on a
+ * PowerPC architecture.
+ */
+
+#ifdef __amigaos4__
+/* This must be defined first as it re-defines GLOBAL otherwise */
+#include <proto/exec.h>
+#endif
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+
+#include <ctype.h>
+
+#if defined(__APPLE__)
+#include <sys/types.h>
+#include <sys/sysctl.h>
+#elif defined(__OpenBSD__)
+#include <sys/param.h>
+#include <sys/sysctl.h>
+#include <machine/cpu.h>
+#elif defined(__FreeBSD__)
+#include <machine/cpu.h>
+#include <sys/auxv.h>
+#endif
+
+static THREAD_LOCAL unsigned int simd_support = ~0;
+
+#if !defined(__ALTIVEC__) && (defined(__linux__) || defined(ANDROID) || defined(__ANDROID__))
+
+#define SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT  (1024 * 1024)
+
+LOCAL(int)
+check_feature(char *buffer, char *feature)
+{
+  char *p;
+
+  if (*feature == 0)
+    return 0;
+  if (strncmp(buffer, "cpu", 3) != 0)
+    return 0;
+  buffer += 3;
+  while (isspace(*buffer))
+    buffer++;
+
+  /* Check if 'feature' is present in the buffer as a separate word */
+  while ((p = strstr(buffer, feature))) {
+    if (p > buffer && !isspace(*(p - 1))) {
+      buffer++;
+      continue;
+    }
+    p += strlen(feature);
+    if (*p != 0 && !isspace(*p)) {
+      buffer++;
+      continue;
+    }
+    return 1;
+  }
+  return 0;
+}
+
+LOCAL(int)
+parse_proc_cpuinfo(int bufsize)
+{
+  char *buffer = (char *)malloc(bufsize);
+  FILE *fd;
+
+  simd_support = 0;
+
+  if (!buffer)
+    return 0;
+
+  fd = fopen("/proc/cpuinfo", "r");
+  if (fd) {
+    while (fgets(buffer, bufsize, fd)) {
+      if (!strchr(buffer, '\n') && !feof(fd)) {
+        /* "impossible" happened - insufficient size of the buffer! */
+        fclose(fd);
+        free(buffer);
+        return 0;
+      }
+      if (check_feature(buffer, "altivec"))
+        simd_support |= JSIMD_ALTIVEC;
+    }
+    fclose(fd);
+  }
+  free(buffer);
+  return 1;
+}
+
+#endif
+
+/*
+ * Check what SIMD accelerations are supported.
+ */
+LOCAL(void)
+init_simd(void)
+{
+#ifndef NO_GETENV
+  char *env = NULL;
+#endif
+#if !defined(__ALTIVEC__) && (defined(__linux__) || defined(ANDROID) || defined(__ANDROID__))
+  int bufsize = 1024; /* an initial guess for the line buffer size limit */
+#elif defined(__amigaos4__)
+  uint32 altivec = 0;
+#elif defined(__APPLE__)
+  int mib[2] = { CTL_HW, HW_VECTORUNIT };
+  int altivec;
+  size_t len = sizeof(altivec);
+#elif defined(__OpenBSD__)
+  int mib[2] = { CTL_MACHDEP, CPU_ALTIVEC };
+  int altivec;
+  size_t len = sizeof(altivec);
+#elif defined(__FreeBSD__)
+  unsigned long cpufeatures = 0;
+#endif
+
+  if (simd_support != ~0U)
+    return;
+
+  simd_support = 0;
+
+#if defined(__ALTIVEC__)
+  simd_support |= JSIMD_ALTIVEC;
+#elif defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
+  while (!parse_proc_cpuinfo(bufsize)) {
+    bufsize *= 2;
+    if (bufsize > SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT)
+      break;
+  }
+#elif defined(__amigaos4__)
+  IExec->GetCPUInfoTags(GCIT_VectorUnit, &altivec, TAG_DONE);
+  if (altivec == VECTORTYPE_ALTIVEC)
+    simd_support |= JSIMD_ALTIVEC;
+#elif defined(__APPLE__) || defined(__OpenBSD__)
+  if (sysctl(mib, 2, &altivec, &len, NULL, 0) == 0 && altivec != 0)
+    simd_support |= JSIMD_ALTIVEC;
+#elif defined(__FreeBSD__)
+  elf_aux_info(AT_HWCAP, &cpufeatures, sizeof(cpufeatures));
+  if (cpufeatures & PPC_FEATURE_HAS_ALTIVEC)
+    simd_support |= JSIMD_ALTIVEC;
+#endif
+
+#ifndef NO_GETENV
+  /* Force different settings through environment variables */
+  env = getenv("JSIMD_FORCEALTIVEC");
+  if ((env != NULL) && (strcmp(env, "1") == 0))
+    simd_support = JSIMD_ALTIVEC;
+  env = getenv("JSIMD_FORCENONE");
+  if ((env != NULL) && (strcmp(env, "1") == 0))
+    simd_support = 0;
+#endif
+}
+
+GLOBAL(int)
+jsimd_can_rgb_ycc(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+
+  if (simd_support & JSIMD_ALTIVEC)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_rgb_gray(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+
+  if (simd_support & JSIMD_ALTIVEC)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+
+  if (simd_support & JSIMD_ALTIVEC)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb565(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                      JSAMPIMAGE output_buf, JDIMENSION output_row,
+                      int num_rows)
+{
+  void (*altivecfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+  switch (cinfo->in_color_space) {
+  case JCS_EXT_RGB:
+    altivecfct = jsimd_extrgb_ycc_convert_altivec;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    altivecfct = jsimd_extrgbx_ycc_convert_altivec;
+    break;
+  case JCS_EXT_BGR:
+    altivecfct = jsimd_extbgr_ycc_convert_altivec;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    altivecfct = jsimd_extbgrx_ycc_convert_altivec;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    altivecfct = jsimd_extxbgr_ycc_convert_altivec;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    altivecfct = jsimd_extxrgb_ycc_convert_altivec;
+    break;
+  default:
+    altivecfct = jsimd_rgb_ycc_convert_altivec;
+    break;
+  }
+
+  altivecfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
+
+GLOBAL(void)
+jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                       JSAMPIMAGE output_buf, JDIMENSION output_row,
+                       int num_rows)
+{
+  void (*altivecfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+  switch (cinfo->in_color_space) {
+  case JCS_EXT_RGB:
+    altivecfct = jsimd_extrgb_gray_convert_altivec;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    altivecfct = jsimd_extrgbx_gray_convert_altivec;
+    break;
+  case JCS_EXT_BGR:
+    altivecfct = jsimd_extbgr_gray_convert_altivec;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    altivecfct = jsimd_extbgrx_gray_convert_altivec;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    altivecfct = jsimd_extxbgr_gray_convert_altivec;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    altivecfct = jsimd_extxrgb_gray_convert_altivec;
+    break;
+  default:
+    altivecfct = jsimd_rgb_gray_convert_altivec;
+    break;
+  }
+
+  altivecfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                      JDIMENSION input_row, JSAMPARRAY output_buf,
+                      int num_rows)
+{
+  void (*altivecfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
+
+  switch (cinfo->out_color_space) {
+  case JCS_EXT_RGB:
+    altivecfct = jsimd_ycc_extrgb_convert_altivec;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    altivecfct = jsimd_ycc_extrgbx_convert_altivec;
+    break;
+  case JCS_EXT_BGR:
+    altivecfct = jsimd_ycc_extbgr_convert_altivec;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    altivecfct = jsimd_ycc_extbgrx_convert_altivec;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    altivecfct = jsimd_ycc_extxbgr_convert_altivec;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    altivecfct = jsimd_ycc_extxrgb_convert_altivec;
+    break;
+  default:
+    altivecfct = jsimd_ycc_rgb_convert_altivec;
+    break;
+  }
+
+  altivecfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                         JDIMENSION input_row, JSAMPARRAY output_buf,
+                         int num_rows)
+{
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_downsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_ALTIVEC)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_downsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_ALTIVEC)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+                      JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  jsimd_h2v2_downsample_altivec(cinfo->image_width, cinfo->max_v_samp_factor,
+                                compptr->v_samp_factor,
+                                compptr->width_in_blocks, input_data,
+                                output_data);
+}
+
+GLOBAL(void)
+jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+                      JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  jsimd_h2v1_downsample_altivec(cinfo->image_width, cinfo->max_v_samp_factor,
+                                compptr->v_samp_factor,
+                                compptr->width_in_blocks, input_data,
+                                output_data);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_ALTIVEC)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_ALTIVEC)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                    JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  jsimd_h2v2_upsample_altivec(cinfo->max_v_samp_factor, cinfo->output_width,
+                              input_data, output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                    JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  jsimd_h2v1_upsample_altivec(cinfo->max_v_samp_factor, cinfo->output_width,
+                              input_data, output_data_ptr);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_fancy_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_ALTIVEC)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_fancy_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_ALTIVEC)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  jsimd_h2v2_fancy_upsample_altivec(cinfo->max_v_samp_factor,
+                                    compptr->downsampled_width, input_data,
+                                    output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  jsimd_h2v1_fancy_upsample_altivec(cinfo->max_v_samp_factor,
+                                    compptr->downsampled_width, input_data,
+                                    output_data_ptr);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_merged_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_ALTIVEC)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_merged_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_ALTIVEC)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                           JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+  void (*altivecfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+
+  switch (cinfo->out_color_space) {
+  case JCS_EXT_RGB:
+    altivecfct = jsimd_h2v2_extrgb_merged_upsample_altivec;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    altivecfct = jsimd_h2v2_extrgbx_merged_upsample_altivec;
+    break;
+  case JCS_EXT_BGR:
+    altivecfct = jsimd_h2v2_extbgr_merged_upsample_altivec;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    altivecfct = jsimd_h2v2_extbgrx_merged_upsample_altivec;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    altivecfct = jsimd_h2v2_extxbgr_merged_upsample_altivec;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    altivecfct = jsimd_h2v2_extxrgb_merged_upsample_altivec;
+    break;
+  default:
+    altivecfct = jsimd_h2v2_merged_upsample_altivec;
+    break;
+  }
+
+  altivecfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+}
+
+GLOBAL(void)
+jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                           JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+  void (*altivecfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+
+  switch (cinfo->out_color_space) {
+  case JCS_EXT_RGB:
+    altivecfct = jsimd_h2v1_extrgb_merged_upsample_altivec;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    altivecfct = jsimd_h2v1_extrgbx_merged_upsample_altivec;
+    break;
+  case JCS_EXT_BGR:
+    altivecfct = jsimd_h2v1_extbgr_merged_upsample_altivec;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    altivecfct = jsimd_h2v1_extbgrx_merged_upsample_altivec;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    altivecfct = jsimd_h2v1_extxbgr_merged_upsample_altivec;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    altivecfct = jsimd_h2v1_extxrgb_merged_upsample_altivec;
+    break;
+  default:
+    altivecfct = jsimd_h2v1_merged_upsample_altivec;
+    break;
+  }
+
+  altivecfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+}
+
+GLOBAL(int)
+jsimd_can_convsamp(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_ALTIVEC)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_convsamp_float(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_convsamp(JSAMPARRAY sample_data, JDIMENSION start_col,
+               DCTELEM *workspace)
+{
+  jsimd_convsamp_altivec(sample_data, start_col, workspace);
+}
+
+GLOBAL(void)
+jsimd_convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col,
+                     FAST_FLOAT *workspace)
+{
+}
+
+GLOBAL(int)
+jsimd_can_fdct_islow(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_ALTIVEC)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_ifast(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_ALTIVEC)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_float(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_fdct_islow(DCTELEM *data)
+{
+  jsimd_fdct_islow_altivec(data);
+}
+
+GLOBAL(void)
+jsimd_fdct_ifast(DCTELEM *data)
+{
+  jsimd_fdct_ifast_altivec(data);
+}
+
+GLOBAL(void)
+jsimd_fdct_float(FAST_FLOAT *data)
+{
+}
+
+GLOBAL(int)
+jsimd_can_quantize(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_ALTIVEC)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_quantize_float(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace)
+{
+  jsimd_quantize_altivec(coef_block, divisors, workspace);
+}
+
+GLOBAL(void)
+jsimd_quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors,
+                     FAST_FLOAT *workspace)
+{
+}
+
+GLOBAL(int)
+jsimd_can_idct_2x2(void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_4x4(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_idct_2x2(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+               JCOEFPTR coef_block, JSAMPARRAY output_buf,
+               JDIMENSION output_col)
+{
+}
+
+GLOBAL(void)
+jsimd_idct_4x4(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+               JCOEFPTR coef_block, JSAMPARRAY output_buf,
+               JDIMENSION output_col)
+{
+}
+
+GLOBAL(int)
+jsimd_can_idct_islow(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_ALTIVEC)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_ifast(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_ALTIVEC)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_float(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+  jsimd_idct_islow_altivec(compptr->dct_table, coef_block, output_buf,
+                           output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+  jsimd_idct_ifast_altivec(compptr->dct_table, coef_block, output_buf,
+                           output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_float(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+}
+
+GLOBAL(int)
+jsimd_can_huff_encode_one_block(void)
+{
+  return 0;
+}
+
+GLOBAL(JOCTET *)
+jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
+                            int last_dc_val, c_derived_tbl *dctbl,
+                            c_derived_tbl *actbl)
+{
+  return NULL;
+}
+
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_first_prepare(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_encode_mcu_AC_first_prepare(const JCOEF *block,
+                                  const int *jpeg_natural_order_start, int Sl,
+                                  int Al, UJCOEF *values, size_t *zerobits)
+{
+}
+
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_refine_prepare(void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
+                                   const int *jpeg_natural_order_start, int Sl,
+                                   int Al, UJCOEF *absvalues, size_t *bits)
+{
+  return 0;
+}
diff --git a/media/libjpeg/simd/powerpc/jsimd_altivec.h b/media/libjpeg/simd/powerpc/jsimd_altivec.h
new file mode 100644
index 0000000000..e8bdb06a54
--- /dev/null
+++ b/media/libjpeg/simd/powerpc/jsimd_altivec.h
@@ -0,0 +1,98 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014-2015, D. R. Commander.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include <altivec.h>
+
+
+/* Common code */
+
+#define __4X(a)      a, a, a, a
+#define __4X2(a, b)  a, b, a, b, a, b, a, b
+#define __8X(a)      __4X(a), __4X(a)
+#define __16X(a)     __8X(a), __8X(a)
+
+#define TRANSPOSE(row, col) { \
+  __vector short row04l, row04h, row15l, row15h, \
+                 row26l, row26h, row37l, row37h; \
+  __vector short col01e, col01o, col23e, col23o, \
+                 col45e, col45o, col67e, col67o; \
+  \
+                                       /* transpose coefficients (phase 1) */ \
+  row04l = vec_mergeh(row##0, row##4); /* row04l=(00 40 01 41 02 42 03 43) */ \
+  row04h = vec_mergel(row##0, row##4); /* row04h=(04 44 05 45 06 46 07 47) */ \
+  row15l = vec_mergeh(row##1, row##5); /* row15l=(10 50 11 51 12 52 13 53) */ \
+  row15h = vec_mergel(row##1, row##5); /* row15h=(14 54 15 55 16 56 17 57) */ \
+  row26l = vec_mergeh(row##2, row##6); /* row26l=(20 60 21 61 22 62 23 63) */ \
+  row26h = vec_mergel(row##2, row##6); /* row26h=(24 64 25 65 26 66 27 67) */ \
+  row37l = vec_mergeh(row##3, row##7); /* row37l=(30 70 31 71 32 72 33 73) */ \
+  row37h = vec_mergel(row##3, row##7); /* row37h=(34 74 35 75 36 76 37 77) */ \
+  \
+                                       /* transpose coefficients (phase 2) */ \
+  col01e = vec_mergeh(row04l, row26l); /* col01e=(00 20 40 60 01 21 41 61) */ \
+  col23e = vec_mergel(row04l, row26l); /* col23e=(02 22 42 62 03 23 43 63) */ \
+  col45e = vec_mergeh(row04h, row26h); /* col45e=(04 24 44 64 05 25 45 65) */ \
+  col67e = vec_mergel(row04h, row26h); /* col67e=(06 26 46 66 07 27 47 67) */ \
+  col01o = vec_mergeh(row15l, row37l); /* col01o=(10 30 50 70 11 31 51 71) */ \
+  col23o = vec_mergel(row15l, row37l); /* col23o=(12 32 52 72 13 33 53 73) */ \
+  col45o = vec_mergeh(row15h, row37h); /* col45o=(14 34 54 74 15 35 55 75) */ \
+  col67o = vec_mergel(row15h, row37h); /* col67o=(16 36 56 76 17 37 57 77) */ \
+  \
+                                       /* transpose coefficients (phase 3) */ \
+  col##0 = vec_mergeh(col01e, col01o); /* col0=(00 10 20 30 40 50 60 70) */ \
+  col##1 = vec_mergel(col01e, col01o); /* col1=(01 11 21 31 41 51 61 71) */ \
+  col##2 = vec_mergeh(col23e, col23o); /* col2=(02 12 22 32 42 52 62 72) */ \
+  col##3 = vec_mergel(col23e, col23o); /* col3=(03 13 23 33 43 53 63 73) */ \
+  col##4 = vec_mergeh(col45e, col45o); /* col4=(04 14 24 34 44 54 64 74) */ \
+  col##5 = vec_mergel(col45e, col45o); /* col5=(05 15 25 35 45 55 65 75) */ \
+  col##6 = vec_mergeh(col67e, col67o); /* col6=(06 16 26 36 46 56 66 76) */ \
+  col##7 = vec_mergel(col67e, col67o); /* col7=(07 17 27 37 47 57 67 77) */ \
+}
+
+#ifndef min
+#define min(a, b)  ((a) < (b) ? (a) : (b))
+#endif
+
+
+/* Macros to abstract big/little endian bit twiddling */
+
+#if __BIG_ENDIAN__
+
+#define VEC_LD(a, b)     vec_ld(a, b)
+#define VEC_ST(a, b, c)  vec_st(a, b, c)
+#define VEC_UNPACKHU(a)  vec_mergeh(pb_zero, a)
+#define VEC_UNPACKLU(a)  vec_mergel(pb_zero, a)
+
+#else
+
+#define VEC_LD(a, b)     vec_vsx_ld(a, b)
+#define VEC_ST(a, b, c)  vec_vsx_st(a, b, c)
+#define VEC_UNPACKHU(a)  vec_mergeh(a, pb_zero)
+#define VEC_UNPACKLU(a)  vec_mergel(a, pb_zero)
+
+#endif
diff --git a/media/libjpeg/simd/x86_64/jccolext-avx2.asm b/media/libjpeg/simd/x86_64/jccolext-avx2.asm
new file mode 100644
index 0000000000..ffb527db00
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jccolext-avx2.asm
@@ -0,0 +1,559 @@
+;
+; jccolext.asm - colorspace conversion (64-bit AVX2)
+;
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_rgb_ycc_convert_avx2(JDIMENSION img_width, JSAMPARRAY input_buf,
+;                            JSAMPIMAGE output_buf, JDIMENSION output_row,
+;                            int num_rows);
+;
+
+; r10d = JDIMENSION img_width
+; r11 = JSAMPARRAY input_buf
+; r12 = JSAMPIMAGE output_buf
+; r13d = JDIMENSION output_row
+; r14d = int num_rows
+
+%define wk(i)   rbp - (WK_NUM - (i)) * SIZEOF_YMMWORD  ; ymmword wk[WK_NUM]
+%define WK_NUM  8
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_rgb_ycc_convert_avx2)
+
+EXTN(jsimd_rgb_ycc_convert_avx2):
+    push        rbp
+    mov         rax, rsp                     ; rax = original rbp
+    sub         rsp, byte 4
+    and         rsp, byte (-SIZEOF_YMMWORD)  ; align to 256 bits
+    mov         [rsp], rax
+    mov         rbp, rsp                     ; rbp = aligned rbp
+    lea         rsp, [wk(0)]
+    collect_args 5
+    push        rbx
+
+    mov         ecx, r10d
+    test        rcx, rcx
+    jz          near .return
+
+    push        rcx
+
+    mov         rsi, r12
+    mov         ecx, r13d
+    mov         rdip, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
+    mov         rbxp, JSAMPARRAY [rsi+1*SIZEOF_JSAMPARRAY]
+    mov         rdxp, JSAMPARRAY [rsi+2*SIZEOF_JSAMPARRAY]
+    lea         rdi, [rdi+rcx*SIZEOF_JSAMPROW]
+    lea         rbx, [rbx+rcx*SIZEOF_JSAMPROW]
+    lea         rdx, [rdx+rcx*SIZEOF_JSAMPROW]
+
+    pop         rcx
+
+    mov         rsi, r11
+    mov         eax, r14d
+    test        rax, rax
+    jle         near .return
+.rowloop:
+    push        rdx
+    push        rbx
+    push        rdi
+    push        rsi
+    push        rcx                     ; col
+
+    mov         rsip, JSAMPROW [rsi]    ; inptr
+    mov         rdip, JSAMPROW [rdi]    ; outptr0
+    mov         rbxp, JSAMPROW [rbx]    ; outptr1
+    mov         rdxp, JSAMPROW [rdx]    ; outptr2
+
+    cmp         rcx, byte SIZEOF_YMMWORD
+    jae         near .columnloop
+
+%if RGB_PIXELSIZE == 3  ; ---------------
+
+.column_ld1:
+    push        rax
+    push        rdx
+    lea         rcx, [rcx+rcx*2]        ; imul ecx,RGB_PIXELSIZE
+    test        cl, SIZEOF_BYTE
+    jz          short .column_ld2
+    sub         rcx, byte SIZEOF_BYTE
+    movzx       rax, byte [rsi+rcx]
+.column_ld2:
+    test        cl, SIZEOF_WORD
+    jz          short .column_ld4
+    sub         rcx, byte SIZEOF_WORD
+    movzx       rdx, word [rsi+rcx]
+    shl         rax, WORD_BIT
+    or          rax, rdx
+.column_ld4:
+    vmovd       xmmA, eax
+    pop         rdx
+    pop         rax
+    test        cl, SIZEOF_DWORD
+    jz          short .column_ld8
+    sub         rcx, byte SIZEOF_DWORD
+    vmovd       xmmF, XMM_DWORD [rsi+rcx]
+    vpslldq     xmmA, xmmA, SIZEOF_DWORD
+    vpor        xmmA, xmmA, xmmF
+.column_ld8:
+    test        cl, SIZEOF_MMWORD
+    jz          short .column_ld16
+    sub         rcx, byte SIZEOF_MMWORD
+    vmovq       xmmB, XMM_MMWORD [rsi+rcx]
+    vpslldq     xmmA, xmmA, SIZEOF_MMWORD
+    vpor        xmmA, xmmA, xmmB
+.column_ld16:
+    test        cl, SIZEOF_XMMWORD
+    jz          short .column_ld32
+    sub         rcx, byte SIZEOF_XMMWORD
+    vmovdqu     xmmB, XMM_MMWORD [rsi+rcx]
+    vperm2i128  ymmA, ymmA, ymmA, 1
+    vpor        ymmA, ymmB
+.column_ld32:
+    test        cl, SIZEOF_YMMWORD
+    jz          short .column_ld64
+    sub         rcx, byte SIZEOF_YMMWORD
+    vmovdqa     ymmF, ymmA
+    vmovdqu     ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+.column_ld64:
+    test        cl, 2*SIZEOF_YMMWORD
+    mov         rcx, SIZEOF_YMMWORD
+    jz          short .rgb_ycc_cnv
+    vmovdqa     ymmB, ymmA
+    vmovdqu     ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+    vmovdqu     ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
+    jmp         short .rgb_ycc_cnv
+
+.columnloop:
+    vmovdqu     ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+    vmovdqu     ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
+    vmovdqu     ymmB, YMMWORD [rsi+2*SIZEOF_YMMWORD]
+
+.rgb_ycc_cnv:
+    ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+    ;       15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+    ; ymmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+    ;       0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+    ; ymmB=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
+    ;       2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+
+    vmovdqu     ymmC, ymmA
+    vinserti128 ymmA, ymmF, xmmA, 0  ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+                                     ;       0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+    vinserti128 ymmC, ymmC, xmmB, 0  ; ymmC=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
+                                     ;       15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+    vinserti128 ymmB, ymmB, xmmF, 0  ; ymmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+                                     ;       2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+    vperm2i128  ymmF, ymmC, ymmC, 1  ; ymmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A
+                                     ;       1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q)
+
+    vmovdqa     ymmG, ymmA
+    vpslldq     ymmA, ymmA, 8     ; ymmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12
+                                  ;       22 03 13 23 04 14 24 05 0G 1G 2G 0H 1H 2H 0I 1I)
+    vpsrldq     ymmG, ymmG, 8     ; ymmG=(22 03 13 23 04 14 24 05 0G 1G 2G 0H 1H 2H 0I 1I
+                                  ;       2I 0J 1J 2J 0K 1K 2K 0L -- -- -- -- -- -- -- --)
+
+    vpunpckhbw  ymmA, ymmA, ymmF  ; ymmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A
+                                  ;       0G 0O 1G 1O 2G 2O 0H 0P 1H 1P 2H 2P 0I 0Q 1I 1Q)
+    vpslldq     ymmF, ymmF, 8     ; ymmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27
+                                  ;       08 18 28 09 19 29 0A 1A 1L 2L 0M 1M 2M 0N 1N 2N)
+
+    vpunpcklbw  ymmG, ymmG, ymmB  ; ymmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D
+                                  ;       2I 2Q 0J 0R 1J 1R 2J 2R 0K 0S 1K 1S 2K 2S 0L 0T)
+    vpunpckhbw  ymmF, ymmF, ymmB  ; ymmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F
+                                  ;       1L 1T 2L 2T 0M 0U 1M 1U 2M 2U 0N 0V 1N 1V 2N 2V)
+
+    vmovdqa     ymmD, ymmA
+    vpslldq     ymmA, ymmA, 8     ; ymmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09
+                                  ;       11 19 21 29 02 0A 12 1A 0G 0O 1G 1O 2G 2O 0H 0P)
+    vpsrldq     ymmD, ymmD, 8     ; ymmD=(11 19 21 29 02 0A 12 1A 0G 0O 1G 1O 2G 2O 0H 0P
+                                  ;       1H 1P 2H 2P 0I 0Q 1I 1Q -- -- -- -- -- -- -- --)
+
+    vpunpckhbw  ymmA, ymmA, ymmG  ; ymmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D
+                                  ;       0G 0K 0O 0S 1G 1K 1O 1S 2G 2K 2O 2S 0H 0L 0P 0T)
+    vpslldq     ymmG, ymmG, 8     ; ymmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B
+                                  ;       04 0C 14 1C 24 2C 05 0D 2I 2Q 0J 0R 1J 1R 2J 2R)
+
+    vpunpcklbw  ymmD, ymmD, ymmF  ; ymmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E
+                                  ;       1H 1L 1P 1T 2H 2L 2P 2T 0I 0M 0Q 0U 1I 1M 1Q 1U)
+    vpunpckhbw  ymmG, ymmG, ymmF  ; ymmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F
+                                  ;       2I 2M 2Q 2U 0J 0N 0R 0V 1J 1N 1R 1V 2J 2N 2R 2V)
+
+    vmovdqa     ymmE, ymmA
+    vpslldq     ymmA, ymmA, 8     ; ymmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C
+                                  ;       20 24 28 2C 01 05 09 0D 0G 0K 0O 0S 1G 1K 1O 1S)
+    vpsrldq     ymmE, ymmE, 8     ; ymmE=(20 24 28 2C 01 05 09 0D 0G 0K 0O 0S 1G 1K 1O 1S
+                                  ;       2G 2K 2O 2S 0H 0L 0P 0T -- -- -- -- -- -- -- --)
+
+    vpunpckhbw  ymmA, ymmA, ymmD  ; ymmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E
+                                  ;       0G 0I 0K 0M 0O 0Q 0S 0U 1G 1I 1K 1M 1O 1Q 1S 1U)
+    vpslldq     ymmD, ymmD, 8     ; ymmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D
+                                  ;       02 06 0A 0E 12 16 1A 1E 1H 1L 1P 1T 2H 2L 2P 2T)
+
+    vpunpcklbw  ymmE, ymmE, ymmG  ; ymmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F
+                                  ;       2G 2I 2K 2M 2O 2Q 2S 2U 0H 0J 0L 0N 0P 0R 0T 0V)
+    vpunpckhbw  ymmD, ymmD, ymmG  ; ymmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F
+                                  ;       1H 1J 1L 1N 1P 1R 1T 1V 2H 2J 2L 2N 2P 2R 2T 2V)
+
+    vpxor       ymmH, ymmH, ymmH
+
+    vmovdqa     ymmC, ymmA
+    vpunpcklbw  ymmA, ymmA, ymmH  ; ymmA=(00 02 04 06 08 0A 0C 0E 0G 0I 0K 0M 0O 0Q 0S 0U)
+    vpunpckhbw  ymmC, ymmC, ymmH  ; ymmC=(10 12 14 16 18 1A 1C 1E 1G 1I 1K 1M 1O 1Q 1S 1U)
+
+    vmovdqa     ymmB, ymmE
+    vpunpcklbw  ymmE, ymmE, ymmH  ; ymmE=(20 22 24 26 28 2A 2C 2E 2G 2I 2K 2M 2O 2Q 2S 2U)
+    vpunpckhbw  ymmB, ymmB, ymmH  ; ymmB=(01 03 05 07 09 0B 0D 0F 0H 0J 0L 0N 0P 0R 0T 0V)
+
+    vmovdqa     ymmF, ymmD
+    vpunpcklbw  ymmD, ymmD, ymmH  ; ymmD=(11 13 15 17 19 1B 1D 1F 1H 1J 1L 1N 1P 1R 1T 1V)
+    vpunpckhbw  ymmF, ymmF, ymmH  ; ymmF=(21 23 25 27 29 2B 2D 2F 2H 2J 2L 2N 2P 2R 2T 2V)
+
+%else  ; RGB_PIXELSIZE == 4 ; -----------
+
+.column_ld1:
+    test        cl, SIZEOF_XMMWORD/16
+    jz          short .column_ld2
+    sub         rcx, byte SIZEOF_XMMWORD/16
+    vmovd       xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE]
+.column_ld2:
+    test        cl, SIZEOF_XMMWORD/8
+    jz          short .column_ld4
+    sub         rcx, byte SIZEOF_XMMWORD/8
+    vmovq       xmmF, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
+    vpslldq     xmmA, xmmA, SIZEOF_MMWORD
+    vpor        xmmA, xmmA, xmmF
+.column_ld4:
+    test        cl, SIZEOF_XMMWORD/4
+    jz          short .column_ld8
+    sub         rcx, byte SIZEOF_XMMWORD/4
+    vmovdqa     xmmF, xmmA
+    vperm2i128  ymmF, ymmF, ymmF, 1
+    vmovdqu     xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
+    vpor        ymmA, ymmA, ymmF
+.column_ld8:
+    test        cl, SIZEOF_XMMWORD/2
+    jz          short .column_ld16
+    sub         rcx, byte SIZEOF_XMMWORD/2
+    vmovdqa     ymmF, ymmA
+    vmovdqu     ymmA, YMMWORD [rsi+rcx*RGB_PIXELSIZE]
+.column_ld16:
+    test        cl, SIZEOF_XMMWORD
+    mov         rcx, SIZEOF_YMMWORD
+    jz          short .rgb_ycc_cnv
+    vmovdqa     ymmE, ymmA
+    vmovdqa     ymmH, ymmF
+    vmovdqu     ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+    vmovdqu     ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
+    jmp         short .rgb_ycc_cnv
+
+.columnloop:
+    vmovdqu     ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+    vmovdqu     ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
+    vmovdqu     ymmE, YMMWORD [rsi+2*SIZEOF_YMMWORD]
+    vmovdqu     ymmH, YMMWORD [rsi+3*SIZEOF_YMMWORD]
+
+.rgb_ycc_cnv:
+    ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+    ;       04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+    ; ymmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+    ;       0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+    ; ymmE=(0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J
+    ;       0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+    ; ymmH=(0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R
+    ;       0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+
+    vmovdqa     ymmB, ymmA
+    vinserti128 ymmA, ymmA, xmmE, 1     ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+                                        ;       0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J)
+    vperm2i128  ymmE, ymmB, ymmE, 0x31  ; ymmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+                                        ;       0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+
+    vmovdqa     ymmB, ymmF
+    vinserti128 ymmF, ymmF, xmmH, 1     ; ymmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+                                        ;       0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R)
+    vperm2i128  ymmH, ymmB, ymmH, 0x31  ; ymmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F
+                                        ;       0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+
+    vmovdqa     ymmD, ymmA
+    vpunpcklbw  ymmA, ymmA, ymmE      ; ymmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35
+                                      ;       0G 0K 1G 1K 2G 2K 3G 3K 0H 0L 1H 1L 2H 2L 3H 3L)
+    vpunpckhbw  ymmD, ymmD, ymmE      ; ymmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37
+                                      ;       0I 0M 1I 1M 2I 2M 3I 3M 0J 0N 1J 1N 2J 2N 3J 3N)
+
+    vmovdqa     ymmC, ymmF
+    vpunpcklbw  ymmF, ymmF, ymmH      ; ymmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D
+                                      ;       0O 0S 1O 1S 2O 2S 3O 3S 0P 0T 1P 1T 2P 2T 3P 3T)
+    vpunpckhbw  ymmC, ymmC, ymmH      ; ymmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F
+                                      ;       0Q 0U 1Q 1U 2Q 2U 3Q 3U 0R 0V 1R 1V 2R 2V 3R 3V)
+
+    vmovdqa     ymmB, ymmA
+    vpunpcklwd  ymmA, ymmA, ymmF      ; ymmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C
+                                      ;       0G 0K 0O 0S 1G 1K 1O 1S 2G 2K 2O 2S 3G 3K 3O 3S)
+    vpunpckhwd  ymmB, ymmB, ymmF      ; ymmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D
+                                      ;       0H 0L 0P 0T 1H 1L 1P 1T 2H 2L 2P 2T 3H 3L 3P 3T)
+
+    vmovdqa     ymmG, ymmD
+    vpunpcklwd  ymmD, ymmD, ymmC      ; ymmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E
+                                      ;       0I 0M 0Q 0U 1I 1M 1Q 1U 2I 2M 2Q 2U 3I 3M 3Q 3U)
+    vpunpckhwd  ymmG, ymmG, ymmC      ; ymmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F
+                                      ;       0J 0N 0R 0V 1J 1N 1R 1V 2J 2N 2R 2V 3J 3N 3R 3V)
+
+    vmovdqa     ymmE, ymmA
+    vpunpcklbw  ymmA, ymmA, ymmD      ; ymmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E
+                                      ;       0G 0I 0K 0M 0O 0Q 0S 0U 1G 1I 1K 1M 1O 1Q 1S 1U)
+    vpunpckhbw  ymmE, ymmE, ymmD      ; ymmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E
+                                      ;       2G 2I 2K 2M 2O 2Q 2S 2U 3G 3I 3K 3M 3O 3Q 3S 3U)
+
+    vmovdqa     ymmH, ymmB
+    vpunpcklbw  ymmB, ymmB, ymmG      ; ymmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F
+                                      ;       0H 0J 0L 0N 0P 0R 0T 0V 1H 1J 1L 1N 1P 1R 1T 1V)
+    vpunpckhbw  ymmH, ymmH, ymmG      ; ymmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F
+                                      ;       2H 2J 2L 2N 2P 2R 2T 2V 3H 3J 3L 3N 3P 3R 3T 3V)
+
+    vpxor       ymmF, ymmF, ymmF
+
+    vmovdqa     ymmC, ymmA
+    vpunpcklbw  ymmA, ymmA, ymmF      ; ymmA=(00 02 04 06 08 0A 0C 0E 0G 0I 0K 0M 0O 0Q 0S 0U)
+    vpunpckhbw  ymmC, ymmC, ymmF      ; ymmC=(10 12 14 16 18 1A 1C 1E 1G 1I 1K 1M 1O 1Q 1S 1U)
+
+    vmovdqa     ymmD, ymmB
+    vpunpcklbw  ymmB, ymmB, ymmF      ; ymmB=(01 03 05 07 09 0B 0D 0F 0H 0J 0L 0N 0P 0R 0T 0V)
+    vpunpckhbw  ymmD, ymmD, ymmF      ; ymmD=(11 13 15 17 19 1B 1D 1F 1H 1J 1L 1N 1P 1R 1T 1V)
+
+    vmovdqa     ymmG, ymmE
+    vpunpcklbw  ymmE, ymmE, ymmF      ; ymmE=(20 22 24 26 28 2A 2C 2E 2G 2I 2K 2M 2O 2Q 2S 2U)
+    vpunpckhbw  ymmG, ymmG, ymmF      ; ymmG=(30 32 34 36 38 3A 3C 3E 3G 3I 3K 3M 3O 3Q 3S 3U)
+
+    vpunpcklbw  ymmF, ymmF, ymmH
+    vpunpckhbw  ymmH, ymmH, ymmH
+    vpsrlw      ymmF, ymmF, BYTE_BIT  ; ymmF=(21 23 25 27 29 2B 2D 2F 2H 2J 2L 2N 2P 2R 2T 2V)
+    vpsrlw      ymmH, ymmH, BYTE_BIT  ; ymmH=(31 33 35 37 39 3B 3D 3F 3H 3J 3L 3N 3P 3R 3T 3V)
+
+%endif  ; RGB_PIXELSIZE ; ---------------
+
+    ; ymm0=R(02468ACEGIKMOQSU)=RE, ymm2=G(02468ACEGIKMOQSU)=GE, ymm4=B(02468ACEGIKMOQSU)=BE
+    ; ymm1=R(13579BDFHJLNPRTV)=RO, ymm3=G(13579BDFHJLNPRTV)=GO, ymm5=B(13579BDFHJLNPRTV)=BO
+
+    ; (Original)
+    ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
+    ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+    ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+    ;
+    ; (This implementation)
+    ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+    ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+    ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+
+    vmovdqa     YMMWORD [wk(0)], ymm0   ; wk(0)=RE
+    vmovdqa     YMMWORD [wk(1)], ymm1   ; wk(1)=RO
+    vmovdqa     YMMWORD [wk(2)], ymm4   ; wk(2)=BE
+    vmovdqa     YMMWORD [wk(3)], ymm5   ; wk(3)=BO
+
+    vmovdqa     ymm6, ymm1
+    vpunpcklwd  ymm1, ymm1, ymm3
+    vpunpckhwd  ymm6, ymm6, ymm3
+    vmovdqa     ymm7, ymm1
+    vmovdqa     ymm4, ymm6
+    vpmaddwd    ymm1, ymm1, [rel PW_F0299_F0337]  ; ymm1=ROL*FIX(0.299)+GOL*FIX(0.337)
+    vpmaddwd    ymm6, ymm6, [rel PW_F0299_F0337]  ; ymm6=ROH*FIX(0.299)+GOH*FIX(0.337)
+    vpmaddwd    ymm7, ymm7, [rel PW_MF016_MF033]  ; ymm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
+    vpmaddwd    ymm4, ymm4, [rel PW_MF016_MF033]  ; ymm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
+
+    vmovdqa     YMMWORD [wk(4)], ymm1   ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
+    vmovdqa     YMMWORD [wk(5)], ymm6   ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+    vpxor       ymm1, ymm1, ymm1
+    vpxor       ymm6, ymm6, ymm6
+    vpunpcklwd  ymm1, ymm1, ymm5        ; ymm1=BOL
+    vpunpckhwd  ymm6, ymm6, ymm5        ; ymm6=BOH
+    vpsrld      ymm1, ymm1, 1           ; ymm1=BOL*FIX(0.500)
+    vpsrld      ymm6, ymm6, 1           ; ymm6=BOH*FIX(0.500)
+
+    vmovdqa     ymm5, [rel PD_ONEHALFM1_CJ]  ; ymm5=[PD_ONEHALFM1_CJ]
+
+    vpaddd      ymm7, ymm7, ymm1
+    vpaddd      ymm4, ymm4, ymm6
+    vpaddd      ymm7, ymm7, ymm5
+    vpaddd      ymm4, ymm4, ymm5
+    vpsrld      ymm7, ymm7, SCALEBITS   ; ymm7=CbOL
+    vpsrld      ymm4, ymm4, SCALEBITS   ; ymm4=CbOH
+    vpackssdw   ymm7, ymm7, ymm4        ; ymm7=CbO
+
+    vmovdqa     ymm1, YMMWORD [wk(2)]   ; ymm1=BE
+
+    vmovdqa     ymm6, ymm0
+    vpunpcklwd  ymm0, ymm0, ymm2
+    vpunpckhwd  ymm6, ymm6, ymm2
+    vmovdqa     ymm5, ymm0
+    vmovdqa     ymm4, ymm6
+    vpmaddwd    ymm0, ymm0, [rel PW_F0299_F0337]  ; ymm0=REL*FIX(0.299)+GEL*FIX(0.337)
+    vpmaddwd    ymm6, ymm6, [rel PW_F0299_F0337]  ; ymm6=REH*FIX(0.299)+GEH*FIX(0.337)
+    vpmaddwd    ymm5, ymm5, [rel PW_MF016_MF033]  ; ymm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
+    vpmaddwd    ymm4, ymm4, [rel PW_MF016_MF033]  ; ymm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
+
+    vmovdqa     YMMWORD [wk(6)], ymm0   ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
+    vmovdqa     YMMWORD [wk(7)], ymm6   ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
+
+    vpxor       ymm0, ymm0, ymm0
+    vpxor       ymm6, ymm6, ymm6
+    vpunpcklwd  ymm0, ymm0, ymm1        ; ymm0=BEL
+    vpunpckhwd  ymm6, ymm6, ymm1        ; ymm6=BEH
+    vpsrld      ymm0, ymm0, 1           ; ymm0=BEL*FIX(0.500)
+    vpsrld      ymm6, ymm6, 1           ; ymm6=BEH*FIX(0.500)
+
+    vmovdqa     ymm1, [rel PD_ONEHALFM1_CJ]  ; ymm1=[PD_ONEHALFM1_CJ]
+
+    vpaddd      ymm5, ymm5, ymm0
+    vpaddd      ymm4, ymm4, ymm6
+    vpaddd      ymm5, ymm5, ymm1
+    vpaddd      ymm4, ymm4, ymm1
+    vpsrld      ymm5, ymm5, SCALEBITS   ; ymm5=CbEL
+    vpsrld      ymm4, ymm4, SCALEBITS   ; ymm4=CbEH
+    vpackssdw   ymm5, ymm5, ymm4        ; ymm5=CbE
+
+    vpsllw      ymm7, ymm7, BYTE_BIT
+    vpor        ymm5, ymm5, ymm7        ; ymm5=Cb
+    vmovdqu     YMMWORD [rbx], ymm5     ; Save Cb
+
+    vmovdqa     ymm0, YMMWORD [wk(3)]   ; ymm0=BO
+    vmovdqa     ymm6, YMMWORD [wk(2)]   ; ymm6=BE
+    vmovdqa     ymm1, YMMWORD [wk(1)]   ; ymm1=RO
+
+    vmovdqa     ymm4, ymm0
+    vpunpcklwd  ymm0, ymm0, ymm3
+    vpunpckhwd  ymm4, ymm4, ymm3
+    vmovdqa     ymm7, ymm0
+    vmovdqa     ymm5, ymm4
+    vpmaddwd    ymm0, ymm0, [rel PW_F0114_F0250]  ; ymm0=BOL*FIX(0.114)+GOL*FIX(0.250)
+    vpmaddwd    ymm4, ymm4, [rel PW_F0114_F0250]  ; ymm4=BOH*FIX(0.114)+GOH*FIX(0.250)
+    vpmaddwd    ymm7, ymm7, [rel PW_MF008_MF041]  ; ymm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
+    vpmaddwd    ymm5, ymm5, [rel PW_MF008_MF041]  ; ymm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
+
+    vmovdqa     ymm3, [rel PD_ONEHALF]            ; ymm3=[PD_ONEHALF]
+
+    vpaddd      ymm0, ymm0, YMMWORD [wk(4)]
+    vpaddd      ymm4, ymm4, YMMWORD [wk(5)]
+    vpaddd      ymm0, ymm0, ymm3
+    vpaddd      ymm4, ymm4, ymm3
+    vpsrld      ymm0, ymm0, SCALEBITS   ; ymm0=YOL
+    vpsrld      ymm4, ymm4, SCALEBITS   ; ymm4=YOH
+    vpackssdw   ymm0, ymm0, ymm4        ; ymm0=YO
+
+    vpxor       ymm3, ymm3, ymm3
+    vpxor       ymm4, ymm4, ymm4
+    vpunpcklwd  ymm3, ymm3, ymm1        ; ymm3=ROL
+    vpunpckhwd  ymm4, ymm4, ymm1        ; ymm4=ROH
+    vpsrld      ymm3, ymm3, 1           ; ymm3=ROL*FIX(0.500)
+    vpsrld      ymm4, ymm4, 1           ; ymm4=ROH*FIX(0.500)
+
+    vmovdqa     ymm1, [rel PD_ONEHALFM1_CJ]  ; ymm1=[PD_ONEHALFM1_CJ]
+
+    vpaddd      ymm7, ymm7, ymm3
+    vpaddd      ymm5, ymm5, ymm4
+    vpaddd      ymm7, ymm7, ymm1
+    vpaddd      ymm5, ymm5, ymm1
+    vpsrld      ymm7, ymm7, SCALEBITS   ; ymm7=CrOL
+    vpsrld      ymm5, ymm5, SCALEBITS   ; ymm5=CrOH
+    vpackssdw   ymm7, ymm7, ymm5        ; ymm7=CrO
+
+    vmovdqa     ymm3, YMMWORD [wk(0)]   ; ymm3=RE
+
+    vmovdqa     ymm4, ymm6
+    vpunpcklwd  ymm6, ymm6, ymm2
+    vpunpckhwd  ymm4, ymm4, ymm2
+    vmovdqa     ymm1, ymm6
+    vmovdqa     ymm5, ymm4
+    vpmaddwd    ymm6, ymm6, [rel PW_F0114_F0250]  ; ymm6=BEL*FIX(0.114)+GEL*FIX(0.250)
+    vpmaddwd    ymm4, ymm4, [rel PW_F0114_F0250]  ; ymm4=BEH*FIX(0.114)+GEH*FIX(0.250)
+    vpmaddwd    ymm1, ymm1, [rel PW_MF008_MF041]  ; ymm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
+    vpmaddwd    ymm5, ymm5, [rel PW_MF008_MF041]  ; ymm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
+
+    vmovdqa     ymm2, [rel PD_ONEHALF]            ; ymm2=[PD_ONEHALF]
+
+    vpaddd      ymm6, ymm6, YMMWORD [wk(6)]
+    vpaddd      ymm4, ymm4, YMMWORD [wk(7)]
+    vpaddd      ymm6, ymm6, ymm2
+    vpaddd      ymm4, ymm4, ymm2
+    vpsrld      ymm6, ymm6, SCALEBITS   ; ymm6=YEL
+    vpsrld      ymm4, ymm4, SCALEBITS   ; ymm4=YEH
+    vpackssdw   ymm6, ymm6, ymm4        ; ymm6=YE
+
+    vpsllw      ymm0, ymm0, BYTE_BIT
+    vpor        ymm6, ymm6, ymm0        ; ymm6=Y
+    vmovdqu     YMMWORD [rdi], ymm6     ; Save Y
+
+    vpxor       ymm2, ymm2, ymm2
+    vpxor       ymm4, ymm4, ymm4
+    vpunpcklwd  ymm2, ymm2, ymm3        ; ymm2=REL
+    vpunpckhwd  ymm4, ymm4, ymm3        ; ymm4=REH
+    vpsrld      ymm2, ymm2, 1           ; ymm2=REL*FIX(0.500)
+    vpsrld      ymm4, ymm4, 1           ; ymm4=REH*FIX(0.500)
+
+    vmovdqa     ymm0, [rel PD_ONEHALFM1_CJ]  ; ymm0=[PD_ONEHALFM1_CJ]
+
+    vpaddd      ymm1, ymm1, ymm2
+    vpaddd      ymm5, ymm5, ymm4
+    vpaddd      ymm1, ymm1, ymm0
+    vpaddd      ymm5, ymm5, ymm0
+    vpsrld      ymm1, ymm1, SCALEBITS   ; ymm1=CrEL
+    vpsrld      ymm5, ymm5, SCALEBITS   ; ymm5=CrEH
+    vpackssdw   ymm1, ymm1, ymm5        ; ymm1=CrE
+
+    vpsllw      ymm7, ymm7, BYTE_BIT
+    vpor        ymm1, ymm1, ymm7        ; ymm1=Cr
+    vmovdqu     YMMWORD [rdx], ymm1     ; Save Cr
+
+    sub         rcx, byte SIZEOF_YMMWORD
+    add         rsi, RGB_PIXELSIZE*SIZEOF_YMMWORD  ; inptr
+    add         rdi, byte SIZEOF_YMMWORD           ; outptr0
+    add         rbx, byte SIZEOF_YMMWORD           ; outptr1
+    add         rdx, byte SIZEOF_YMMWORD           ; outptr2
+    cmp         rcx, byte SIZEOF_YMMWORD
+    jae         near .columnloop
+    test        rcx, rcx
+    jnz         near .column_ld1
+
+    pop         rcx                     ; col
+    pop         rsi
+    pop         rdi
+    pop         rbx
+    pop         rdx
+
+    add         rsi, byte SIZEOF_JSAMPROW  ; input_buf
+    add         rdi, byte SIZEOF_JSAMPROW
+    add         rbx, byte SIZEOF_JSAMPROW
+    add         rdx, byte SIZEOF_JSAMPROW
+    dec         rax                        ; num_rows
+    jg          near .rowloop
+
+.return:
+    pop         rbx
+    vzeroupper
+    uncollect_args 5
+    mov         rsp, rbp                ; rsp <- aligned rbp
+    pop         rsp                     ; rsp <- original rbp
+    pop         rbp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/x86_64/jccolext-sse2.asm b/media/libjpeg/simd/x86_64/jccolext-sse2.asm
new file mode 100644
index 0000000000..af70ed6010
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jccolext-sse2.asm
@@ -0,0 +1,484 @@
+;
+; jccolext.asm - colorspace conversion (64-bit SSE2)
+;
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_rgb_ycc_convert_sse2(JDIMENSION img_width, JSAMPARRAY input_buf,
+;                            JSAMPIMAGE output_buf, JDIMENSION output_row,
+;                            int num_rows);
+;
+
+; r10d = JDIMENSION img_width
+; r11 = JSAMPARRAY input_buf
+; r12 = JSAMPIMAGE output_buf
+; r13d = JDIMENSION output_row
+; r14d = int num_rows
+
+%define wk(i)   rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD  ; xmmword wk[WK_NUM]
+%define WK_NUM  8
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_rgb_ycc_convert_sse2)
+
+EXTN(jsimd_rgb_ycc_convert_sse2):
+    push        rbp
+    mov         rax, rsp                     ; rax = original rbp
+    sub         rsp, byte 4
+    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [rsp], rax
+    mov         rbp, rsp                     ; rbp = aligned rbp
+    lea         rsp, [wk(0)]
+    collect_args 5
+    push        rbx
+
+    mov         ecx, r10d
+    test        rcx, rcx
+    jz          near .return
+
+    push        rcx
+
+    mov         rsi, r12
+    mov         ecx, r13d
+    mov         rdip, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
+    mov         rbxp, JSAMPARRAY [rsi+1*SIZEOF_JSAMPARRAY]
+    mov         rdxp, JSAMPARRAY [rsi+2*SIZEOF_JSAMPARRAY]
+    lea         rdi, [rdi+rcx*SIZEOF_JSAMPROW]
+    lea         rbx, [rbx+rcx*SIZEOF_JSAMPROW]
+    lea         rdx, [rdx+rcx*SIZEOF_JSAMPROW]
+
+    pop         rcx
+
+    mov         rsi, r11
+    mov         eax, r14d
+    test        rax, rax
+    jle         near .return
+.rowloop:
+    push        rdx
+    push        rbx
+    push        rdi
+    push        rsi
+    push        rcx                     ; col
+
+    mov         rsip, JSAMPROW [rsi]    ; inptr
+    mov         rdip, JSAMPROW [rdi]    ; outptr0
+    mov         rbxp, JSAMPROW [rbx]    ; outptr1
+    mov         rdxp, JSAMPROW [rdx]    ; outptr2
+
+    cmp         rcx, byte SIZEOF_XMMWORD
+    jae         near .columnloop
+
+%if RGB_PIXELSIZE == 3  ; ---------------
+
+.column_ld1:
+    push        rax
+    push        rdx
+    lea         rcx, [rcx+rcx*2]        ; imul ecx,RGB_PIXELSIZE
+    test        cl, SIZEOF_BYTE
+    jz          short .column_ld2
+    sub         rcx, byte SIZEOF_BYTE
+    movzx       rax, byte [rsi+rcx]
+.column_ld2:
+    test        cl, SIZEOF_WORD
+    jz          short .column_ld4
+    sub         rcx, byte SIZEOF_WORD
+    movzx       rdx, word [rsi+rcx]
+    shl         rax, WORD_BIT
+    or          rax, rdx
+.column_ld4:
+    movd        xmmA, eax
+    pop         rdx
+    pop         rax
+    test        cl, SIZEOF_DWORD
+    jz          short .column_ld8
+    sub         rcx, byte SIZEOF_DWORD
+    movd        xmmF, XMM_DWORD [rsi+rcx]
+    pslldq      xmmA, SIZEOF_DWORD
+    por         xmmA, xmmF
+.column_ld8:
+    test        cl, SIZEOF_MMWORD
+    jz          short .column_ld16
+    sub         rcx, byte SIZEOF_MMWORD
+    movq        xmmB, XMM_MMWORD [rsi+rcx]
+    pslldq      xmmA, SIZEOF_MMWORD
+    por         xmmA, xmmB
+.column_ld16:
+    test        cl, SIZEOF_XMMWORD
+    jz          short .column_ld32
+    movdqa      xmmF, xmmA
+    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+    mov         rcx, SIZEOF_XMMWORD
+    jmp         short .rgb_ycc_cnv
+.column_ld32:
+    test        cl, 2*SIZEOF_XMMWORD
+    mov         rcx, SIZEOF_XMMWORD
+    jz          short .rgb_ycc_cnv
+    movdqa      xmmB, xmmA
+    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+    movdqu      xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+    jmp         short .rgb_ycc_cnv
+
+.columnloop:
+    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+    movdqu      xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+    movdqu      xmmB, XMMWORD [rsi+2*SIZEOF_XMMWORD]
+
+.rgb_ycc_cnv:
+    ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
+    ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+    ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
+
+    movdqa      xmmG, xmmA
+    pslldq      xmmA, 8     ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
+    psrldq      xmmG, 8     ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
+
+    punpckhbw   xmmA, xmmF  ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
+    pslldq      xmmF, 8     ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
+
+    punpcklbw   xmmG, xmmB  ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
+    punpckhbw   xmmF, xmmB  ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
+
+    movdqa      xmmD, xmmA
+    pslldq      xmmA, 8     ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
+    psrldq      xmmD, 8     ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
+
+    punpckhbw   xmmA, xmmG  ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
+    pslldq      xmmG, 8     ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
+
+    punpcklbw   xmmD, xmmF  ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
+    punpckhbw   xmmG, xmmF  ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
+
+    movdqa      xmmE, xmmA
+    pslldq      xmmA, 8     ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
+    psrldq      xmmE, 8     ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
+
+    punpckhbw   xmmA, xmmD  ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
+    pslldq      xmmD, 8     ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
+
+    punpcklbw   xmmE, xmmG  ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
+    punpckhbw   xmmD, xmmG  ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
+
+    pxor        xmmH, xmmH
+
+    movdqa      xmmC, xmmA
+    punpcklbw   xmmA, xmmH  ; xmmA=(00 02 04 06 08 0A 0C 0E)
+    punpckhbw   xmmC, xmmH  ; xmmC=(10 12 14 16 18 1A 1C 1E)
+
+    movdqa      xmmB, xmmE
+    punpcklbw   xmmE, xmmH  ; xmmE=(20 22 24 26 28 2A 2C 2E)
+    punpckhbw   xmmB, xmmH  ; xmmB=(01 03 05 07 09 0B 0D 0F)
+
+    movdqa      xmmF, xmmD
+    punpcklbw   xmmD, xmmH  ; xmmD=(11 13 15 17 19 1B 1D 1F)
+    punpckhbw   xmmF, xmmH  ; xmmF=(21 23 25 27 29 2B 2D 2F)
+
+%else  ; RGB_PIXELSIZE == 4 ; -----------
+
+.column_ld1:
+    test        cl, SIZEOF_XMMWORD/16
+    jz          short .column_ld2
+    sub         rcx, byte SIZEOF_XMMWORD/16
+    movd        xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE]
+.column_ld2:
+    test        cl, SIZEOF_XMMWORD/8
+    jz          short .column_ld4
+    sub         rcx, byte SIZEOF_XMMWORD/8
+    movq        xmmE, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
+    pslldq      xmmA, SIZEOF_MMWORD
+    por         xmmA, xmmE
+.column_ld4:
+    test        cl, SIZEOF_XMMWORD/4
+    jz          short .column_ld8
+    sub         rcx, byte SIZEOF_XMMWORD/4
+    movdqa      xmmE, xmmA
+    movdqu      xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
+.column_ld8:
+    test        cl, SIZEOF_XMMWORD/2
+    mov         rcx, SIZEOF_XMMWORD
+    jz          short .rgb_ycc_cnv
+    movdqa      xmmF, xmmA
+    movdqa      xmmH, xmmE
+    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+    movdqu      xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+    jmp         short .rgb_ycc_cnv
+
+.columnloop:
+    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+    movdqu      xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+    movdqu      xmmF, XMMWORD [rsi+2*SIZEOF_XMMWORD]
+    movdqu      xmmH, XMMWORD [rsi+3*SIZEOF_XMMWORD]
+
+.rgb_ycc_cnv:
+    ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
+    ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+    ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
+    ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+
+    movdqa      xmmD, xmmA
+    punpcklbw   xmmA, xmmE      ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
+    punpckhbw   xmmD, xmmE      ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
+
+    movdqa      xmmC, xmmF
+    punpcklbw   xmmF, xmmH      ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
+    punpckhbw   xmmC, xmmH      ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
+
+    movdqa      xmmB, xmmA
+    punpcklwd   xmmA, xmmF      ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
+    punpckhwd   xmmB, xmmF      ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
+
+    movdqa      xmmG, xmmD
+    punpcklwd   xmmD, xmmC      ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
+    punpckhwd   xmmG, xmmC      ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
+
+    movdqa      xmmE, xmmA
+    punpcklbw   xmmA, xmmD      ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
+    punpckhbw   xmmE, xmmD      ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
+
+    movdqa      xmmH, xmmB
+    punpcklbw   xmmB, xmmG      ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
+    punpckhbw   xmmH, xmmG      ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
+
+    pxor        xmmF, xmmF
+
+    movdqa      xmmC, xmmA
+    punpcklbw   xmmA, xmmF      ; xmmA=(00 02 04 06 08 0A 0C 0E)
+    punpckhbw   xmmC, xmmF      ; xmmC=(10 12 14 16 18 1A 1C 1E)
+
+    movdqa      xmmD, xmmB
+    punpcklbw   xmmB, xmmF      ; xmmB=(01 03 05 07 09 0B 0D 0F)
+    punpckhbw   xmmD, xmmF      ; xmmD=(11 13 15 17 19 1B 1D 1F)
+
+    movdqa      xmmG, xmmE
+    punpcklbw   xmmE, xmmF      ; xmmE=(20 22 24 26 28 2A 2C 2E)
+    punpckhbw   xmmG, xmmF      ; xmmG=(30 32 34 36 38 3A 3C 3E)
+
+    punpcklbw   xmmF, xmmH
+    punpckhbw   xmmH, xmmH
+    psrlw       xmmF, BYTE_BIT  ; xmmF=(21 23 25 27 29 2B 2D 2F)
+    psrlw       xmmH, BYTE_BIT  ; xmmH=(31 33 35 37 39 3B 3D 3F)
+
+%endif  ; RGB_PIXELSIZE ; ---------------
+
+    ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
+    ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
+
+    ; (Original)
+    ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
+    ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+    ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+    ;
+    ; (This implementation)
+    ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+    ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+    ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+
+    movdqa      XMMWORD [wk(0)], xmm0   ; wk(0)=RE
+    movdqa      XMMWORD [wk(1)], xmm1   ; wk(1)=RO
+    movdqa      XMMWORD [wk(2)], xmm4   ; wk(2)=BE
+    movdqa      XMMWORD [wk(3)], xmm5   ; wk(3)=BO
+
+    movdqa      xmm6, xmm1
+    punpcklwd   xmm1, xmm3
+    punpckhwd   xmm6, xmm3
+    movdqa      xmm7, xmm1
+    movdqa      xmm4, xmm6
+    pmaddwd     xmm1, [rel PW_F0299_F0337]  ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
+    pmaddwd     xmm6, [rel PW_F0299_F0337]  ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
+    pmaddwd     xmm7, [rel PW_MF016_MF033]  ; xmm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
+    pmaddwd     xmm4, [rel PW_MF016_MF033]  ; xmm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
+
+    movdqa      XMMWORD [wk(4)], xmm1   ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
+    movdqa      XMMWORD [wk(5)], xmm6   ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+    pxor        xmm1, xmm1
+    pxor        xmm6, xmm6
+    punpcklwd   xmm1, xmm5              ; xmm1=BOL
+    punpckhwd   xmm6, xmm5              ; xmm6=BOH
+    psrld       xmm1, 1                 ; xmm1=BOL*FIX(0.500)
+    psrld       xmm6, 1                 ; xmm6=BOH*FIX(0.500)
+
+    movdqa      xmm5, [rel PD_ONEHALFM1_CJ]  ; xmm5=[PD_ONEHALFM1_CJ]
+
+    paddd       xmm7, xmm1
+    paddd       xmm4, xmm6
+    paddd       xmm7, xmm5
+    paddd       xmm4, xmm5
+    psrld       xmm7, SCALEBITS         ; xmm7=CbOL
+    psrld       xmm4, SCALEBITS         ; xmm4=CbOH
+    packssdw    xmm7, xmm4              ; xmm7=CbO
+
+    movdqa      xmm1, XMMWORD [wk(2)]   ; xmm1=BE
+
+    movdqa      xmm6, xmm0
+    punpcklwd   xmm0, xmm2
+    punpckhwd   xmm6, xmm2
+    movdqa      xmm5, xmm0
+    movdqa      xmm4, xmm6
+    pmaddwd     xmm0, [rel PW_F0299_F0337]  ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
+    pmaddwd     xmm6, [rel PW_F0299_F0337]  ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
+    pmaddwd     xmm5, [rel PW_MF016_MF033]  ; xmm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
+    pmaddwd     xmm4, [rel PW_MF016_MF033]  ; xmm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
+
+    movdqa      XMMWORD [wk(6)], xmm0   ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
+    movdqa      XMMWORD [wk(7)], xmm6   ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
+
+    pxor        xmm0, xmm0
+    pxor        xmm6, xmm6
+    punpcklwd   xmm0, xmm1              ; xmm0=BEL
+    punpckhwd   xmm6, xmm1              ; xmm6=BEH
+    psrld       xmm0, 1                 ; xmm0=BEL*FIX(0.500)
+    psrld       xmm6, 1                 ; xmm6=BEH*FIX(0.500)
+
+    movdqa      xmm1, [rel PD_ONEHALFM1_CJ]  ; xmm1=[PD_ONEHALFM1_CJ]
+
+    paddd       xmm5, xmm0
+    paddd       xmm4, xmm6
+    paddd       xmm5, xmm1
+    paddd       xmm4, xmm1
+    psrld       xmm5, SCALEBITS         ; xmm5=CbEL
+    psrld       xmm4, SCALEBITS         ; xmm4=CbEH
+    packssdw    xmm5, xmm4              ; xmm5=CbE
+
+    psllw       xmm7, BYTE_BIT
+    por         xmm5, xmm7              ; xmm5=Cb
+    movdqa      XMMWORD [rbx], xmm5     ; Save Cb
+
+    movdqa      xmm0, XMMWORD [wk(3)]   ; xmm0=BO
+    movdqa      xmm6, XMMWORD [wk(2)]   ; xmm6=BE
+    movdqa      xmm1, XMMWORD [wk(1)]   ; xmm1=RO
+
+    movdqa      xmm4, xmm0
+    punpcklwd   xmm0, xmm3
+    punpckhwd   xmm4, xmm3
+    movdqa      xmm7, xmm0
+    movdqa      xmm5, xmm4
+    pmaddwd     xmm0, [rel PW_F0114_F0250]  ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
+    pmaddwd     xmm4, [rel PW_F0114_F0250]  ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
+    pmaddwd     xmm7, [rel PW_MF008_MF041]  ; xmm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
+    pmaddwd     xmm5, [rel PW_MF008_MF041]  ; xmm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
+
+    movdqa      xmm3, [rel PD_ONEHALF]      ; xmm3=[PD_ONEHALF]
+
+    paddd       xmm0, XMMWORD [wk(4)]
+    paddd       xmm4, XMMWORD [wk(5)]
+    paddd       xmm0, xmm3
+    paddd       xmm4, xmm3
+    psrld       xmm0, SCALEBITS         ; xmm0=YOL
+    psrld       xmm4, SCALEBITS         ; xmm4=YOH
+    packssdw    xmm0, xmm4              ; xmm0=YO
+
+    pxor        xmm3, xmm3
+    pxor        xmm4, xmm4
+    punpcklwd   xmm3, xmm1              ; xmm3=ROL
+    punpckhwd   xmm4, xmm1              ; xmm4=ROH
+    psrld       xmm3, 1                 ; xmm3=ROL*FIX(0.500)
+    psrld       xmm4, 1                 ; xmm4=ROH*FIX(0.500)
+
+    movdqa      xmm1, [rel PD_ONEHALFM1_CJ]  ; xmm1=[PD_ONEHALFM1_CJ]
+
+    paddd       xmm7, xmm3
+    paddd       xmm5, xmm4
+    paddd       xmm7, xmm1
+    paddd       xmm5, xmm1
+    psrld       xmm7, SCALEBITS         ; xmm7=CrOL
+    psrld       xmm5, SCALEBITS         ; xmm5=CrOH
+    packssdw    xmm7, xmm5              ; xmm7=CrO
+
+    movdqa      xmm3, XMMWORD [wk(0)]   ; xmm3=RE
+
+    movdqa      xmm4, xmm6
+    punpcklwd   xmm6, xmm2
+    punpckhwd   xmm4, xmm2
+    movdqa      xmm1, xmm6
+    movdqa      xmm5, xmm4
+    pmaddwd     xmm6, [rel PW_F0114_F0250]  ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
+    pmaddwd     xmm4, [rel PW_F0114_F0250]  ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
+    pmaddwd     xmm1, [rel PW_MF008_MF041]  ; xmm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
+    pmaddwd     xmm5, [rel PW_MF008_MF041]  ; xmm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
+
+    movdqa      xmm2, [rel PD_ONEHALF]      ; xmm2=[PD_ONEHALF]
+
+    paddd       xmm6, XMMWORD [wk(6)]
+    paddd       xmm4, XMMWORD [wk(7)]
+    paddd       xmm6, xmm2
+    paddd       xmm4, xmm2
+    psrld       xmm6, SCALEBITS         ; xmm6=YEL
+    psrld       xmm4, SCALEBITS         ; xmm4=YEH
+    packssdw    xmm6, xmm4              ; xmm6=YE
+
+    psllw       xmm0, BYTE_BIT
+    por         xmm6, xmm0              ; xmm6=Y
+    movdqa      XMMWORD [rdi], xmm6     ; Save Y
+
+    pxor        xmm2, xmm2
+    pxor        xmm4, xmm4
+    punpcklwd   xmm2, xmm3              ; xmm2=REL
+    punpckhwd   xmm4, xmm3              ; xmm4=REH
+    psrld       xmm2, 1                 ; xmm2=REL*FIX(0.500)
+    psrld       xmm4, 1                 ; xmm4=REH*FIX(0.500)
+
+    movdqa      xmm0, [rel PD_ONEHALFM1_CJ]  ; xmm0=[PD_ONEHALFM1_CJ]
+
+    paddd       xmm1, xmm2
+    paddd       xmm5, xmm4
+    paddd       xmm1, xmm0
+    paddd       xmm5, xmm0
+    psrld       xmm1, SCALEBITS         ; xmm1=CrEL
+    psrld       xmm5, SCALEBITS         ; xmm5=CrEH
+    packssdw    xmm1, xmm5              ; xmm1=CrE
+
+    psllw       xmm7, BYTE_BIT
+    por         xmm1, xmm7              ; xmm1=Cr
+    movdqa      XMMWORD [rdx], xmm1     ; Save Cr
+
+    sub         rcx, byte SIZEOF_XMMWORD
+    add         rsi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; inptr
+    add         rdi, byte SIZEOF_XMMWORD                ; outptr0
+    add         rbx, byte SIZEOF_XMMWORD                ; outptr1
+    add         rdx, byte SIZEOF_XMMWORD                ; outptr2
+    cmp         rcx, byte SIZEOF_XMMWORD
+    jae         near .columnloop
+    test        rcx, rcx
+    jnz         near .column_ld1
+
+    pop         rcx                     ; col
+    pop         rsi
+    pop         rdi
+    pop         rbx
+    pop         rdx
+
+    add         rsi, byte SIZEOF_JSAMPROW  ; input_buf
+    add         rdi, byte SIZEOF_JSAMPROW
+    add         rbx, byte SIZEOF_JSAMPROW
+    add         rdx, byte SIZEOF_JSAMPROW
+    dec         rax                        ; num_rows
+    jg          near .rowloop
+
+.return:
+    pop         rbx
+    uncollect_args 5
+    mov         rsp, rbp                ; rsp <- aligned rbp
+    pop         rsp                     ; rsp <- original rbp
+    pop         rbp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/x86_64/jccolor-avx2.asm b/media/libjpeg/simd/x86_64/jccolor-avx2.asm
new file mode 100644
index 0000000000..16b78298dc
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jccolor-avx2.asm
@@ -0,0 +1,121 @@
+;
+; jccolor.asm - colorspace conversion (64-bit AVX2)
+;
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS  16
+
+F_0_081 equ  5329                ; FIX(0.08131)
+F_0_114 equ  7471                ; FIX(0.11400)
+F_0_168 equ 11059                ; FIX(0.16874)
+F_0_250 equ 16384                ; FIX(0.25000)
+F_0_299 equ 19595                ; FIX(0.29900)
+F_0_331 equ 21709                ; FIX(0.33126)
+F_0_418 equ 27439                ; FIX(0.41869)
+F_0_587 equ 38470                ; FIX(0.58700)
+F_0_337 equ (F_0_587 - F_0_250)  ; FIX(0.58700) - FIX(0.25000)
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_rgb_ycc_convert_avx2)
+
+EXTN(jconst_rgb_ycc_convert_avx2):
+
+PW_F0299_F0337  times 8 dw  F_0_299,  F_0_337
+PW_F0114_F0250  times 8 dw  F_0_114,  F_0_250
+PW_MF016_MF033  times 8 dw -F_0_168, -F_0_331
+PW_MF008_MF041  times 8 dw -F_0_081, -F_0_418
+PD_ONEHALFM1_CJ times 8 dd  (1 << (SCALEBITS - 1)) - 1 + \
+                            (CENTERJSAMPLE << SCALEBITS)
+PD_ONEHALF      times 8 dd  (1 << (SCALEBITS - 1))
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        64
+
+%include "jccolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGB_RED
+%define RGB_GREEN  EXT_RGB_GREEN
+%define RGB_BLUE  EXT_RGB_BLUE
+%define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+%define jsimd_rgb_ycc_convert_avx2  jsimd_extrgb_ycc_convert_avx2
+%include "jccolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGBX_RED
+%define RGB_GREEN  EXT_RGBX_GREEN
+%define RGB_BLUE  EXT_RGBX_BLUE
+%define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+%define jsimd_rgb_ycc_convert_avx2  jsimd_extrgbx_ycc_convert_avx2
+%include "jccolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGR_RED
+%define RGB_GREEN  EXT_BGR_GREEN
+%define RGB_BLUE  EXT_BGR_BLUE
+%define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+%define jsimd_rgb_ycc_convert_avx2  jsimd_extbgr_ycc_convert_avx2
+%include "jccolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGRX_RED
+%define RGB_GREEN  EXT_BGRX_GREEN
+%define RGB_BLUE  EXT_BGRX_BLUE
+%define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+%define jsimd_rgb_ycc_convert_avx2  jsimd_extbgrx_ycc_convert_avx2
+%include "jccolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XBGR_RED
+%define RGB_GREEN  EXT_XBGR_GREEN
+%define RGB_BLUE  EXT_XBGR_BLUE
+%define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+%define jsimd_rgb_ycc_convert_avx2  jsimd_extxbgr_ycc_convert_avx2
+%include "jccolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XRGB_RED
+%define RGB_GREEN  EXT_XRGB_GREEN
+%define RGB_BLUE  EXT_XRGB_BLUE
+%define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+%define jsimd_rgb_ycc_convert_avx2  jsimd_extxrgb_ycc_convert_avx2
+%include "jccolext-avx2.asm"
diff --git a/media/libjpeg/simd/x86_64/jccolor-sse2.asm b/media/libjpeg/simd/x86_64/jccolor-sse2.asm
new file mode 100644
index 0000000000..e2955c2134
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jccolor-sse2.asm
@@ -0,0 +1,120 @@
+;
+; jccolor.asm - colorspace conversion (64-bit SSE2)
+;
+; Copyright (C) 2009, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS  16
+
+F_0_081 equ  5329                ; FIX(0.08131)
+F_0_114 equ  7471                ; FIX(0.11400)
+F_0_168 equ 11059                ; FIX(0.16874)
+F_0_250 equ 16384                ; FIX(0.25000)
+F_0_299 equ 19595                ; FIX(0.29900)
+F_0_331 equ 21709                ; FIX(0.33126)
+F_0_418 equ 27439                ; FIX(0.41869)
+F_0_587 equ 38470                ; FIX(0.58700)
+F_0_337 equ (F_0_587 - F_0_250)  ; FIX(0.58700) - FIX(0.25000)
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_rgb_ycc_convert_sse2)
+
+EXTN(jconst_rgb_ycc_convert_sse2):
+
+PW_F0299_F0337  times 4 dw  F_0_299,  F_0_337
+PW_F0114_F0250  times 4 dw  F_0_114,  F_0_250
+PW_MF016_MF033  times 4 dw -F_0_168, -F_0_331
+PW_MF008_MF041  times 4 dw -F_0_081, -F_0_418
+PD_ONEHALFM1_CJ times 4 dd  (1 << (SCALEBITS - 1)) - 1 + \
+                            (CENTERJSAMPLE << SCALEBITS)
+PD_ONEHALF      times 4 dd  (1 << (SCALEBITS - 1))
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        64
+
+%include "jccolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGB_RED
+%define RGB_GREEN  EXT_RGB_GREEN
+%define RGB_BLUE  EXT_RGB_BLUE
+%define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+%define jsimd_rgb_ycc_convert_sse2  jsimd_extrgb_ycc_convert_sse2
+%include "jccolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGBX_RED
+%define RGB_GREEN  EXT_RGBX_GREEN
+%define RGB_BLUE  EXT_RGBX_BLUE
+%define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+%define jsimd_rgb_ycc_convert_sse2  jsimd_extrgbx_ycc_convert_sse2
+%include "jccolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGR_RED
+%define RGB_GREEN  EXT_BGR_GREEN
+%define RGB_BLUE  EXT_BGR_BLUE
+%define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+%define jsimd_rgb_ycc_convert_sse2  jsimd_extbgr_ycc_convert_sse2
+%include "jccolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGRX_RED
+%define RGB_GREEN  EXT_BGRX_GREEN
+%define RGB_BLUE  EXT_BGRX_BLUE
+%define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+%define jsimd_rgb_ycc_convert_sse2  jsimd_extbgrx_ycc_convert_sse2
+%include "jccolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XBGR_RED
+%define RGB_GREEN  EXT_XBGR_GREEN
+%define RGB_BLUE  EXT_XBGR_BLUE
+%define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+%define jsimd_rgb_ycc_convert_sse2  jsimd_extxbgr_ycc_convert_sse2
+%include "jccolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XRGB_RED
+%define RGB_GREEN  EXT_XRGB_GREEN
+%define RGB_BLUE  EXT_XRGB_BLUE
+%define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+%define jsimd_rgb_ycc_convert_sse2  jsimd_extxrgb_ycc_convert_sse2
+%include "jccolext-sse2.asm"
diff --git a/media/libjpeg/simd/x86_64/jcgray-avx2.asm b/media/libjpeg/simd/x86_64/jcgray-avx2.asm
new file mode 100644
index 0000000000..591255bb11
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jcgray-avx2.asm
@@ -0,0 +1,113 @@
+;
+; jcgray.asm - grayscale colorspace conversion (64-bit AVX2)
+;
+; Copyright (C) 2011, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS  16
+
+F_0_114 equ  7471                ; FIX(0.11400)
+F_0_250 equ 16384                ; FIX(0.25000)
+F_0_299 equ 19595                ; FIX(0.29900)
+F_0_587 equ 38470                ; FIX(0.58700)
+F_0_337 equ (F_0_587 - F_0_250)  ; FIX(0.58700) - FIX(0.25000)
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_rgb_gray_convert_avx2)
+
+EXTN(jconst_rgb_gray_convert_avx2):
+
+PW_F0299_F0337 times 8 dw F_0_299, F_0_337
+PW_F0114_F0250 times 8 dw F_0_114, F_0_250
+PD_ONEHALF     times 8 dd (1 << (SCALEBITS - 1))
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        64
+
+%include "jcgryext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGB_RED
+%define RGB_GREEN  EXT_RGB_GREEN
+%define RGB_BLUE  EXT_RGB_BLUE
+%define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+%define jsimd_rgb_gray_convert_avx2  jsimd_extrgb_gray_convert_avx2
+%include "jcgryext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGBX_RED
+%define RGB_GREEN  EXT_RGBX_GREEN
+%define RGB_BLUE  EXT_RGBX_BLUE
+%define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+%define jsimd_rgb_gray_convert_avx2  jsimd_extrgbx_gray_convert_avx2
+%include "jcgryext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGR_RED
+%define RGB_GREEN  EXT_BGR_GREEN
+%define RGB_BLUE  EXT_BGR_BLUE
+%define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+%define jsimd_rgb_gray_convert_avx2  jsimd_extbgr_gray_convert_avx2
+%include "jcgryext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGRX_RED
+%define RGB_GREEN  EXT_BGRX_GREEN
+%define RGB_BLUE  EXT_BGRX_BLUE
+%define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+%define jsimd_rgb_gray_convert_avx2  jsimd_extbgrx_gray_convert_avx2
+%include "jcgryext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XBGR_RED
+%define RGB_GREEN  EXT_XBGR_GREEN
+%define RGB_BLUE  EXT_XBGR_BLUE
+%define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+%define jsimd_rgb_gray_convert_avx2  jsimd_extxbgr_gray_convert_avx2
+%include "jcgryext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XRGB_RED
+%define RGB_GREEN  EXT_XRGB_GREEN
+%define RGB_BLUE  EXT_XRGB_BLUE
+%define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+%define jsimd_rgb_gray_convert_avx2  jsimd_extxrgb_gray_convert_avx2
+%include "jcgryext-avx2.asm"
diff --git a/media/libjpeg/simd/x86_64/jcgray-sse2.asm b/media/libjpeg/simd/x86_64/jcgray-sse2.asm
new file mode 100644
index 0000000000..e389904f2f
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jcgray-sse2.asm
@@ -0,0 +1,112 @@
+;
+; jcgray.asm - grayscale colorspace conversion (64-bit SSE2)
+;
+; Copyright (C) 2011, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS  16
+
+F_0_114 equ  7471                ; FIX(0.11400)
+F_0_250 equ 16384                ; FIX(0.25000)
+F_0_299 equ 19595                ; FIX(0.29900)
+F_0_587 equ 38470                ; FIX(0.58700)
+F_0_337 equ (F_0_587 - F_0_250)  ; FIX(0.58700) - FIX(0.25000)
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_rgb_gray_convert_sse2)
+
+EXTN(jconst_rgb_gray_convert_sse2):
+
+PW_F0299_F0337 times 4 dw F_0_299, F_0_337
+PW_F0114_F0250 times 4 dw F_0_114, F_0_250
+PD_ONEHALF     times 4 dd (1 << (SCALEBITS - 1))
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        64
+
+%include "jcgryext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGB_RED
+%define RGB_GREEN  EXT_RGB_GREEN
+%define RGB_BLUE  EXT_RGB_BLUE
+%define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+%define jsimd_rgb_gray_convert_sse2  jsimd_extrgb_gray_convert_sse2
+%include "jcgryext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGBX_RED
+%define RGB_GREEN  EXT_RGBX_GREEN
+%define RGB_BLUE  EXT_RGBX_BLUE
+%define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+%define jsimd_rgb_gray_convert_sse2  jsimd_extrgbx_gray_convert_sse2
+%include "jcgryext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGR_RED
+%define RGB_GREEN  EXT_BGR_GREEN
+%define RGB_BLUE  EXT_BGR_BLUE
+%define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+%define jsimd_rgb_gray_convert_sse2  jsimd_extbgr_gray_convert_sse2
+%include "jcgryext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGRX_RED
+%define RGB_GREEN  EXT_BGRX_GREEN
+%define RGB_BLUE  EXT_BGRX_BLUE
+%define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+%define jsimd_rgb_gray_convert_sse2  jsimd_extbgrx_gray_convert_sse2
+%include "jcgryext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XBGR_RED
+%define RGB_GREEN  EXT_XBGR_GREEN
+%define RGB_BLUE  EXT_XBGR_BLUE
+%define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+%define jsimd_rgb_gray_convert_sse2  jsimd_extxbgr_gray_convert_sse2
+%include "jcgryext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XRGB_RED
+%define RGB_GREEN  EXT_XRGB_GREEN
+%define RGB_BLUE  EXT_XRGB_BLUE
+%define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+%define jsimd_rgb_gray_convert_sse2  jsimd_extxrgb_gray_convert_sse2
+%include "jcgryext-sse2.asm"
diff --git a/media/libjpeg/simd/x86_64/jcgryext-avx2.asm b/media/libjpeg/simd/x86_64/jcgryext-avx2.asm
new file mode 100644
index 0000000000..ddcc2c0a2f
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jcgryext-avx2.asm
@@ -0,0 +1,438 @@
+;
+; jcgryext.asm - grayscale colorspace conversion (64-bit AVX2)
+;
+; Copyright (C) 2011, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_rgb_gray_convert_avx2(JDIMENSION img_width, JSAMPARRAY input_buf,
+;                             JSAMPIMAGE output_buf, JDIMENSION output_row,
+;                             int num_rows);
+;
+
+; r10d = JDIMENSION img_width
+; r11 = JSAMPARRAY input_buf
+; r12 = JSAMPIMAGE output_buf
+; r13d = JDIMENSION output_row
+; r14d = int num_rows
+
+%define wk(i)   rbp - (WK_NUM - (i)) * SIZEOF_YMMWORD  ; ymmword wk[WK_NUM]
+%define WK_NUM  2
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_rgb_gray_convert_avx2)
+
+EXTN(jsimd_rgb_gray_convert_avx2):
+    push        rbp
+    mov         rax, rsp                     ; rax = original rbp
+    sub         rsp, byte 4
+    and         rsp, byte (-SIZEOF_YMMWORD)  ; align to 256 bits
+    mov         [rsp], rax
+    mov         rbp, rsp                     ; rbp = aligned rbp
+    lea         rsp, [wk(0)]
+    collect_args 5
+    push        rbx
+
+    mov         ecx, r10d
+    test        rcx, rcx
+    jz          near .return
+
+    push        rcx
+
+    mov         rsi, r12
+    mov         ecx, r13d
+    mov         rdip, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
+    lea         rdi, [rdi+rcx*SIZEOF_JSAMPROW]
+
+    pop         rcx
+
+    mov         rsi, r11
+    mov         eax, r14d
+    test        rax, rax
+    jle         near .return
+.rowloop:
+    push        rdi
+    push        rsi
+    push        rcx                     ; col
+
+    mov         rsip, JSAMPROW [rsi]    ; inptr
+    mov         rdip, JSAMPROW [rdi]    ; outptr0
+
+    cmp         rcx, byte SIZEOF_YMMWORD
+    jae         near .columnloop
+
+%if RGB_PIXELSIZE == 3  ; ---------------
+
+.column_ld1:
+    push        rax
+    push        rdx
+    lea         rcx, [rcx+rcx*2]        ; imul ecx,RGB_PIXELSIZE
+    test        cl, SIZEOF_BYTE
+    jz          short .column_ld2
+    sub         rcx, byte SIZEOF_BYTE
+    movzx       rax, byte [rsi+rcx]
+.column_ld2:
+    test        cl, SIZEOF_WORD
+    jz          short .column_ld4
+    sub         rcx, byte SIZEOF_WORD
+    movzx       rdx, word [rsi+rcx]
+    shl         rax, WORD_BIT
+    or          rax, rdx
+.column_ld4:
+    vmovd       xmmA, eax
+    pop         rdx
+    pop         rax
+    test        cl, SIZEOF_DWORD
+    jz          short .column_ld8
+    sub         rcx, byte SIZEOF_DWORD
+    vmovd       xmmF, XMM_DWORD [rsi+rcx]
+    vpslldq     xmmA, xmmA, SIZEOF_DWORD
+    vpor        xmmA, xmmA, xmmF
+.column_ld8:
+    test        cl, SIZEOF_MMWORD
+    jz          short .column_ld16
+    sub         rcx, byte SIZEOF_MMWORD
+    vmovq       xmmB, XMM_MMWORD [rsi+rcx]
+    vpslldq     xmmA, xmmA, SIZEOF_MMWORD
+    vpor        xmmA, xmmA, xmmB
+.column_ld16:
+    test        cl, SIZEOF_XMMWORD
+    jz          short .column_ld32
+    sub         rcx, byte SIZEOF_XMMWORD
+    vmovdqu     xmmB, XMM_MMWORD [rsi+rcx]
+    vperm2i128  ymmA, ymmA, ymmA, 1
+    vpor        ymmA, ymmB
+.column_ld32:
+    test        cl, SIZEOF_YMMWORD
+    jz          short .column_ld64
+    sub         rcx, byte SIZEOF_YMMWORD
+    vmovdqa     ymmF, ymmA
+    vmovdqu     ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+.column_ld64:
+    test        cl, 2*SIZEOF_YMMWORD
+    mov         rcx, SIZEOF_YMMWORD
+    jz          short .rgb_gray_cnv
+    vmovdqa     ymmB, ymmA
+    vmovdqu     ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+    vmovdqu     ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
+    jmp         short .rgb_gray_cnv
+
+.columnloop:
+    vmovdqu     ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+    vmovdqu     ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
+    vmovdqu     ymmB, YMMWORD [rsi+2*SIZEOF_YMMWORD]
+
+.rgb_gray_cnv:
+    ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+    ;       15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+    ; ymmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+    ;       0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+    ; ymmB=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
+    ;       2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+
+    vmovdqu     ymmC, ymmA
+    vinserti128 ymmA, ymmF, xmmA, 0  ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+                                     ;       0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+    vinserti128 ymmC, ymmC, xmmB, 0  ; ymmC=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
+                                     ;       15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+    vinserti128 ymmB, ymmB, xmmF, 0  ; ymmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+                                     ;       2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+    vperm2i128  ymmF, ymmC, ymmC, 1  ; ymmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A
+                                     ;       1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q)
+
+    vmovdqa     ymmG, ymmA
+    vpslldq     ymmA, ymmA, 8     ; ymmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12
+                                  ;       22 03 13 23 04 14 24 05 0G 1G 2G 0H 1H 2H 0I 1I)
+    vpsrldq     ymmG, ymmG, 8     ; ymmG=(22 03 13 23 04 14 24 05 0G 1G 2G 0H 1H 2H 0I 1I
+                                  ;       2I 0J 1J 2J 0K 1K 2K 0L -- -- -- -- -- -- -- --)
+
+    vpunpckhbw  ymmA, ymmA, ymmF  ; ymmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A
+                                  ;       0G 0O 1G 1O 2G 2O 0H 0P 1H 1P 2H 2P 0I 0Q 1I 1Q)
+    vpslldq     ymmF, ymmF, 8     ; ymmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27
+                                  ;       08 18 28 09 19 29 0A 1A 1L 2L 0M 1M 2M 0N 1N 2N)
+
+    vpunpcklbw  ymmG, ymmG, ymmB  ; ymmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D
+                                  ;       2I 2Q 0J 0R 1J 1R 2J 2R 0K 0S 1K 1S 2K 2S 0L 0T)
+    vpunpckhbw  ymmF, ymmF, ymmB  ; ymmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F
+                                  ;       1L 1T 2L 2T 0M 0U 1M 1U 2M 2U 0N 0V 1N 1V 2N 2V)
+
+    vmovdqa     ymmD, ymmA
+    vpslldq     ymmA, ymmA, 8     ; ymmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09
+                                  ;       11 19 21 29 02 0A 12 1A 0G 0O 1G 1O 2G 2O 0H 0P)
+    vpsrldq     ymmD, ymmD, 8     ; ymmD=(11 19 21 29 02 0A 12 1A 0G 0O 1G 1O 2G 2O 0H 0P
+                                  ;       1H 1P 2H 2P 0I 0Q 1I 1Q -- -- -- -- -- -- -- --)
+
+    vpunpckhbw  ymmA, ymmA, ymmG  ; ymmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D
+                                  ;       0G 0K 0O 0S 1G 1K 1O 1S 2G 2K 2O 2S 0H 0L 0P 0T)
+    vpslldq     ymmG, ymmG, 8     ; ymmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B
+                                  ;       04 0C 14 1C 24 2C 05 0D 2I 2Q 0J 0R 1J 1R 2J 2R)
+
+    vpunpcklbw  ymmD, ymmD, ymmF  ; ymmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E
+                                  ;       1H 1L 1P 1T 2H 2L 2P 2T 0I 0M 0Q 0U 1I 1M 1Q 1U)
+    vpunpckhbw  ymmG, ymmG, ymmF  ; ymmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F
+                                  ;       2I 2M 2Q 2U 0J 0N 0R 0V 1J 1N 1R 1V 2J 2N 2R 2V)
+
+    vmovdqa     ymmE, ymmA
+    vpslldq     ymmA, ymmA, 8     ; ymmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C
+                                  ;       20 24 28 2C 01 05 09 0D 0G 0K 0O 0S 1G 1K 1O 1S)
+    vpsrldq     ymmE, ymmE, 8     ; ymmE=(20 24 28 2C 01 05 09 0D 0G 0K 0O 0S 1G 1K 1O 1S
+                                  ;       2G 2K 2O 2S 0H 0L 0P 0T -- -- -- -- -- -- -- --)
+
+    vpunpckhbw  ymmA, ymmA, ymmD  ; ymmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E
+                                  ;       0G 0I 0K 0M 0O 0Q 0S 0U 1G 1I 1K 1M 1O 1Q 1S 1U)
+    vpslldq     ymmD, ymmD, 8     ; ymmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D
+                                  ;       02 06 0A 0E 12 16 1A 1E 1H 1L 1P 1T 2H 2L 2P 2T)
+
+    vpunpcklbw  ymmE, ymmE, ymmG  ; ymmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F
+                                  ;       2G 2I 2K 2M 2O 2Q 2S 2U 0H 0J 0L 0N 0P 0R 0T 0V)
+    vpunpckhbw  ymmD, ymmD, ymmG  ; ymmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F
+                                  ;       1H 1J 1L 1N 1P 1R 1T 1V 2H 2J 2L 2N 2P 2R 2T 2V)
+
+    vpxor       ymmH, ymmH, ymmH
+
+    vmovdqa     ymmC, ymmA
+    vpunpcklbw  ymmA, ymmA, ymmH  ; ymmA=(00 02 04 06 08 0A 0C 0E 0G 0I 0K 0M 0O 0Q 0S 0U)
+    vpunpckhbw  ymmC, ymmC, ymmH  ; ymmC=(10 12 14 16 18 1A 1C 1E 1G 1I 1K 1M 1O 1Q 1S 1U)
+
+    vmovdqa     ymmB, ymmE
+    vpunpcklbw  ymmE, ymmE, ymmH  ; ymmE=(20 22 24 26 28 2A 2C 2E 2G 2I 2K 2M 2O 2Q 2S 2U)
+    vpunpckhbw  ymmB, ymmB, ymmH  ; ymmB=(01 03 05 07 09 0B 0D 0F 0H 0J 0L 0N 0P 0R 0T 0V)
+
+    vmovdqa     ymmF, ymmD
+    vpunpcklbw  ymmD, ymmD, ymmH  ; ymmD=(11 13 15 17 19 1B 1D 1F 1H 1J 1L 1N 1P 1R 1T 1V)
+    vpunpckhbw  ymmF, ymmF, ymmH  ; ymmF=(21 23 25 27 29 2B 2D 2F 2H 2J 2L 2N 2P 2R 2T 2V)
+
+%else  ; RGB_PIXELSIZE == 4 ; -----------
+
+.column_ld1:
+    test        cl, SIZEOF_XMMWORD/16
+    jz          short .column_ld2
+    sub         rcx, byte SIZEOF_XMMWORD/16
+    vmovd       xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE]
+.column_ld2:
+    test        cl, SIZEOF_XMMWORD/8
+    jz          short .column_ld4
+    sub         rcx, byte SIZEOF_XMMWORD/8
+    vmovq       xmmF, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
+    vpslldq     xmmA, xmmA, SIZEOF_MMWORD
+    vpor        xmmA, xmmA, xmmF
+.column_ld4:
+    test        cl, SIZEOF_XMMWORD/4
+    jz          short .column_ld8
+    sub         rcx, byte SIZEOF_XMMWORD/4
+    vmovdqa     xmmF, xmmA
+    vperm2i128  ymmF, ymmF, ymmF, 1
+    vmovdqu     xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
+    vpor        ymmA, ymmA, ymmF
+.column_ld8:
+    test        cl, SIZEOF_XMMWORD/2
+    jz          short .column_ld16
+    sub         rcx, byte SIZEOF_XMMWORD/2
+    vmovdqa     ymmF, ymmA
+    vmovdqu     ymmA, YMMWORD [rsi+rcx*RGB_PIXELSIZE]
+.column_ld16:
+    test        cl, SIZEOF_XMMWORD
+    mov         rcx, SIZEOF_YMMWORD
+    jz          short .rgb_gray_cnv
+    vmovdqa     ymmE, ymmA
+    vmovdqa     ymmH, ymmF
+    vmovdqu     ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+    vmovdqu     ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
+    jmp         short .rgb_gray_cnv
+
+.columnloop:
+    vmovdqu     ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+    vmovdqu     ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
+    vmovdqu     ymmE, YMMWORD [rsi+2*SIZEOF_YMMWORD]
+    vmovdqu     ymmH, YMMWORD [rsi+3*SIZEOF_YMMWORD]
+
+.rgb_gray_cnv:
+    ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+    ;       04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+    ; ymmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+    ;       0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+    ; ymmE=(0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J
+    ;       0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+    ; ymmH=(0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R
+    ;       0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+
+    vmovdqa     ymmB, ymmA
+    vinserti128 ymmA, ymmA, xmmE, 1     ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+                                        ;       0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J)
+    vperm2i128  ymmE, ymmB, ymmE, 0x31  ; ymmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+                                        ;       0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+
+    vmovdqa     ymmB, ymmF
+    vinserti128 ymmF, ymmF, xmmH, 1     ; ymmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+                                        ;       0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R)
+    vperm2i128  ymmH, ymmB, ymmH, 0x31  ; ymmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F
+                                        ;       0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+
+    vmovdqa     ymmD, ymmA
+    vpunpcklbw  ymmA, ymmA, ymmE      ; ymmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35
+                                      ;       0G 0K 1G 1K 2G 2K 3G 3K 0H 0L 1H 1L 2H 2L 3H 3L)
+    vpunpckhbw  ymmD, ymmD, ymmE      ; ymmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37
+                                      ;       0I 0M 1I 1M 2I 2M 3I 3M 0J 0N 1J 1N 2J 2N 3J 3N)
+
+    vmovdqa     ymmC, ymmF
+    vpunpcklbw  ymmF, ymmF, ymmH      ; ymmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D
+                                      ;       0O 0S 1O 1S 2O 2S 3O 3S 0P 0T 1P 1T 2P 2T 3P 3T)
+    vpunpckhbw  ymmC, ymmC, ymmH      ; ymmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F
+                                      ;       0Q 0U 1Q 1U 2Q 2U 3Q 3U 0R 0V 1R 1V 2R 2V 3R 3V)
+
+    vmovdqa     ymmB, ymmA
+    vpunpcklwd  ymmA, ymmA, ymmF      ; ymmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C
+                                      ;       0G 0K 0O 0S 1G 1K 1O 1S 2G 2K 2O 2S 3G 3K 3O 3S)
+    vpunpckhwd  ymmB, ymmB, ymmF      ; ymmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D
+                                      ;       0H 0L 0P 0T 1H 1L 1P 1T 2H 2L 2P 2T 3H 3L 3P 3T)
+
+    vmovdqa     ymmG, ymmD
+    vpunpcklwd  ymmD, ymmD, ymmC      ; ymmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E
+                                      ;       0I 0M 0Q 0U 1I 1M 1Q 1U 2I 2M 2Q 2U 3I 3M 3Q 3U)
+    vpunpckhwd  ymmG, ymmG, ymmC      ; ymmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F
+                                      ;       0J 0N 0R 0V 1J 1N 1R 1V 2J 2N 2R 2V 3J 3N 3R 3V)
+
+    vmovdqa     ymmE, ymmA
+    vpunpcklbw  ymmA, ymmA, ymmD      ; ymmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E
+                                      ;       0G 0I 0K 0M 0O 0Q 0S 0U 1G 1I 1K 1M 1O 1Q 1S 1U)
+    vpunpckhbw  ymmE, ymmE, ymmD      ; ymmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E
+                                      ;       2G 2I 2K 2M 2O 2Q 2S 2U 3G 3I 3K 3M 3O 3Q 3S 3U)
+
+    vmovdqa     ymmH, ymmB
+    vpunpcklbw  ymmB, ymmB, ymmG      ; ymmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F
+                                      ;       0H 0J 0L 0N 0P 0R 0T 0V 1H 1J 1L 1N 1P 1R 1T 1V)
+    vpunpckhbw  ymmH, ymmH, ymmG      ; ymmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F
+                                      ;       2H 2J 2L 2N 2P 2R 2T 2V 3H 3J 3L 3N 3P 3R 3T 3V)
+
+    vpxor       ymmF, ymmF, ymmF
+
+    vmovdqa     ymmC, ymmA
+    vpunpcklbw  ymmA, ymmA, ymmF      ; ymmA=(00 02 04 06 08 0A 0C 0E 0G 0I 0K 0M 0O 0Q 0S 0U)
+    vpunpckhbw  ymmC, ymmC, ymmF      ; ymmC=(10 12 14 16 18 1A 1C 1E 1G 1I 1K 1M 1O 1Q 1S 1U)
+
+    vmovdqa     ymmD, ymmB
+    vpunpcklbw  ymmB, ymmB, ymmF      ; ymmB=(01 03 05 07 09 0B 0D 0F 0H 0J 0L 0N 0P 0R 0T 0V)
+    vpunpckhbw  ymmD, ymmD, ymmF      ; ymmD=(11 13 15 17 19 1B 1D 1F 1H 1J 1L 1N 1P 1R 1T 1V)
+
+    vmovdqa     ymmG, ymmE
+    vpunpcklbw  ymmE, ymmE, ymmF      ; ymmE=(20 22 24 26 28 2A 2C 2E 2G 2I 2K 2M 2O 2Q 2S 2U)
+    vpunpckhbw  ymmG, ymmG, ymmF      ; ymmG=(30 32 34 36 38 3A 3C 3E 3G 3I 3K 3M 3O 3Q 3S 3U)
+
+    vpunpcklbw  ymmF, ymmF, ymmH
+    vpunpckhbw  ymmH, ymmH, ymmH
+    vpsrlw      ymmF, ymmF, BYTE_BIT  ; ymmF=(21 23 25 27 29 2B 2D 2F 2H 2J 2L 2N 2P 2R 2T 2V)
+    vpsrlw      ymmH, ymmH, BYTE_BIT  ; ymmH=(31 33 35 37 39 3B 3D 3F 3H 3J 3L 3N 3P 3R 3T 3V)
+
+%endif  ; RGB_PIXELSIZE ; ---------------
+
+    ; ymm0=R(02468ACEGIKMOQSU)=RE, ymm2=G(02468ACEGIKMOQSU)=GE, ymm4=B(02468ACEGIKMOQSU)=BE
+    ; ymm1=R(13579BDFHJLNPRTV)=RO, ymm3=G(13579BDFHJLNPRTV)=GO, ymm5=B(13579BDFHJLNPRTV)=BO
+
+    ; (Original)
+    ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
+    ;
+    ; (This implementation)
+    ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+
+    vmovdqa     ymm6, ymm1
+    vpunpcklwd  ymm1, ymm1, ymm3
+    vpunpckhwd  ymm6, ymm6, ymm3
+    vpmaddwd    ymm1, ymm1, [rel PW_F0299_F0337]  ; ymm1=ROL*FIX(0.299)+GOL*FIX(0.337)
+    vpmaddwd    ymm6, ymm6, [rel PW_F0299_F0337]  ; ymm6=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+    vmovdqa     ymm7, ymm6              ; ymm7=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+    vmovdqa     ymm6, ymm0
+    vpunpcklwd  ymm0, ymm0, ymm2
+    vpunpckhwd  ymm6, ymm6, ymm2
+    vpmaddwd    ymm0, ymm0, [rel PW_F0299_F0337]  ; ymm0=REL*FIX(0.299)+GEL*FIX(0.337)
+    vpmaddwd    ymm6, ymm6, [rel PW_F0299_F0337]  ; ymm6=REH*FIX(0.299)+GEH*FIX(0.337)
+
+    vmovdqa     YMMWORD [wk(0)], ymm0   ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
+    vmovdqa     YMMWORD [wk(1)], ymm6   ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)
+
+    vmovdqa     ymm0, ymm5              ; ymm0=BO
+    vmovdqa     ymm6, ymm4              ; ymm6=BE
+
+    vmovdqa     ymm4, ymm0
+    vpunpcklwd  ymm0, ymm0, ymm3
+    vpunpckhwd  ymm4, ymm4, ymm3
+    vpmaddwd    ymm0, ymm0, [rel PW_F0114_F0250]  ; ymm0=BOL*FIX(0.114)+GOL*FIX(0.250)
+    vpmaddwd    ymm4, ymm4, [rel PW_F0114_F0250]  ; ymm4=BOH*FIX(0.114)+GOH*FIX(0.250)
+
+    vmovdqa     ymm3, [rel PD_ONEHALF]            ; ymm3=[PD_ONEHALF]
+
+    vpaddd      ymm0, ymm0, ymm1
+    vpaddd      ymm4, ymm4, ymm7
+    vpaddd      ymm0, ymm0, ymm3
+    vpaddd      ymm4, ymm4, ymm3
+    vpsrld      ymm0, ymm0, SCALEBITS   ; ymm0=YOL
+    vpsrld      ymm4, ymm4, SCALEBITS   ; ymm4=YOH
+    vpackssdw   ymm0, ymm0, ymm4        ; ymm0=YO
+
+    vmovdqa     ymm4, ymm6
+    vpunpcklwd  ymm6, ymm6, ymm2
+    vpunpckhwd  ymm4, ymm4, ymm2
+    vpmaddwd    ymm6, ymm6, [rel PW_F0114_F0250]  ; ymm6=BEL*FIX(0.114)+GEL*FIX(0.250)
+    vpmaddwd    ymm4, ymm4, [rel PW_F0114_F0250]  ; ymm4=BEH*FIX(0.114)+GEH*FIX(0.250)
+
+    vmovdqa     ymm2, [rel PD_ONEHALF]            ; ymm2=[PD_ONEHALF]
+
+    vpaddd      ymm6, ymm6, YMMWORD [wk(0)]
+    vpaddd      ymm4, ymm4, YMMWORD [wk(1)]
+    vpaddd      ymm6, ymm6, ymm2
+    vpaddd      ymm4, ymm4, ymm2
+    vpsrld      ymm6, ymm6, SCALEBITS   ; ymm6=YEL
+    vpsrld      ymm4, ymm4, SCALEBITS   ; ymm4=YEH
+    vpackssdw   ymm6, ymm6, ymm4        ; ymm6=YE
+
+    vpsllw      ymm0, ymm0, BYTE_BIT
+    vpor        ymm6, ymm6, ymm0        ; ymm6=Y
+    vmovdqu     YMMWORD [rdi], ymm6     ; Save Y
+
+    sub         rcx, byte SIZEOF_YMMWORD
+    add         rsi, RGB_PIXELSIZE*SIZEOF_YMMWORD  ; inptr
+    add         rdi, byte SIZEOF_YMMWORD           ; outptr0
+    cmp         rcx, byte SIZEOF_YMMWORD
+    jae         near .columnloop
+    test        rcx, rcx
+    jnz         near .column_ld1
+
+    pop         rcx                     ; col
+    pop         rsi
+    pop         rdi
+
+    add         rsi, byte SIZEOF_JSAMPROW  ; input_buf
+    add         rdi, byte SIZEOF_JSAMPROW
+    dec         rax                        ; num_rows
+    jg          near .rowloop
+
+.return:
+    pop         rbx
+    vzeroupper
+    uncollect_args 5
+    mov         rsp, rbp                ; rsp <- aligned rbp
+    pop         rsp                     ; rsp <- original rbp
+    pop         rbp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/x86_64/jcgryext-sse2.asm b/media/libjpeg/simd/x86_64/jcgryext-sse2.asm
new file mode 100644
index 0000000000..f1d399a63b
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jcgryext-sse2.asm
@@ -0,0 +1,363 @@
+;
+; jcgryext.asm - grayscale colorspace conversion (64-bit SSE2)
+;
+; Copyright (C) 2011, 2016, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_rgb_gray_convert_sse2(JDIMENSION img_width, JSAMPARRAY input_buf,
+;                             JSAMPIMAGE output_buf, JDIMENSION output_row,
+;                             int num_rows);
+;
+
+; r10d = JDIMENSION img_width
+; r11 = JSAMPARRAY input_buf
+; r12 = JSAMPIMAGE output_buf
+; r13d = JDIMENSION output_row
+; r14d = int num_rows
+
+%define wk(i)   rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD  ; xmmword wk[WK_NUM]
+%define WK_NUM  2
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_rgb_gray_convert_sse2)
+
+EXTN(jsimd_rgb_gray_convert_sse2):
+    push        rbp
+    mov         rax, rsp                     ; rax = original rbp
+    sub         rsp, byte 4
+    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [rsp], rax
+    mov         rbp, rsp                     ; rbp = aligned rbp
+    lea         rsp, [wk(0)]
+    collect_args 5
+    push        rbx
+
+    mov         ecx, r10d
+    test        rcx, rcx
+    jz          near .return
+
+    push        rcx
+
+    mov         rsi, r12
+    mov         ecx, r13d
+    mov         rdip, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
+    lea         rdi, [rdi+rcx*SIZEOF_JSAMPROW]
+
+    pop         rcx
+
+    mov         rsi, r11
+    mov         eax, r14d
+    test        rax, rax
+    jle         near .return
+.rowloop:
+    push        rdi
+    push        rsi
+    push        rcx                     ; col
+
+    mov         rsip, JSAMPROW [rsi]    ; inptr
+    mov         rdip, JSAMPROW [rdi]    ; outptr0
+
+    cmp         rcx, byte SIZEOF_XMMWORD
+    jae         near .columnloop
+
+%if RGB_PIXELSIZE == 3  ; ---------------
+
+.column_ld1:
+    push        rax
+    push        rdx
+    lea         rcx, [rcx+rcx*2]        ; imul ecx,RGB_PIXELSIZE
+    test        cl, SIZEOF_BYTE
+    jz          short .column_ld2
+    sub         rcx, byte SIZEOF_BYTE
+    movzx       rax, byte [rsi+rcx]
+.column_ld2:
+    test        cl, SIZEOF_WORD
+    jz          short .column_ld4
+    sub         rcx, byte SIZEOF_WORD
+    movzx       rdx, word [rsi+rcx]
+    shl         rax, WORD_BIT
+    or          rax, rdx
+.column_ld4:
+    movd        xmmA, eax
+    pop         rdx
+    pop         rax
+    test        cl, SIZEOF_DWORD
+    jz          short .column_ld8
+    sub         rcx, byte SIZEOF_DWORD
+    movd        xmmF, XMM_DWORD [rsi+rcx]
+    pslldq      xmmA, SIZEOF_DWORD
+    por         xmmA, xmmF
+.column_ld8:
+    test        cl, SIZEOF_MMWORD
+    jz          short .column_ld16
+    sub         rcx, byte SIZEOF_MMWORD
+    movq        xmmB, XMM_MMWORD [rsi+rcx]
+    pslldq      xmmA, SIZEOF_MMWORD
+    por         xmmA, xmmB
+.column_ld16:
+    test        cl, SIZEOF_XMMWORD
+    jz          short .column_ld32
+    movdqa      xmmF, xmmA
+    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+    mov         rcx, SIZEOF_XMMWORD
+    jmp         short .rgb_gray_cnv
+.column_ld32:
+    test        cl, 2*SIZEOF_XMMWORD
+    mov         rcx, SIZEOF_XMMWORD
+    jz          short .rgb_gray_cnv
+    movdqa      xmmB, xmmA
+    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+    movdqu      xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+    jmp         short .rgb_gray_cnv
+
+.columnloop:
+    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+    movdqu      xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+    movdqu      xmmB, XMMWORD [rsi+2*SIZEOF_XMMWORD]
+
+.rgb_gray_cnv:
+    ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
+    ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+    ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
+
+    movdqa      xmmG, xmmA
+    pslldq      xmmA, 8     ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
+    psrldq      xmmG, 8     ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
+
+    punpckhbw   xmmA, xmmF  ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
+    pslldq      xmmF, 8     ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
+
+    punpcklbw   xmmG, xmmB  ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
+    punpckhbw   xmmF, xmmB  ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
+
+    movdqa      xmmD, xmmA
+    pslldq      xmmA, 8     ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
+    psrldq      xmmD, 8     ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
+
+    punpckhbw   xmmA, xmmG  ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
+    pslldq      xmmG, 8     ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
+
+    punpcklbw   xmmD, xmmF  ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
+    punpckhbw   xmmG, xmmF  ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
+
+    movdqa      xmmE, xmmA
+    pslldq      xmmA, 8     ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
+    psrldq      xmmE, 8     ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
+
+    punpckhbw   xmmA, xmmD  ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
+    pslldq      xmmD, 8     ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
+
+    punpcklbw   xmmE, xmmG  ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
+    punpckhbw   xmmD, xmmG  ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
+
+    pxor        xmmH, xmmH
+
+    movdqa      xmmC, xmmA
+    punpcklbw   xmmA, xmmH  ; xmmA=(00 02 04 06 08 0A 0C 0E)
+    punpckhbw   xmmC, xmmH  ; xmmC=(10 12 14 16 18 1A 1C 1E)
+
+    movdqa      xmmB, xmmE
+    punpcklbw   xmmE, xmmH  ; xmmE=(20 22 24 26 28 2A 2C 2E)
+    punpckhbw   xmmB, xmmH  ; xmmB=(01 03 05 07 09 0B 0D 0F)
+
+    movdqa      xmmF, xmmD
+    punpcklbw   xmmD, xmmH  ; xmmD=(11 13 15 17 19 1B 1D 1F)
+    punpckhbw   xmmF, xmmH  ; xmmF=(21 23 25 27 29 2B 2D 2F)
+
+%else  ; RGB_PIXELSIZE == 4 ; -----------
+
+.column_ld1:
+    test        cl, SIZEOF_XMMWORD/16
+    jz          short .column_ld2
+    sub         rcx, byte SIZEOF_XMMWORD/16
+    movd        xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE]
+.column_ld2:
+    test        cl, SIZEOF_XMMWORD/8
+    jz          short .column_ld4
+    sub         rcx, byte SIZEOF_XMMWORD/8
+    movq        xmmE, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
+    pslldq      xmmA, SIZEOF_MMWORD
+    por         xmmA, xmmE
+.column_ld4:
+    test        cl, SIZEOF_XMMWORD/4
+    jz          short .column_ld8
+    sub         rcx, byte SIZEOF_XMMWORD/4
+    movdqa      xmmE, xmmA
+    movdqu      xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
+.column_ld8:
+    test        cl, SIZEOF_XMMWORD/2
+    mov         rcx, SIZEOF_XMMWORD
+    jz          short .rgb_gray_cnv
+    movdqa      xmmF, xmmA
+    movdqa      xmmH, xmmE
+    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+    movdqu      xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+    jmp         short .rgb_gray_cnv
+
+.columnloop:
+    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+    movdqu      xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+    movdqu      xmmF, XMMWORD [rsi+2*SIZEOF_XMMWORD]
+    movdqu      xmmH, XMMWORD [rsi+3*SIZEOF_XMMWORD]
+
+.rgb_gray_cnv:
+    ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
+    ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+    ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
+    ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+
+    movdqa      xmmD, xmmA
+    punpcklbw   xmmA, xmmE      ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
+    punpckhbw   xmmD, xmmE      ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
+
+    movdqa      xmmC, xmmF
+    punpcklbw   xmmF, xmmH      ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
+    punpckhbw   xmmC, xmmH      ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
+
+    movdqa      xmmB, xmmA
+    punpcklwd   xmmA, xmmF      ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
+    punpckhwd   xmmB, xmmF      ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
+
+    movdqa      xmmG, xmmD
+    punpcklwd   xmmD, xmmC      ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
+    punpckhwd   xmmG, xmmC      ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
+
+    movdqa      xmmE, xmmA
+    punpcklbw   xmmA, xmmD      ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
+    punpckhbw   xmmE, xmmD      ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
+
+    movdqa      xmmH, xmmB
+    punpcklbw   xmmB, xmmG      ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
+    punpckhbw   xmmH, xmmG      ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
+
+    pxor        xmmF, xmmF
+
+    movdqa      xmmC, xmmA
+    punpcklbw   xmmA, xmmF      ; xmmA=(00 02 04 06 08 0A 0C 0E)
+    punpckhbw   xmmC, xmmF      ; xmmC=(10 12 14 16 18 1A 1C 1E)
+
+    movdqa      xmmD, xmmB
+    punpcklbw   xmmB, xmmF      ; xmmB=(01 03 05 07 09 0B 0D 0F)
+    punpckhbw   xmmD, xmmF      ; xmmD=(11 13 15 17 19 1B 1D 1F)
+
+    movdqa      xmmG, xmmE
+    punpcklbw   xmmE, xmmF      ; xmmE=(20 22 24 26 28 2A 2C 2E)
+    punpckhbw   xmmG, xmmF      ; xmmG=(30 32 34 36 38 3A 3C 3E)
+
+    punpcklbw   xmmF, xmmH
+    punpckhbw   xmmH, xmmH
+    psrlw       xmmF, BYTE_BIT  ; xmmF=(21 23 25 27 29 2B 2D 2F)
+    psrlw       xmmH, BYTE_BIT  ; xmmH=(31 33 35 37 39 3B 3D 3F)
+
+%endif  ; RGB_PIXELSIZE ; ---------------
+
+    ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
+    ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
+
+    ; (Original)
+    ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
+    ;
+    ; (This implementation)
+    ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+
+    movdqa      xmm6, xmm1
+    punpcklwd   xmm1, xmm3
+    punpckhwd   xmm6, xmm3
+    pmaddwd     xmm1, [rel PW_F0299_F0337]  ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
+    pmaddwd     xmm6, [rel PW_F0299_F0337]  ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+    movdqa      xmm7, xmm6              ; xmm7=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+    movdqa      xmm6, xmm0
+    punpcklwd   xmm0, xmm2
+    punpckhwd   xmm6, xmm2
+    pmaddwd     xmm0, [rel PW_F0299_F0337]  ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
+    pmaddwd     xmm6, [rel PW_F0299_F0337]  ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
+
+    movdqa      XMMWORD [wk(0)], xmm0   ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
+    movdqa      XMMWORD [wk(1)], xmm6   ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)
+
+    movdqa      xmm0, xmm5              ; xmm0=BO
+    movdqa      xmm6, xmm4              ; xmm6=BE
+
+    movdqa      xmm4, xmm0
+    punpcklwd   xmm0, xmm3
+    punpckhwd   xmm4, xmm3
+    pmaddwd     xmm0, [rel PW_F0114_F0250]  ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
+    pmaddwd     xmm4, [rel PW_F0114_F0250]  ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
+
+    movdqa      xmm3, [rel PD_ONEHALF]      ; xmm3=[PD_ONEHALF]
+
+    paddd       xmm0, xmm1
+    paddd       xmm4, xmm7
+    paddd       xmm0, xmm3
+    paddd       xmm4, xmm3
+    psrld       xmm0, SCALEBITS         ; xmm0=YOL
+    psrld       xmm4, SCALEBITS         ; xmm4=YOH
+    packssdw    xmm0, xmm4              ; xmm0=YO
+
+    movdqa      xmm4, xmm6
+    punpcklwd   xmm6, xmm2
+    punpckhwd   xmm4, xmm2
+    pmaddwd     xmm6, [rel PW_F0114_F0250]  ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
+    pmaddwd     xmm4, [rel PW_F0114_F0250]  ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
+
+    movdqa      xmm2, [rel PD_ONEHALF]      ; xmm2=[PD_ONEHALF]
+
+    paddd       xmm6, XMMWORD [wk(0)]
+    paddd       xmm4, XMMWORD [wk(1)]
+    paddd       xmm6, xmm2
+    paddd       xmm4, xmm2
+    psrld       xmm6, SCALEBITS         ; xmm6=YEL
+    psrld       xmm4, SCALEBITS         ; xmm4=YEH
+    packssdw    xmm6, xmm4              ; xmm6=YE
+
+    psllw       xmm0, BYTE_BIT
+    por         xmm6, xmm0              ; xmm6=Y
+    movdqa      XMMWORD [rdi], xmm6     ; Save Y
+
+    sub         rcx, byte SIZEOF_XMMWORD
+    add         rsi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; inptr
+    add         rdi, byte SIZEOF_XMMWORD                ; outptr0
+    cmp         rcx, byte SIZEOF_XMMWORD
+    jae         near .columnloop
+    test        rcx, rcx
+    jnz         near .column_ld1
+
+    pop         rcx                     ; col
+    pop         rsi
+    pop         rdi
+
+    add         rsi, byte SIZEOF_JSAMPROW  ; input_buf
+    add         rdi, byte SIZEOF_JSAMPROW
+    dec         rax                        ; num_rows
+    jg          near .rowloop
+
+.return:
+    pop         rbx
+    uncollect_args 5
+    mov         rsp, rbp                ; rsp <- aligned rbp
+    pop         rsp                     ; rsp <- original rbp
+    pop         rbp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/x86_64/jchuff-sse2.asm b/media/libjpeg/simd/x86_64/jchuff-sse2.asm
new file mode 100644
index 0000000000..9ea6df946e
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jchuff-sse2.asm
@@ -0,0 +1,583 @@
+;
+; jchuff-sse2.asm - Huffman entropy encoding (64-bit SSE2)
+;
+; Copyright (C) 2009-2011, 2014-2016, 2019, 2021, D. R. Commander.
+; Copyright (C) 2015, Matthieu Darbois.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains an SSE2 implementation for Huffman coding of one block.
+; The following code is based on jchuff.c; see jchuff.c for more details.
+
+%include "jsimdext.inc"
+
+struc working_state
+.next_output_byte:   resp 1     ; => next byte to write in buffer
+.free_in_buffer:     resp 1     ; # of byte spaces remaining in buffer
+.cur.put_buffer.simd resq 1     ; current bit accumulation buffer
+.cur.free_bits       resd 1     ; # of bits available in it
+.cur.last_dc_val     resd 4     ; last DC coef for each component
+.cinfo:              resp 1     ; dump_buffer needs access to this
+endstruc
+
+struc c_derived_tbl
+.ehufco:             resd 256   ; code for each symbol
+.ehufsi:             resb 256   ; length of code for each symbol
+; If no code has been allocated for a symbol S, ehufsi[S] contains 0
+endstruc
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_huff_encode_one_block)
+
+EXTN(jconst_huff_encode_one_block):
+
+jpeg_mask_bits dd 0x0000, 0x0001, 0x0003, 0x0007
+               dd 0x000f, 0x001f, 0x003f, 0x007f
+               dd 0x00ff, 0x01ff, 0x03ff, 0x07ff
+               dd 0x0fff, 0x1fff, 0x3fff, 0x7fff
+
+    alignz      32
+
+times 1 << 14 db 15
+times 1 << 13 db 14
+times 1 << 12 db 13
+times 1 << 11 db 12
+times 1 << 10 db 11
+times 1 <<  9 db 10
+times 1 <<  8 db  9
+times 1 <<  7 db  8
+times 1 <<  6 db  7
+times 1 <<  5 db  6
+times 1 <<  4 db  5
+times 1 <<  3 db  4
+times 1 <<  2 db  3
+times 1 <<  1 db  2
+times 1 <<  0 db  1
+times 1       db  0
+jpeg_nbits_table:
+times 1       db  0
+times 1 <<  0 db  1
+times 1 <<  1 db  2
+times 1 <<  2 db  3
+times 1 <<  3 db  4
+times 1 <<  4 db  5
+times 1 <<  5 db  6
+times 1 <<  6 db  7
+times 1 <<  7 db  8
+times 1 <<  8 db  9
+times 1 <<  9 db 10
+times 1 << 10 db 11
+times 1 << 11 db 12
+times 1 << 12 db 13
+times 1 << 13 db 14
+times 1 << 14 db 15
+times 1 << 15 db 16
+
+    alignz      32
+
+%define NBITS(x)      nbits_base + x
+%define MASK_BITS(x)  NBITS((x) * 4) + (jpeg_mask_bits - jpeg_nbits_table)
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        64
+
+; Shorthand used to describe SIMD operations:
+; wN:  xmmN treated as eight signed 16-bit values
+; wN[i]:  perform the same operation on all eight signed 16-bit values, i=0..7
+; bN:  xmmN treated as 16 unsigned 8-bit values
+; bN[i]:  perform the same operation on all 16 unsigned 8-bit values, i=0..15
+; Contents of SIMD registers are shown in memory order.
+
+; Fill the bit buffer to capacity with the leading bits from code, then output
+; the bit buffer and put the remaining bits from code into the bit buffer.
+;
+; Usage:
+; code - contains the bits to shift into the bit buffer (LSB-aligned)
+; %1 - the label to which to jump when the macro completes
+; %2 (optional) - extra instructions to execute after nbits has been set
+;
+; Upon completion, free_bits will be set to the number of remaining bits from
+; code, and put_buffer will contain those remaining bits.  temp and code will
+; be clobbered.
+;
+; This macro encodes any 0xFF bytes as 0xFF 0x00, as does the EMIT_BYTE()
+; macro in jchuff.c.
+
+%macro EMIT_QWORD 1-2
+    add         nbitsb, free_bitsb      ; nbits += free_bits;
+    neg         free_bitsb              ; free_bits = -free_bits;
+    mov         tempd, code             ; temp = code;
+    shl         put_buffer, nbitsb      ; put_buffer <<= nbits;
+    mov         nbitsb, free_bitsb      ; nbits = free_bits;
+    neg         free_bitsb              ; free_bits = -free_bits;
+    shr         tempd, nbitsb           ; temp >>= nbits;
+    or          tempq, put_buffer       ; temp |= put_buffer;
+    movq        xmm0, tempq             ; xmm0.u64 = { temp, 0 };
+    bswap       tempq                   ; temp = htonl(temp);
+    mov         put_buffer, codeq       ; put_buffer = code;
+    pcmpeqb     xmm0, xmm1              ; b0[i] = (b0[i] == 0xFF ? 0xFF : 0);
+    %2
+    pmovmskb    code, xmm0              ; code = 0;  code |= ((b0[i] >> 7) << i);
+    mov         qword [buffer], tempq   ; memcpy(buffer, &temp, 8);
+                                        ; (speculative; will be overwritten if
+                                        ; code contains any 0xFF bytes)
+    add         free_bitsb, 64          ; free_bits += 64;
+    add         bufferp, 8              ; buffer += 8;
+    test        code, code              ; if (code == 0)  /* No 0xFF bytes */
+    jz          %1                      ;   return;
+    ; Execute the equivalent of the EMIT_BYTE() macro in jchuff.c for all 8
+    ; bytes in the qword.
+    cmp         tempb, 0xFF             ; Set CF if temp[0] < 0xFF
+    mov         byte [buffer-7], 0      ; buffer[-7] = 0;
+    sbb         bufferp, 6              ; buffer -= (6 + (temp[0] < 0xFF ? 1 : 0));
+    mov         byte [buffer], temph    ; buffer[0] = temp[1];
+    cmp         temph, 0xFF             ; Set CF if temp[1] < 0xFF
+    mov         byte [buffer+1], 0      ; buffer[1] = 0;
+    sbb         bufferp, -2             ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0));
+    shr         tempq, 16               ; temp >>= 16;
+    mov         byte [buffer], tempb    ; buffer[0] = temp[0];
+    cmp         tempb, 0xFF             ; Set CF if temp[0] < 0xFF
+    mov         byte [buffer+1], 0      ; buffer[1] = 0;
+    sbb         bufferp, -2             ; buffer -= (-2 + (temp[0] < 0xFF ? 1 : 0));
+    mov         byte [buffer], temph    ; buffer[0] = temp[1];
+    cmp         temph, 0xFF             ; Set CF if temp[1] < 0xFF
+    mov         byte [buffer+1], 0      ; buffer[1] = 0;
+    sbb         bufferp, -2             ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0));
+    shr         tempq, 16               ; temp >>= 16;
+    mov         byte [buffer], tempb    ; buffer[0] = temp[0];
+    cmp         tempb, 0xFF             ; Set CF if temp[0] < 0xFF
+    mov         byte [buffer+1], 0      ; buffer[1] = 0;
+    sbb         bufferp, -2             ; buffer -= (-2 + (temp[0] < 0xFF ? 1 : 0));
+    mov         byte [buffer], temph    ; buffer[0] = temp[1];
+    cmp         temph, 0xFF             ; Set CF if temp[1] < 0xFF
+    mov         byte [buffer+1], 0      ; buffer[1] = 0;
+    sbb         bufferp, -2             ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0));
+    shr         tempd, 16               ; temp >>= 16;
+    mov         byte [buffer], tempb    ; buffer[0] = temp[0];
+    cmp         tempb, 0xFF             ; Set CF if temp[0] < 0xFF
+    mov         byte [buffer+1], 0      ; buffer[1] = 0;
+    sbb         bufferp, -2             ; buffer -= (-2 + (temp[0] < 0xFF ? 1 : 0));
+    mov         byte [buffer], temph    ; buffer[0] = temp[1];
+    cmp         temph, 0xFF             ; Set CF if temp[1] < 0xFF
+    mov         byte [buffer+1], 0      ; buffer[1] = 0;
+    sbb         bufferp, -2             ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0));
+    jmp         %1                      ; return;
+%endmacro
+
+;
+; Encode a single block's worth of coefficients.
+;
+; GLOBAL(JOCTET *)
+; jsimd_huff_encode_one_block_sse2(working_state *state, JOCTET *buffer,
+;                                  JCOEFPTR block, int last_dc_val,
+;                                  c_derived_tbl *dctbl, c_derived_tbl *actbl)
+;
+; NOTES:
+; When shuffling data, we try to avoid pinsrw as much as possible, since it is
+; slow on many CPUs.  Its reciprocal throughput (issue latency) is 1 even on
+; modern CPUs, so chains of pinsrw instructions (even with different outputs)
+; can limit performance.  pinsrw is a VectorPath instruction on AMD K8 and
+; requires 2 µops (with memory operand) on Intel.  In either case, only one
+; pinsrw instruction can be decoded per cycle (and nothing else if they are
+; back-to-back), so out-of-order execution cannot be used to work around long
+; pinsrw chains (though for Sandy Bridge and later, this may be less of a
+; problem if the code runs from the µop cache.)
+;
+; We use tzcnt instead of bsf without checking for support.  The instruction is
+; executed as bsf on CPUs that don't support tzcnt (encoding is equivalent to
+; rep bsf.)  The destination (first) operand of bsf (and tzcnt on some CPUs) is
+; an input dependency (although the behavior is not formally defined, Intel
+; CPUs usually leave the destination unmodified if the source is zero.)  This
+; can prevent out-of-order execution, so we clear the destination before
+; invoking tzcnt.
+;
+; Initial register allocation
+; rax - buffer
+; rbx - temp
+; rcx - nbits
+; rdx - block --> free_bits
+; rsi - nbits_base
+; rdi - t
+; rbp - code
+; r8  - dctbl --> code_temp
+; r9  - actbl
+; r10 - state
+; r11 - index
+; r12 - put_buffer
+
+%define buffer       rax
+%ifdef WIN64
+%define bufferp      rax
+%else
+%define bufferp      raxp
+%endif
+%define tempq        rbx
+%define tempd        ebx
+%define tempb        bl
+%define temph        bh
+%define nbitsq       rcx
+%define nbits        ecx
+%define nbitsb       cl
+%define block        rdx
+%define nbits_base   rsi
+%define t            rdi
+%define td           edi
+%define codeq        rbp
+%define code         ebp
+%define dctbl        r8
+%define actbl        r9
+%define state        r10
+%define index        r11
+%define indexd       r11d
+%define put_buffer   r12
+%define put_bufferd  r12d
+
+; Step 1: Re-arrange input data according to jpeg_natural_order
+; xx 01 02 03 04 05 06 07      xx 01 08 16 09 02 03 10
+; 08 09 10 11 12 13 14 15      17 24 32 25 18 11 04 05
+; 16 17 18 19 20 21 22 23      12 19 26 33 40 48 41 34
+; 24 25 26 27 28 29 30 31 ==>  27 20 13 06 07 14 21 28
+; 32 33 34 35 36 37 38 39      35 42 49 56 57 50 43 36
+; 40 41 42 43 44 45 46 47      29 22 15 23 30 37 44 51
+; 48 49 50 51 52 53 54 55      58 59 52 45 38 31 39 46
+; 56 57 58 59 60 61 62 63      53 60 61 54 47 55 62 63
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_huff_encode_one_block_sse2)
+
+EXTN(jsimd_huff_encode_one_block_sse2):
+
+%ifdef WIN64
+
+; rcx = working_state *state
+; rdx = JOCTET *buffer
+; r8 = JCOEFPTR block
+; r9 = int last_dc_val
+; [rax+48] = c_derived_tbl *dctbl
+; [rax+56] = c_derived_tbl *actbl
+
+                                                          ;X: X = code stream
+    mov         buffer, rdx
+    mov         block, r8
+    movups      xmm3, XMMWORD [block + 0 * SIZEOF_WORD]   ;D: w3 = xx 01 02 03 04 05 06 07
+    push        rbx
+    push        rbp
+    movdqa      xmm0, xmm3                                ;A: w0 = xx 01 02 03 04 05 06 07
+    push        rsi
+    push        rdi
+    push        r12
+    movups      xmm1, XMMWORD [block + 8 * SIZEOF_WORD]   ;B: w1 = 08 09 10 11 12 13 14 15
+    mov         state, rcx
+    movsx       code, word [block]                        ;Z:     code = block[0];
+    pxor        xmm4, xmm4                                ;A: w4[i] = 0;
+    sub         code, r9d                                 ;Z:     code -= last_dc_val;
+    mov         dctbl, POINTER [rsp+6*8+4*8]
+    mov         actbl, POINTER [rsp+6*8+5*8]
+    punpckldq   xmm0, xmm1                                ;A: w0 = xx 01 08 09 02 03 10 11
+    lea         nbits_base, [rel jpeg_nbits_table]
+    add         rsp, -DCTSIZE2 * SIZEOF_WORD
+    mov         t, rsp
+
+%else
+
+; rdi = working_state *state
+; rsi = JOCTET *buffer
+; rdx = JCOEFPTR block
+; rcx = int last_dc_val
+; r8 = c_derived_tbl *dctbl
+; r9 = c_derived_tbl *actbl
+
+                                                          ;X: X = code stream
+    movups      xmm3, XMMWORD [block + 0 * SIZEOF_WORD]   ;D: w3 = xx 01 02 03 04 05 06 07
+    push        rbx
+    push        rbp
+    movdqa      xmm0, xmm3                                ;A: w0 = xx 01 02 03 04 05 06 07
+    push        r12
+    mov         state, rdi
+    mov         buffer, rsi
+    movups      xmm1, XMMWORD [block + 8 * SIZEOF_WORD]   ;B: w1 = 08 09 10 11 12 13 14 15
+    movsx       codeq, word [block]                       ;Z:     code = block[0];
+    lea         nbits_base, [rel jpeg_nbits_table]
+    pxor        xmm4, xmm4                                ;A: w4[i] = 0;
+    sub         codeq, rcx                                ;Z:     code -= last_dc_val;
+    punpckldq   xmm0, xmm1                                ;A: w0 = xx 01 08 09 02 03 10 11
+    lea         t, [rsp - DCTSIZE2 * SIZEOF_WORD]         ;   use red zone for t_
+
+%endif
+
+    pshuflw     xmm0, xmm0, 11001001b                     ;A: w0 = 01 08 xx 09 02 03 10 11
+    pinsrw      xmm0, word [block + 16 * SIZEOF_WORD], 2  ;A: w0 = 01 08 16 09 02 03 10 11
+    punpckhdq   xmm3, xmm1                                ;D: w3 = 04 05 12 13 06 07 14 15
+    punpcklqdq  xmm1, xmm3                                ;B: w1 = 08 09 10 11 04 05 12 13
+    pinsrw      xmm0, word [block + 17 * SIZEOF_WORD], 7  ;A: w0 = 01 08 16 09 02 03 10 17
+                                                          ;A:      (Row 0, offset 1)
+    pcmpgtw     xmm4, xmm0                                ;A: w4[i] = (w0[i] < 0 ? -1 : 0);
+    paddw       xmm0, xmm4                                ;A: w0[i] += w4[i];
+    movaps      XMMWORD [t + 0 * SIZEOF_WORD], xmm0       ;A: t[i] = w0[i];
+
+    movq        xmm2, qword [block + 24 * SIZEOF_WORD]    ;B: w2 = 24 25 26 27 -- -- -- --
+    pshuflw     xmm2, xmm2, 11011000b                     ;B: w2 = 24 26 25 27 -- -- -- --
+    pslldq      xmm1, 1 * SIZEOF_WORD                     ;B: w1 = -- 08 09 10 11 04 05 12
+    movups      xmm5, XMMWORD [block + 48 * SIZEOF_WORD]  ;H: w5 = 48 49 50 51 52 53 54 55
+    movsd       xmm1, xmm2                                ;B: w1 = 24 26 25 27 11 04 05 12
+    punpcklqdq  xmm2, xmm5                                ;C: w2 = 24 26 25 27 48 49 50 51
+    pinsrw      xmm1, word [block + 32 * SIZEOF_WORD], 1  ;B: w1 = 24 32 25 27 11 04 05 12
+    pxor        xmm4, xmm4                                ;A: w4[i] = 0;
+    psrldq      xmm3, 2 * SIZEOF_WORD                     ;D: w3 = 12 13 06 07 14 15 -- --
+    pcmpeqw     xmm0, xmm4                                ;A: w0[i] = (w0[i] == 0 ? -1 : 0);
+    pinsrw      xmm1, word [block + 18 * SIZEOF_WORD], 3  ;B: w1 = 24 32 25 18 11 04 05 12
+                                                          ;        (Row 1, offset 1)
+    pcmpgtw     xmm4, xmm1                                ;B: w4[i] = (w1[i] < 0 ? -1 : 0);
+    paddw       xmm1, xmm4                                ;B: w1[i] += w4[i];
+    movaps      XMMWORD [t + 8 * SIZEOF_WORD], xmm1       ;B: t[i+8] = w1[i];
+    pxor        xmm4, xmm4                                ;B: w4[i] = 0;
+    pcmpeqw     xmm1, xmm4                                ;B: w1[i] = (w1[i] == 0 ? -1 : 0);
+
+    packsswb    xmm0, xmm1                                ;AB: b0[i] = w0[i], b0[i+8] = w1[i]
+                                                          ;    w/ signed saturation
+
+    pinsrw      xmm3, word [block + 20 * SIZEOF_WORD], 0  ;D: w3 = 20 13 06 07 14 15 -- --
+    pinsrw      xmm3, word [block + 21 * SIZEOF_WORD], 5  ;D: w3 = 20 13 06 07 14 21 -- --
+    pinsrw      xmm3, word [block + 28 * SIZEOF_WORD], 6  ;D: w3 = 20 13 06 07 14 21 28 --
+    pinsrw      xmm3, word [block + 35 * SIZEOF_WORD], 7  ;D: w3 = 20 13 06 07 14 21 28 35
+                                                          ;        (Row 3, offset 1)
+    pcmpgtw     xmm4, xmm3                                ;D: w4[i] = (w3[i] < 0 ? -1 : 0);
+    paddw       xmm3, xmm4                                ;D: w3[i] += w4[i];
+    movaps      XMMWORD [t + 24 * SIZEOF_WORD], xmm3      ;D: t[i+24] = w3[i];
+    pxor        xmm4, xmm4                                ;D: w4[i] = 0;
+    pcmpeqw     xmm3, xmm4                                ;D: w3[i] = (w3[i] == 0 ? -1 : 0);
+
+    pinsrw      xmm2, word [block + 19 * SIZEOF_WORD], 0  ;C: w2 = 19 26 25 27 48 49 50 51
+    cmp         code, 1 << 31                             ;Z:     Set CF if code < 0x80000000,
+                                                          ;Z:     i.e. if code is positive
+    pinsrw      xmm2, word [block + 33 * SIZEOF_WORD], 2  ;C: w2 = 19 26 33 27 48 49 50 51
+    pinsrw      xmm2, word [block + 40 * SIZEOF_WORD], 3  ;C: w2 = 19 26 33 40 48 49 50 51
+    adc         code, -1                                  ;Z:     code += -1 + (code >= 0 ? 1 : 0);
+    pinsrw      xmm2, word [block + 41 * SIZEOF_WORD], 5  ;C: w2 = 19 26 33 40 48 41 50 51
+    pinsrw      xmm2, word [block + 34 * SIZEOF_WORD], 6  ;C: w2 = 19 26 33 40 48 41 34 51
+    movsxd      codeq, code                               ;Z:     sign extend code
+    pinsrw      xmm2, word [block + 27 * SIZEOF_WORD], 7  ;C: w2 = 19 26 33 40 48 41 34 27
+                                                          ;        (Row 2, offset 1)
+    pcmpgtw     xmm4, xmm2                                ;C: w4[i] = (w2[i] < 0 ? -1 : 0);
+    paddw       xmm2, xmm4                                ;C: w2[i] += w4[i];
+    movaps      XMMWORD [t + 16 * SIZEOF_WORD], xmm2      ;C: t[i+16] = w2[i];
+    pxor        xmm4, xmm4                                ;C: w4[i] = 0;
+    pcmpeqw     xmm2, xmm4                                ;C: w2[i] = (w2[i] == 0 ? -1 : 0);
+
+    packsswb    xmm2, xmm3                                ;CD: b2[i] = w2[i], b2[i+8] = w3[i]
+                                                          ;    w/ signed saturation
+
+    movzx       nbitsq, byte [NBITS(codeq)]               ;Z:     nbits = JPEG_NBITS(code);
+    movdqa      xmm3, xmm5                                ;H: w3 = 48 49 50 51 52 53 54 55
+    pmovmskb    tempd, xmm2                               ;Z:     temp = 0;  temp |= ((b2[i] >> 7) << i);
+    pmovmskb    put_bufferd, xmm0                         ;Z:     put_buffer = 0;  put_buffer |= ((b0[i] >> 7) << i);
+    movups      xmm0, XMMWORD [block + 56 * SIZEOF_WORD]  ;H: w0 = 56 57 58 59 60 61 62 63
+    punpckhdq   xmm3, xmm0                                ;H: w3 = 52 53 60 61 54 55 62 63
+    shl         tempd, 16                                 ;Z:     temp <<= 16;
+    psrldq      xmm3, 1 * SIZEOF_WORD                     ;H: w3 = 53 60 61 54 55 62 63 --
+    pxor        xmm2, xmm2                                ;H: w2[i] = 0;
+    or          put_bufferd, tempd                        ;Z:     put_buffer |= temp;
+    pshuflw     xmm3, xmm3, 00111001b                     ;H: w3 = 60 61 54 53 55 62 63 --
+    movq        xmm1, qword [block + 44 * SIZEOF_WORD]    ;G: w1 = 44 45 46 47 -- -- -- --
+    unpcklps    xmm5, xmm0                                ;E: w5 = 48 49 56 57 50 51 58 59
+    pxor        xmm0, xmm0                                ;H: w0[i] = 0;
+    pinsrw      xmm3, word [block + 47 * SIZEOF_WORD], 3  ;H: w3 = 60 61 54 47 55 62 63 --
+                                                          ;        (Row 7, offset 1)
+    pcmpgtw     xmm2, xmm3                                ;H: w2[i] = (w3[i] < 0 ? -1 : 0);
+    paddw       xmm3, xmm2                                ;H: w3[i] += w2[i];
+    movaps      XMMWORD [t + 56 * SIZEOF_WORD], xmm3      ;H: t[i+56] = w3[i];
+    movq        xmm4, qword [block + 36 * SIZEOF_WORD]    ;G: w4 = 36 37 38 39 -- -- -- --
+    pcmpeqw     xmm3, xmm0                                ;H: w3[i] = (w3[i] == 0 ? -1 : 0);
+    punpckldq   xmm4, xmm1                                ;G: w4 = 36 37 44 45 38 39 46 47
+    mov         tempd, [dctbl + c_derived_tbl.ehufco + nbitsq * 4]
+                                                          ;Z:     temp = dctbl->ehufco[nbits];
+    movdqa      xmm1, xmm4                                ;F: w1 = 36 37 44 45 38 39 46 47
+    psrldq      xmm4, 1 * SIZEOF_WORD                     ;G: w4 = 37 44 45 38 39 46 47 --
+    shufpd      xmm1, xmm5, 10b                           ;F: w1 = 36 37 44 45 50 51 58 59
+    and         code, dword [MASK_BITS(nbitsq)]           ;Z:     code &= (1 << nbits) - 1;
+    pshufhw     xmm4, xmm4, 11010011b                     ;G: w4 = 37 44 45 38 -- 39 46 --
+    pslldq      xmm1, 1 * SIZEOF_WORD                     ;F: w1 = -- 36 37 44 45 50 51 58
+    shl         tempq, nbitsb                             ;Z:     temp <<= nbits;
+    pinsrw      xmm4, word [block + 59 * SIZEOF_WORD], 0  ;G: w4 = 59 44 45 38 -- 39 46 --
+    pshufd      xmm1, xmm1, 11011000b                     ;F: w1 = -- 36 45 50 37 44 51 58
+    pinsrw      xmm4, word [block + 52 * SIZEOF_WORD], 1  ;G: w4 = 59 52 45 38 -- 39 46 --
+    or          code, tempd                               ;Z:     code |= temp;
+    movlps      xmm1, qword [block + 20 * SIZEOF_WORD]    ;F: w1 = 20 21 22 23 37 44 51 58
+    pinsrw      xmm4, word [block + 31 * SIZEOF_WORD], 4  ;G: w4 = 59 52 45 38 31 39 46 --
+    pshuflw     xmm1, xmm1, 01110010b                     ;F: w1 = 22 20 23 21 37 44 51 58
+    pinsrw      xmm4, word [block + 53 * SIZEOF_WORD], 7  ;G: w4 = 59 52 45 38 31 39 46 53
+                                                          ;        (Row 6, offset 1)
+    pxor        xmm2, xmm2                                ;G: w2[i] = 0;
+    pcmpgtw     xmm0, xmm4                                ;G: w0[i] = (w4[i] < 0 ? -1 : 0);
+    pinsrw      xmm1, word [block + 15 * SIZEOF_WORD], 1  ;F: w1 = 22 15 23 21 37 44 51 58
+    paddw       xmm4, xmm0                                ;G: w4[i] += w0[i];
+    movaps      XMMWORD [t + 48 * SIZEOF_WORD], xmm4      ;G: t[48+i] = w4[i];
+    pinsrw      xmm1, word [block + 30 * SIZEOF_WORD], 3  ;F: w1 = 22 15 23 30 37 44 51 58
+                                                          ;        (Row 5, offset 1)
+    pcmpeqw     xmm4, xmm2                                ;G: w4[i] = (w4[i] == 0 ? -1 : 0);
+    pinsrw      xmm5, word [block + 42 * SIZEOF_WORD], 0  ;E: w5 = 42 49 56 57 50 51 58 59
+
+    packsswb    xmm4, xmm3                                ;GH: b4[i] = w4[i], b4[i+8] = w3[i]
+                                                          ;    w/ signed saturation
+
+    pxor        xmm0, xmm0                                ;F: w0[i] = 0;
+    pinsrw      xmm5, word [block + 43 * SIZEOF_WORD], 5  ;E: w5 = 42 49 56 57 50 43 58 59
+    pcmpgtw     xmm2, xmm1                                ;F: w2[i] = (w1[i] < 0 ? -1 : 0);
+    pmovmskb    tempd, xmm4                               ;Z:     temp = 0;  temp |= ((b4[i] >> 7) << i);
+    pinsrw      xmm5, word [block + 36 * SIZEOF_WORD], 6  ;E: w5 = 42 49 56 57 50 43 36 59
+    paddw       xmm1, xmm2                                ;F: w1[i] += w2[i];
+    movaps      XMMWORD [t + 40 * SIZEOF_WORD], xmm1      ;F: t[40+i] = w1[i];
+    pinsrw      xmm5, word [block + 29 * SIZEOF_WORD], 7  ;E: w5 = 42 49 56 57 50 43 36 29
+                                                          ;        (Row 4, offset 1)
+%undef block
+%define free_bitsq  rdx
+%define free_bitsd  edx
+%define free_bitsb  dl
+    pcmpeqw     xmm1, xmm0                                ;F: w1[i] = (w1[i] == 0 ? -1 : 0);
+    shl         tempq, 48                                 ;Z:     temp <<= 48;
+    pxor        xmm2, xmm2                                ;E: w2[i] = 0;
+    pcmpgtw     xmm0, xmm5                                ;E: w0[i] = (w5[i] < 0 ? -1 : 0);
+    paddw       xmm5, xmm0                                ;E: w5[i] += w0[i];
+    or          tempq, put_buffer                         ;Z:     temp |= put_buffer;
+    movaps      XMMWORD [t + 32 * SIZEOF_WORD], xmm5      ;E: t[32+i] = w5[i];
+    lea         t, [dword t - 2]                          ;Z:     t = &t[-1];
+    pcmpeqw     xmm5, xmm2                                ;E: w5[i] = (w5[i] == 0 ? -1 : 0);
+
+    packsswb    xmm5, xmm1                                ;EF: b5[i] = w5[i], b5[i+8] = w1[i]
+                                                          ;    w/ signed saturation
+
+    add         nbitsb, byte [dctbl + c_derived_tbl.ehufsi + nbitsq]
+                                                          ;Z:     nbits += dctbl->ehufsi[nbits];
+%undef dctbl
+%define code_temp  r8d
+    pmovmskb    indexd, xmm5                              ;Z:     index = 0;  index |= ((b5[i] >> 7) << i);
+    mov         free_bitsd, [state+working_state.cur.free_bits]
+                                                          ;Z:     free_bits = state->cur.free_bits;
+    pcmpeqw     xmm1, xmm1                                ;Z:     b1[i] = 0xFF;
+    shl         index, 32                                 ;Z:     index <<= 32;
+    mov         put_buffer, [state+working_state.cur.put_buffer.simd]
+                                                          ;Z:     put_buffer = state->cur.put_buffer.simd;
+    or          index, tempq                              ;Z:     index |= temp;
+    not         index                                     ;Z:     index = ~index;
+    sub         free_bitsb, nbitsb                        ;Z:     if ((free_bits -= nbits) >= 0)
+    jnl         .ENTRY_SKIP_EMIT_CODE                     ;Z:       goto .ENTRY_SKIP_EMIT_CODE;
+    align       16
+.EMIT_CODE:                                               ;Z:     .EMIT_CODE:
+    EMIT_QWORD  .BLOOP_COND                               ;Z:     insert code, flush buffer, goto .BLOOP_COND
+
+; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+    align       16
+.BRLOOP:                                                  ; do {
+    lea         code_temp, [nbitsq - 16]                  ;   code_temp = nbits - 16;
+    movzx       nbits, byte [actbl + c_derived_tbl.ehufsi + 0xf0]
+                                                          ;   nbits = actbl->ehufsi[0xf0];
+    mov         code, [actbl + c_derived_tbl.ehufco + 0xf0 * 4]
+                                                          ;   code = actbl->ehufco[0xf0];
+    sub         free_bitsb, nbitsb                        ;   if ((free_bits -= nbits) <= 0)
+    jle         .EMIT_BRLOOP_CODE                         ;     goto .EMIT_BRLOOP_CODE;
+    shl         put_buffer, nbitsb                        ;   put_buffer <<= nbits;
+    mov         nbits, code_temp                          ;   nbits = code_temp;
+    or          put_buffer, codeq                         ;   put_buffer |= code;
+    cmp         nbits, 16                                 ;   if (nbits <= 16)
+    jle         .ERLOOP                                   ;     break;
+    jmp         .BRLOOP                                   ; } while (1);
+
+; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+    align       16
+    times 5     nop
+.ENTRY_SKIP_EMIT_CODE:                                    ; .ENTRY_SKIP_EMIT_CODE:
+    shl         put_buffer, nbitsb                        ; put_buffer <<= nbits;
+    or          put_buffer, codeq                         ; put_buffer |= code;
+.BLOOP_COND:                                              ; .BLOOP_COND:
+    test        index, index                              ; if (index != 0)
+    jz          .ELOOP                                    ; {
+.BLOOP:                                                   ;   do {
+    xor         nbits, nbits                              ;     nbits = 0;  /* kill tzcnt input dependency */
+    tzcnt       nbitsq, index                             ;     nbits = # of trailing 0 bits in index
+    inc         nbits                                     ;     ++nbits;
+    lea         t, [t + nbitsq * 2]                       ;     t = &t[nbits];
+    shr         index, nbitsb                             ;     index >>= nbits;
+.EMIT_BRLOOP_CODE_END:                                    ; .EMIT_BRLOOP_CODE_END:
+    cmp         nbits, 16                                 ;     if (nbits > 16)
+    jg          .BRLOOP                                   ;       goto .BRLOOP;
+.ERLOOP:                                                  ; .ERLOOP:
+    movsx       codeq, word [t]                           ;     code = *t;
+    lea         tempd, [nbitsq * 2]                       ;     temp = nbits * 2;
+    movzx       nbits, byte [NBITS(codeq)]                ;     nbits = JPEG_NBITS(code);
+    lea         tempd, [nbitsq + tempq * 8]               ;     temp = temp * 8 + nbits;
+    mov         code_temp, [actbl + c_derived_tbl.ehufco + (tempq - 16) * 4]
+                                                          ;     code_temp = actbl->ehufco[temp-16];
+    shl         code_temp, nbitsb                         ;     code_temp <<= nbits;
+    and         code, dword [MASK_BITS(nbitsq)]           ;     code &= (1 << nbits) - 1;
+    add         nbitsb, [actbl + c_derived_tbl.ehufsi + (tempq - 16)]
+                                                          ;     free_bits -= actbl->ehufsi[temp-16];
+    or          code, code_temp                           ;     code |= code_temp;
+    sub         free_bitsb, nbitsb                        ;     if ((free_bits -= nbits) <= 0)
+    jle         .EMIT_CODE                                ;       goto .EMIT_CODE;
+    shl         put_buffer, nbitsb                        ;     put_buffer <<= nbits;
+    or          put_buffer, codeq                         ;     put_buffer |= code;
+    test        index, index
+    jnz         .BLOOP                                    ;   } while (index != 0);
+.ELOOP:                                                   ; }  /* index != 0 */
+    sub         td, esp                                   ; t -= (WIN64: &t_[0], UNIX: &t_[64]);
+%ifdef WIN64
+    cmp         td, (DCTSIZE2 - 2) * SIZEOF_WORD          ; if (t != 62)
+%else
+    cmp         td, -2 * SIZEOF_WORD                      ; if (t != -2)
+%endif
+    je          .EFN                                      ; {
+    movzx       nbits, byte [actbl + c_derived_tbl.ehufsi + 0]
+                                                          ;   nbits = actbl->ehufsi[0];
+    mov         code, [actbl + c_derived_tbl.ehufco + 0]  ;   code = actbl->ehufco[0];
+    sub         free_bitsb, nbitsb                        ;   if ((free_bits -= nbits) <= 0)
+    jg          .EFN_SKIP_EMIT_CODE                       ;   {
+    EMIT_QWORD  .EFN                                      ;     insert code, flush buffer
+    align       16
+.EFN_SKIP_EMIT_CODE:                                      ;   } else {
+    shl         put_buffer, nbitsb                        ;     put_buffer <<= nbits;
+    or          put_buffer, codeq                         ;     put_buffer |= code;
+.EFN:                                                     ; } }
+    mov         [state + working_state.cur.put_buffer.simd], put_buffer
+                                                          ; state->cur.put_buffer.simd = put_buffer;
+    mov         byte [state + working_state.cur.free_bits], free_bitsb
+                                                          ; state->cur.free_bits = free_bits;
+%ifdef WIN64
+    sub         rsp, -DCTSIZE2 * SIZEOF_WORD
+    pop         r12
+    pop         rdi
+    pop         rsi
+    pop         rbp
+    pop         rbx
+%else
+    pop         r12
+    pop         rbp
+    pop         rbx
+%endif
+    ret
+
+; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+    align       16
+.EMIT_BRLOOP_CODE:
+    EMIT_QWORD  .EMIT_BRLOOP_CODE_END, { mov nbits, code_temp }
+                                                          ; insert code, flush buffer,
+                                                          ; nbits = code_temp, goto .EMIT_BRLOOP_CODE_END
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/x86_64/jcphuff-sse2.asm b/media/libjpeg/simd/x86_64/jcphuff-sse2.asm
new file mode 100644
index 0000000000..01b5c0235f
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jcphuff-sse2.asm
@@ -0,0 +1,639 @@
+;
+; jcphuff-sse2.asm - prepare data for progressive Huffman encoding
+; (64-bit SSE2)
+;
+; Copyright (C) 2016, 2018, Matthieu Darbois
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains an SSE2 implementation of data preparation for progressive
+; Huffman encoding.  See jcphuff.c for more details.
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        64
+
+; --------------------------------------------------------------------------
+; Macros to load data for jsimd_encode_mcu_AC_first_prepare_sse2() and
+; jsimd_encode_mcu_AC_refine_prepare_sse2()
+
+%macro LOAD16 0
+    pxor        N0, N0
+    pxor        N1, N1
+
+    mov         T0d, INT [LUT +  0*SIZEOF_INT]
+    mov         T1d, INT [LUT +  8*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 0
+    pinsrw      X1, word [BLOCK + T1 * 2], 0
+
+    mov         T0d, INT [LUT +  1*SIZEOF_INT]
+    mov         T1d, INT [LUT +  9*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 1
+    pinsrw      X1, word [BLOCK + T1 * 2], 1
+
+    mov         T0d, INT [LUT +  2*SIZEOF_INT]
+    mov         T1d, INT [LUT + 10*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 2
+    pinsrw      X1, word [BLOCK + T1 * 2], 2
+
+    mov         T0d, INT [LUT +  3*SIZEOF_INT]
+    mov         T1d, INT [LUT + 11*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 3
+    pinsrw      X1, word [BLOCK + T1 * 2], 3
+
+    mov         T0d, INT [LUT +  4*SIZEOF_INT]
+    mov         T1d, INT [LUT + 12*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 4
+    pinsrw      X1, word [BLOCK + T1 * 2], 4
+
+    mov         T0d, INT [LUT +  5*SIZEOF_INT]
+    mov         T1d, INT [LUT + 13*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 5
+    pinsrw      X1, word [BLOCK + T1 * 2], 5
+
+    mov         T0d, INT [LUT +  6*SIZEOF_INT]
+    mov         T1d, INT [LUT + 14*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 6
+    pinsrw      X1, word [BLOCK + T1 * 2], 6
+
+    mov         T0d, INT [LUT +  7*SIZEOF_INT]
+    mov         T1d, INT [LUT + 15*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 7
+    pinsrw      X1, word [BLOCK + T1 * 2], 7
+%endmacro
+
+%macro LOAD15 0
+    pxor        N0, N0
+    pxor        N1, N1
+    pxor        X1, X1
+
+    mov         T0d, INT [LUT +  0*SIZEOF_INT]
+    mov         T1d, INT [LUT +  8*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 0
+    pinsrw      X1, word [BLOCK + T1 * 2], 0
+
+    mov         T0d, INT [LUT +  1*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 1
+
+    mov         T0d, INT [LUT +  2*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 2
+
+    mov         T0d, INT [LUT +  3*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 3
+
+    mov         T0d, INT [LUT +  4*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 4
+
+    mov         T0d, INT [LUT +  5*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 5
+
+    mov         T0d, INT [LUT +  6*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 6
+
+    mov         T0d, INT [LUT +  7*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 7
+
+    cmp         LENEND, 2
+    jl          %%.ELOAD15
+    mov         T1d, INT [LUT +  9*SIZEOF_INT]
+    pinsrw      X1, word [BLOCK + T1 * 2], 1
+
+    cmp         LENEND, 3
+    jl          %%.ELOAD15
+    mov         T1d, INT [LUT + 10*SIZEOF_INT]
+    pinsrw      X1, word [BLOCK + T1 * 2], 2
+
+    cmp         LENEND, 4
+    jl          %%.ELOAD15
+    mov         T1d, INT [LUT + 11*SIZEOF_INT]
+    pinsrw      X1, word [BLOCK + T1 * 2], 3
+
+    cmp         LENEND, 5
+    jl          %%.ELOAD15
+    mov         T1d, INT [LUT + 12*SIZEOF_INT]
+    pinsrw      X1, word [BLOCK + T1 * 2], 4
+
+    cmp         LENEND, 6
+    jl          %%.ELOAD15
+    mov         T1d, INT [LUT + 13*SIZEOF_INT]
+    pinsrw      X1, word [BLOCK + T1 * 2], 5
+
+    cmp         LENEND, 7
+    jl          %%.ELOAD15
+    mov         T1d, INT [LUT + 14*SIZEOF_INT]
+    pinsrw      X1, word [BLOCK + T1 * 2], 6
+%%.ELOAD15:
+%endmacro
+
+%macro LOAD8 0
+    pxor        N0, N0
+
+    mov         T0d, INT [LUT +  0*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 0
+
+    mov         T0d, INT [LUT +  1*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 1
+
+    mov         T0d, INT [LUT +  2*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 2
+
+    mov         T0d, INT [LUT +  3*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 3
+
+    mov         T0d, INT [LUT +  4*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 4
+
+    mov         T0d, INT [LUT +  5*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 5
+
+    mov         T0d, INT [LUT +  6*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 6
+
+    mov         T0d, INT [LUT +  7*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 7
+%endmacro
+
+%macro LOAD7 0
+    pxor        N0, N0
+    pxor        X0, X0
+
+    mov         T1d, INT [LUT +  0*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T1 * 2], 0
+
+    cmp         LENEND, 2
+    jl          %%.ELOAD7
+    mov         T1d, INT [LUT +  1*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T1 * 2], 1
+
+    cmp         LENEND, 3
+    jl          %%.ELOAD7
+    mov         T1d, INT [LUT +  2*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T1 * 2], 2
+
+    cmp         LENEND, 4
+    jl          %%.ELOAD7
+    mov         T1d, INT [LUT +  3*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T1 * 2], 3
+
+    cmp         LENEND, 5
+    jl          %%.ELOAD7
+    mov         T1d, INT [LUT +  4*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T1 * 2], 4
+
+    cmp         LENEND, 6
+    jl          %%.ELOAD7
+    mov         T1d, INT [LUT +  5*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T1 * 2], 5
+
+    cmp         LENEND, 7
+    jl          %%.ELOAD7
+    mov         T1d, INT [LUT +  6*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T1 * 2], 6
+%%.ELOAD7:
+%endmacro
+
+%macro REDUCE0 0
+    movdqa      xmm0, XMMWORD [VALUES + ( 0*2)]
+    movdqa      xmm1, XMMWORD [VALUES + ( 8*2)]
+    movdqa      xmm2, XMMWORD [VALUES + (16*2)]
+    movdqa      xmm3, XMMWORD [VALUES + (24*2)]
+    movdqa      xmm4, XMMWORD [VALUES + (32*2)]
+    movdqa      xmm5, XMMWORD [VALUES + (40*2)]
+    movdqa      xmm6, XMMWORD [VALUES + (48*2)]
+    movdqa      xmm7, XMMWORD [VALUES + (56*2)]
+
+    pcmpeqw     xmm0, ZERO
+    pcmpeqw     xmm1, ZERO
+    pcmpeqw     xmm2, ZERO
+    pcmpeqw     xmm3, ZERO
+    pcmpeqw     xmm4, ZERO
+    pcmpeqw     xmm5, ZERO
+    pcmpeqw     xmm6, ZERO
+    pcmpeqw     xmm7, ZERO
+
+    packsswb    xmm0, xmm1
+    packsswb    xmm2, xmm3
+    packsswb    xmm4, xmm5
+    packsswb    xmm6, xmm7
+
+    pmovmskb    eax, xmm0
+    pmovmskb    ecx, xmm2
+    pmovmskb    edx, xmm4
+    pmovmskb    esi, xmm6
+
+    shl         rcx, 16
+    shl         rdx, 32
+    shl         rsi, 48
+
+    or          rax, rcx
+    or          rdx, rsi
+    or          rax, rdx
+
+    not         rax
+
+    mov         MMWORD [r15], rax
+%endmacro
+
+;
+; Prepare data for jsimd_encode_mcu_AC_first().
+;
+; GLOBAL(void)
+; jsimd_encode_mcu_AC_first_prepare_sse2(const JCOEF *block,
+;                                        const int *jpeg_natural_order_start,
+;                                        int Sl, int Al, JCOEF *values,
+;                                        size_t *zerobits)
+;
+; r10 = const JCOEF *block
+; r11 = const int *jpeg_natural_order_start
+; r12 = int Sl
+; r13 = int Al
+; r14 = JCOEF *values
+; r15 = size_t *zerobits
+
+%define ZERO    xmm9
+%define X0      xmm0
+%define X1      xmm1
+%define N0      xmm2
+%define N1      xmm3
+%define AL      xmm4
+%define K       eax
+%define LUT     r11
+%define T0      rcx
+%define T0d     ecx
+%define T1      rdx
+%define T1d     edx
+%define BLOCK   r10
+%define VALUES  r14
+%define LEN     r12d
+%define LENEND  r13d
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_encode_mcu_AC_first_prepare_sse2)
+
+EXTN(jsimd_encode_mcu_AC_first_prepare_sse2):
+    push        rbp
+    mov         rax, rsp                     ; rax = original rbp
+    sub         rsp, byte 4
+    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [rsp], rax
+    mov         rbp, rsp                     ; rbp = aligned rbp
+    lea         rsp, [rbp - 16]
+    collect_args 6
+
+    movdqa      XMMWORD [rbp - 16], ZERO
+
+    movd        AL, r13d
+    pxor        ZERO, ZERO
+    mov         K, LEN
+    mov         LENEND, LEN
+    and         K, -16
+    and         LENEND, 7
+    shr         K, 4
+    jz          .ELOOP16
+.BLOOP16:
+    LOAD16
+    pcmpgtw     N0, X0
+    pcmpgtw     N1, X1
+    paddw       X0, N0
+    paddw       X1, N1
+    pxor        X0, N0
+    pxor        X1, N1
+    psrlw       X0, AL
+    psrlw       X1, AL
+    pxor        N0, X0
+    pxor        N1, X1
+    movdqa      XMMWORD [VALUES + (0) * 2], X0
+    movdqa      XMMWORD [VALUES + (8) * 2], X1
+    movdqa      XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
+    movdqa      XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1
+    add         VALUES, 16*2
+    add         LUT, 16*SIZEOF_INT
+    dec         K
+    jnz         .BLOOP16
+    test        LEN, 15
+    je          .PADDING
+.ELOOP16:
+    test        LEN, 8
+    jz          .TRY7
+    test        LEN, 7
+    jz          .TRY8
+
+    LOAD15
+    pcmpgtw     N0, X0
+    pcmpgtw     N1, X1
+    paddw       X0, N0
+    paddw       X1, N1
+    pxor        X0, N0
+    pxor        X1, N1
+    psrlw       X0, AL
+    psrlw       X1, AL
+    pxor        N0, X0
+    pxor        N1, X1
+    movdqa      XMMWORD [VALUES + (0) * 2], X0
+    movdqa      XMMWORD [VALUES + (8) * 2], X1
+    movdqa      XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
+    movdqa      XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1
+    add         VALUES, 16*2
+    jmp         .PADDING
+.TRY8:
+    LOAD8
+    pcmpgtw     N0, X0
+    paddw       X0, N0
+    pxor        X0, N0
+    psrlw       X0, AL
+    pxor        N0, X0
+    movdqa      XMMWORD [VALUES + (0) * 2], X0
+    movdqa      XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
+    add         VALUES, 8*2
+    jmp         .PADDING
+.TRY7:
+    LOAD7
+    pcmpgtw     N0, X0
+    paddw       X0, N0
+    pxor        X0, N0
+    psrlw       X0, AL
+    pxor        N0, X0
+    movdqa      XMMWORD [VALUES + (0) * 2], X0
+    movdqa      XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
+    add         VALUES, 8*2
+.PADDING:
+    mov         K, LEN
+    add         K, 7
+    and         K, -8
+    shr         K, 3
+    sub         K, DCTSIZE2/8
+    jz          .EPADDING
+    align       16
+.ZEROLOOP:
+    movdqa      XMMWORD [VALUES + 0], ZERO
+    add         VALUES, 8*2
+    inc         K
+    jnz         .ZEROLOOP
+.EPADDING:
+    sub         VALUES, DCTSIZE2*2
+
+    REDUCE0
+
+    movdqa      ZERO, XMMWORD [rbp - 16]
+    uncollect_args 6
+    mov         rsp, rbp                ; rsp <- aligned rbp
+    pop         rsp                     ; rsp <- original rbp
+    pop         rbp
+    ret
+
+%undef ZERO
+%undef X0
+%undef X1
+%undef N0
+%undef N1
+%undef AL
+%undef K
+%undef LUT
+%undef T0
+%undef T0d
+%undef T1
+%undef T1d
+%undef BLOCK
+%undef VALUES
+%undef LEN
+%undef LENEND
+
+;
+; Prepare data for jsimd_encode_mcu_AC_refine().
+;
+; GLOBAL(int)
+; jsimd_encode_mcu_AC_refine_prepare_sse2(const JCOEF *block,
+;                                         const int *jpeg_natural_order_start,
+;                                         int Sl, int Al, JCOEF *absvalues,
+;                                         size_t *bits)
+;
+; r10 = const JCOEF *block
+; r11 = const int *jpeg_natural_order_start
+; r12 = int Sl
+; r13 = int Al
+; r14 = JCOEF *values
+; r15 = size_t *bits
+
+%define ZERO    xmm9
+%define ONE     xmm5
+%define X0      xmm0
+%define X1      xmm1
+%define N0      xmm2
+%define N1      xmm3
+%define AL      xmm4
+%define K       eax
+%define KK      r9d
+%define EOB     r8d
+%define SIGN    rdi
+%define LUT     r11
+%define T0      rcx
+%define T0d     ecx
+%define T1      rdx
+%define T1d     edx
+%define BLOCK   r10
+%define VALUES  r14
+%define LEN     r12d
+%define LENEND  r13d
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_encode_mcu_AC_refine_prepare_sse2)
+
+EXTN(jsimd_encode_mcu_AC_refine_prepare_sse2):
+    push        rbp
+    mov         rax, rsp                     ; rax = original rbp
+    sub         rsp, byte 4
+    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [rsp], rax
+    mov         rbp, rsp                     ; rbp = aligned rbp
+    lea         rsp, [rbp - 16]
+    collect_args 6
+
+    movdqa      XMMWORD [rbp - 16], ZERO
+
+    xor         SIGN, SIGN
+    xor         EOB, EOB
+    xor         KK, KK
+    movd        AL, r13d
+    pxor        ZERO, ZERO
+    pcmpeqw     ONE, ONE
+    psrlw       ONE, 15
+    mov         K, LEN
+    mov         LENEND, LEN
+    and         K, -16
+    and         LENEND, 7
+    shr         K, 4
+    jz          .ELOOPR16
+.BLOOPR16:
+    LOAD16
+    pcmpgtw     N0, X0
+    pcmpgtw     N1, X1
+    paddw       X0, N0
+    paddw       X1, N1
+    pxor        X0, N0
+    pxor        X1, N1
+    psrlw       X0, AL
+    psrlw       X1, AL
+    movdqa      XMMWORD [VALUES + (0) * 2], X0
+    movdqa      XMMWORD [VALUES + (8) * 2], X1
+    pcmpeqw     X0, ONE
+    pcmpeqw     X1, ONE
+    packsswb    N0, N1
+    packsswb    X0, X1
+    pmovmskb    T0d, N0                 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
+    pmovmskb    T1d, X0                 ; idx = _mm_movemask_epi8(x1);
+    shr         SIGN, 16                ; make room for sizebits
+    shl         T0, 48
+    or          SIGN, T0
+    bsr         T1d, T1d                ;  idx = 16 - (__builtin_clz(idx)>>1);
+    jz          .CONTINUER16            ; if (idx) {
+    mov         EOB, KK
+    add         EOB, T1d                ; EOB = k + idx;
+.CONTINUER16:
+    add         VALUES, 16*2
+    add         LUT, 16*SIZEOF_INT
+    add         KK, 16
+    dec         K
+    jnz         .BLOOPR16
+    test        LEN, 15
+    je          .PADDINGR
+.ELOOPR16:
+    test        LEN, 8
+    jz          .TRYR7
+    test        LEN, 7
+    jz          .TRYR8
+
+    LOAD15
+    pcmpgtw     N0, X0
+    pcmpgtw     N1, X1
+    paddw       X0, N0
+    paddw       X1, N1
+    pxor        X0, N0
+    pxor        X1, N1
+    psrlw       X0, AL
+    psrlw       X1, AL
+    movdqa      XMMWORD [VALUES + (0) * 2], X0
+    movdqa      XMMWORD [VALUES + (8) * 2], X1
+    pcmpeqw     X0, ONE
+    pcmpeqw     X1, ONE
+    packsswb    N0, N1
+    packsswb    X0, X1
+    pmovmskb    T0d, N0                 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
+    pmovmskb    T1d, X0                 ; idx = _mm_movemask_epi8(x1);
+    shr         SIGN, 16                ; make room for sizebits
+    shl         T0, 48
+    or          SIGN, T0
+    bsr         T1d, T1d                ;  idx = 16 - (__builtin_clz(idx)>>1);
+    jz          .CONTINUER15            ; if (idx) {
+    mov         EOB, KK
+    add         EOB, T1d                ; EOB = k + idx;
+.CONTINUER15:
+    add         VALUES, 16*2
+    jmp         .PADDINGR
+.TRYR8:
+    LOAD8
+
+    pcmpgtw     N0, X0
+    paddw       X0, N0
+    pxor        X0, N0
+    psrlw       X0, AL
+    movdqa      XMMWORD [VALUES + (0) * 2], X0
+    pcmpeqw     X0, ONE
+    packsswb    N0, ZERO
+    packsswb    X0, ZERO
+    pmovmskb    T0d, N0                 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
+    pmovmskb    T1d, X0                 ; idx = _mm_movemask_epi8(x1);
+    shr         SIGN, 8                 ; make room for sizebits
+    shl         T0, 56
+    or          SIGN, T0
+    bsr         T1d, T1d                ;  idx = 16 - (__builtin_clz(idx)>>1);
+    jz          .CONTINUER8             ; if (idx) {
+    mov         EOB, KK
+    add         EOB, T1d                ; EOB = k + idx;
+.CONTINUER8:
+    add         VALUES, 8*2
+    jmp         .PADDINGR
+.TRYR7:
+    LOAD7
+
+    pcmpgtw     N0, X0
+    paddw       X0, N0
+    pxor        X0, N0
+    psrlw       X0, AL
+    movdqa      XMMWORD [VALUES + (0) * 2], X0
+    pcmpeqw     X0, ONE
+    packsswb    N0, ZERO
+    packsswb    X0, ZERO
+    pmovmskb    T0d, N0                 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
+    pmovmskb    T1d, X0                 ; idx = _mm_movemask_epi8(x1);
+    shr         SIGN, 8                 ; make room for sizebits
+    shl         T0, 56
+    or          SIGN, T0
+    bsr         T1d, T1d                ; idx = 16 - (__builtin_clz(idx)>>1);
+    jz          .CONTINUER7             ; if (idx) {
+    mov         EOB, KK
+    add         EOB, T1d                ; EOB = k + idx;
+.CONTINUER7:
+    add         VALUES, 8*2
+.PADDINGR:
+    mov         K, LEN
+    add         K, 7
+    and         K, -8
+    shr         K, 3
+    sub         K, DCTSIZE2/8
+    jz          .EPADDINGR
+    align       16
+.ZEROLOOPR:
+    movdqa      XMMWORD [VALUES + 0], ZERO
+    shr         SIGN, 8
+    add         VALUES, 8*2
+    inc         K
+    jnz         .ZEROLOOPR
+.EPADDINGR:
+    not         SIGN
+    sub         VALUES, DCTSIZE2*2
+    mov         MMWORD [r15+SIZEOF_MMWORD], SIGN
+
+    REDUCE0
+
+    mov         eax, EOB
+    movdqa      ZERO, XMMWORD [rbp - 16]
+    uncollect_args 6
+    mov         rsp, rbp                ; rsp <- aligned rbp
+    pop         rsp                     ; rsp <- original rbp
+    pop         rbp
+    ret
+
+%undef ZERO
+%undef ONE
+%undef X0
+%undef X1
+%undef N0
+%undef N1
+%undef AL
+%undef K
+%undef KK
+%undef EOB
+%undef SIGN
+%undef LUT
+%undef T0
+%undef T0d
+%undef T1
+%undef T1d
+%undef BLOCK
+%undef VALUES
+%undef LEN
+%undef LENEND
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/x86_64/jcsample-avx2.asm b/media/libjpeg/simd/x86_64/jcsample-avx2.asm
new file mode 100644
index 0000000000..b32527aebe
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jcsample-avx2.asm
@@ -0,0 +1,367 @@
+;
+; jcsample.asm - downsampling (64-bit AVX2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        64
+;
+; Downsample pixel values of a single component.
+; This version handles the common case of 2:1 horizontal and 1:1 vertical,
+; without smoothing.
+;
+; GLOBAL(void)
+; jsimd_h2v1_downsample_avx2(JDIMENSION image_width, int max_v_samp_factor,
+;                            JDIMENSION v_samp_factor,
+;                            JDIMENSION width_in_blocks, JSAMPARRAY input_data,
+;                            JSAMPARRAY output_data);
+;
+
+; r10d = JDIMENSION image_width
+; r11 = int max_v_samp_factor
+; r12d = JDIMENSION v_samp_factor
+; r13d = JDIMENSION width_in_blocks
+; r14 = JSAMPARRAY input_data
+; r15 = JSAMPARRAY output_data
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v1_downsample_avx2)
+
+EXTN(jsimd_h2v1_downsample_avx2):
+    push        rbp
+    mov         rax, rsp
+    mov         rbp, rsp
+    collect_args 6
+
+    mov         ecx, r13d
+    shl         rcx, 3                  ; imul rcx,DCTSIZE (rcx = output_cols)
+    jz          near .return
+
+    mov         edx, r10d
+
+    ; -- expand_right_edge
+
+    push        rcx
+    shl         rcx, 1                  ; output_cols * 2
+    sub         rcx, rdx
+    jle         short .expand_end
+
+    mov         rax, r11
+    test        rax, rax
+    jle         short .expand_end
+
+    cld
+    mov         rsi, r14                ; input_data
+.expandloop:
+    push        rax
+    push        rcx
+
+    mov         rdip, JSAMPROW [rsi]
+    add         rdi, rdx
+    mov         al, JSAMPLE [rdi-1]
+
+    rep stosb
+
+    pop         rcx
+    pop         rax
+
+    add         rsi, byte SIZEOF_JSAMPROW
+    dec         rax
+    jg          short .expandloop
+
+.expand_end:
+    pop         rcx                     ; output_cols
+
+    ; -- h2v1_downsample
+
+    mov         eax, r12d               ; rowctr
+    test        eax, eax
+    jle         near .return
+
+    mov         rdx, 0x00010000         ; bias pattern
+    vmovd       xmm7, edx
+    vpshufd     xmm7, xmm7, 0x00        ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
+    vperm2i128  ymm7, ymm7, ymm7, 0     ; ymm7={xmm7, xmm7}
+    vpcmpeqw    ymm6, ymm6, ymm6
+    vpsrlw      ymm6, ymm6, BYTE_BIT    ; ymm6={0xFF 0x00 0xFF 0x00 ..}
+
+    mov         rsi, r14                ; input_data
+    mov         rdi, r15                ; output_data
+.rowloop:
+    push        rcx
+    push        rdi
+    push        rsi
+
+    mov         rsip, JSAMPROW [rsi]    ; inptr
+    mov         rdip, JSAMPROW [rdi]    ; outptr
+
+    cmp         rcx, byte SIZEOF_YMMWORD
+    jae         short .columnloop
+
+.columnloop_r24:
+    ; rcx can possibly be 8, 16, 24
+    cmp         rcx, 24
+    jne         .columnloop_r16
+    vmovdqu     ymm0, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+    vmovdqu     xmm1, XMMWORD [rsi+1*SIZEOF_YMMWORD]
+    mov         rcx, SIZEOF_YMMWORD
+    jmp         short .downsample
+
+.columnloop_r16:
+    cmp         rcx, 16
+    jne         .columnloop_r8
+    vmovdqu     ymm0, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+    vpxor       ymm1, ymm1, ymm1
+    mov         rcx, SIZEOF_YMMWORD
+    jmp         short .downsample
+
+.columnloop_r8:
+    vmovdqu     xmm0, XMMWORD[rsi+0*SIZEOF_YMMWORD]
+    vpxor       ymm1, ymm1, ymm1
+    mov         rcx, SIZEOF_YMMWORD
+    jmp         short .downsample
+
+.columnloop:
+    vmovdqu     ymm0, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+    vmovdqu     ymm1, YMMWORD [rsi+1*SIZEOF_YMMWORD]
+
+.downsample:
+    vpsrlw      ymm2, ymm0, BYTE_BIT
+    vpand       ymm0, ymm0, ymm6
+    vpsrlw      ymm3, ymm1, BYTE_BIT
+    vpand       ymm1, ymm1, ymm6
+
+    vpaddw      ymm0, ymm0, ymm2
+    vpaddw      ymm1, ymm1, ymm3
+    vpaddw      ymm0, ymm0, ymm7
+    vpaddw      ymm1, ymm1, ymm7
+    vpsrlw      ymm0, ymm0, 1
+    vpsrlw      ymm1, ymm1, 1
+
+    vpackuswb   ymm0, ymm0, ymm1
+    vpermq      ymm0, ymm0, 0xd8
+
+    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm0
+
+    sub         rcx, byte SIZEOF_YMMWORD    ; outcol
+    add         rsi, byte 2*SIZEOF_YMMWORD  ; inptr
+    add         rdi, byte 1*SIZEOF_YMMWORD  ; outptr
+    cmp         rcx, byte SIZEOF_YMMWORD
+    jae         short .columnloop
+    test        rcx, rcx
+    jnz         near .columnloop_r24
+
+    pop         rsi
+    pop         rdi
+    pop         rcx
+
+    add         rsi, byte SIZEOF_JSAMPROW  ; input_data
+    add         rdi, byte SIZEOF_JSAMPROW  ; output_data
+    dec         rax                        ; rowctr
+    jg          near .rowloop
+
+.return:
+    vzeroupper
+    uncollect_args 6
+    pop         rbp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Downsample pixel values of a single component.
+; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
+; without smoothing.
+;
+; GLOBAL(void)
+; jsimd_h2v2_downsample_avx2(JDIMENSION image_width, int max_v_samp_factor,
+;                            JDIMENSION v_samp_factor,
+;                            JDIMENSION width_in_blocks, JSAMPARRAY input_data,
+;                            JSAMPARRAY output_data);
+;
+
+; r10d = JDIMENSION image_width
+; r11 = int max_v_samp_factor
+; r12d = JDIMENSION v_samp_factor
+; r13d = JDIMENSION width_in_blocks
+; r14 = JSAMPARRAY input_data
+; r15 = JSAMPARRAY output_data
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v2_downsample_avx2)
+
+EXTN(jsimd_h2v2_downsample_avx2):
+    push        rbp
+    mov         rax, rsp
+    mov         rbp, rsp
+    collect_args 6
+
+    mov         ecx, r13d
+    shl         rcx, 3                  ; imul rcx,DCTSIZE (rcx = output_cols)
+    jz          near .return
+
+    mov         edx, r10d
+
+    ; -- expand_right_edge
+
+    push        rcx
+    shl         rcx, 1                  ; output_cols * 2
+    sub         rcx, rdx
+    jle         short .expand_end
+
+    mov         rax, r11
+    test        rax, rax
+    jle         short .expand_end
+
+    cld
+    mov         rsi, r14                ; input_data
+.expandloop:
+    push        rax
+    push        rcx
+
+    mov         rdip, JSAMPROW [rsi]
+    add         rdi, rdx
+    mov         al, JSAMPLE [rdi-1]
+
+    rep stosb
+
+    pop         rcx
+    pop         rax
+
+    add         rsi, byte SIZEOF_JSAMPROW
+    dec         rax
+    jg          short .expandloop
+
+.expand_end:
+    pop         rcx                     ; output_cols
+
+    ; -- h2v2_downsample
+
+    mov         eax, r12d               ; rowctr
+    test        rax, rax
+    jle         near .return
+
+    mov         rdx, 0x00020001         ; bias pattern
+    vmovd       xmm7, edx
+    vpcmpeqw    ymm6, ymm6, ymm6
+    vpshufd     xmm7, xmm7, 0x00        ; ymm7={1, 2, 1, 2, 1, 2, 1, 2}
+    vperm2i128  ymm7, ymm7, ymm7, 0
+    vpsrlw      ymm6, ymm6, BYTE_BIT    ; ymm6={0xFF 0x00 0xFF 0x00 ..}
+
+    mov         rsi, r14                ; input_data
+    mov         rdi, r15                ; output_data
+.rowloop:
+    push        rcx
+    push        rdi
+    push        rsi
+
+    mov         rdxp, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]  ; inptr0
+    mov         rsip, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]  ; inptr1
+    mov         rdip, JSAMPROW [rdi]                    ; outptr
+
+    cmp         rcx, byte SIZEOF_YMMWORD
+    jae         short .columnloop
+
+.columnloop_r24:
+    cmp         rcx, 24
+    jne         .columnloop_r16
+    vmovdqu     ymm0, YMMWORD [rdx+0*SIZEOF_YMMWORD]
+    vmovdqu     ymm1, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+    vmovdqu     xmm2, XMMWORD [rdx+1*SIZEOF_YMMWORD]
+    vmovdqu     xmm3, XMMWORD [rsi+1*SIZEOF_YMMWORD]
+    mov         rcx, SIZEOF_YMMWORD
+    jmp         short .downsample
+
+.columnloop_r16:
+    cmp         rcx, 16
+    jne         .columnloop_r8
+    vmovdqu     ymm0, YMMWORD [rdx+0*SIZEOF_YMMWORD]
+    vmovdqu     ymm1, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+    vpxor       ymm2, ymm2, ymm2
+    vpxor       ymm3, ymm3, ymm3
+    mov         rcx, SIZEOF_YMMWORD
+    jmp         short .downsample
+
+.columnloop_r8:
+    vmovdqu     xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
+    vmovdqu     xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+    vpxor       ymm2, ymm2, ymm2
+    vpxor       ymm3, ymm3, ymm3
+    mov         rcx, SIZEOF_YMMWORD
+    jmp         short .downsample
+
+.columnloop:
+    vmovdqu     ymm0, YMMWORD [rdx+0*SIZEOF_YMMWORD]
+    vmovdqu     ymm1, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+    vmovdqu     ymm2, YMMWORD [rdx+1*SIZEOF_YMMWORD]
+    vmovdqu     ymm3, YMMWORD [rsi+1*SIZEOF_YMMWORD]
+
+.downsample:
+    vpand       ymm4, ymm0, ymm6
+    vpsrlw      ymm0, ymm0, BYTE_BIT
+    vpand       ymm5, ymm1, ymm6
+    vpsrlw      ymm1, ymm1, BYTE_BIT
+    vpaddw      ymm0, ymm0, ymm4
+    vpaddw      ymm1, ymm1, ymm5
+
+    vpand       ymm4, ymm2, ymm6
+    vpsrlw      ymm2, ymm2, BYTE_BIT
+    vpand       ymm5, ymm3, ymm6
+    vpsrlw      ymm3, ymm3, BYTE_BIT
+    vpaddw      ymm2, ymm2, ymm4
+    vpaddw      ymm3, ymm3, ymm5
+
+    vpaddw      ymm0, ymm0, ymm1
+    vpaddw      ymm2, ymm2, ymm3
+    vpaddw      ymm0, ymm0, ymm7
+    vpaddw      ymm2, ymm2, ymm7
+    vpsrlw      ymm0, ymm0, 2
+    vpsrlw      ymm2, ymm2, 2
+
+    vpackuswb   ymm0, ymm0, ymm2
+    vpermq      ymm0, ymm0, 0xd8
+
+    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm0
+
+    sub         rcx, byte SIZEOF_YMMWORD    ; outcol
+    add         rdx, byte 2*SIZEOF_YMMWORD  ; inptr0
+    add         rsi, byte 2*SIZEOF_YMMWORD  ; inptr1
+    add         rdi, byte 1*SIZEOF_YMMWORD  ; outptr
+    cmp         rcx, byte SIZEOF_YMMWORD
+    jae         near .columnloop
+    test        rcx, rcx
+    jnz         near .columnloop_r24
+
+    pop         rsi
+    pop         rdi
+    pop         rcx
+
+    add         rsi, byte 2*SIZEOF_JSAMPROW  ; input_data
+    add         rdi, byte 1*SIZEOF_JSAMPROW  ; output_data
+    dec         rax                          ; rowctr
+    jg          near .rowloop
+
+.return:
+    vzeroupper
+    uncollect_args 6
+    pop         rbp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/x86_64/jcsample-sse2.asm b/media/libjpeg/simd/x86_64/jcsample-sse2.asm
new file mode 100644
index 0000000000..2fcfe4567a
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jcsample-sse2.asm
@@ -0,0 +1,330 @@
+;
+; jcsample.asm - downsampling (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        64
+;
+; Downsample pixel values of a single component.
+; This version handles the common case of 2:1 horizontal and 1:1 vertical,
+; without smoothing.
+;
+; GLOBAL(void)
+; jsimd_h2v1_downsample_sse2(JDIMENSION image_width, int max_v_samp_factor,
+;                            JDIMENSION v_samp_factor,
+;                            JDIMENSION width_in_blocks, JSAMPARRAY input_data,
+;                            JSAMPARRAY output_data);
+;
+
+; r10d = JDIMENSION image_width
+; r11 = int max_v_samp_factor
+; r12d = JDIMENSION v_samp_factor
+; r13d = JDIMENSION width_in_blocks
+; r14 = JSAMPARRAY input_data
+; r15 = JSAMPARRAY output_data
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v1_downsample_sse2)
+
+EXTN(jsimd_h2v1_downsample_sse2):
+    push        rbp
+    mov         rax, rsp
+    mov         rbp, rsp
+    collect_args 6
+
+    mov         ecx, r13d
+    shl         rcx, 3                  ; imul rcx,DCTSIZE (rcx = output_cols)
+    jz          near .return
+
+    mov         edx, r10d
+
+    ; -- expand_right_edge
+
+    push        rcx
+    shl         rcx, 1                  ; output_cols * 2
+    sub         rcx, rdx
+    jle         short .expand_end
+
+    mov         rax, r11
+    test        rax, rax
+    jle         short .expand_end
+
+    cld
+    mov         rsi, r14                ; input_data
+.expandloop:
+    push        rax
+    push        rcx
+
+    mov         rdip, JSAMPROW [rsi]
+    add         rdi, rdx
+    mov         al, JSAMPLE [rdi-1]
+
+    rep stosb
+
+    pop         rcx
+    pop         rax
+
+    add         rsi, byte SIZEOF_JSAMPROW
+    dec         rax
+    jg          short .expandloop
+
+.expand_end:
+    pop         rcx                     ; output_cols
+
+    ; -- h2v1_downsample
+
+    mov         eax, r12d               ; rowctr
+    test        eax, eax
+    jle         near .return
+
+    mov         rdx, 0x00010000         ; bias pattern
+    movd        xmm7, edx
+    pcmpeqw     xmm6, xmm6
+    pshufd      xmm7, xmm7, 0x00        ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
+    psrlw       xmm6, BYTE_BIT          ; xmm6={0xFF 0x00 0xFF 0x00 ..}
+
+    mov         rsi, r14                ; input_data
+    mov         rdi, r15                ; output_data
+.rowloop:
+    push        rcx
+    push        rdi
+    push        rsi
+
+    mov         rsip, JSAMPROW [rsi]    ; inptr
+    mov         rdip, JSAMPROW [rdi]    ; outptr
+
+    cmp         rcx, byte SIZEOF_XMMWORD
+    jae         short .columnloop
+
+.columnloop_r8:
+    movdqa      xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+    pxor        xmm1, xmm1
+    mov         rcx, SIZEOF_XMMWORD
+    jmp         short .downsample
+
+.columnloop:
+    movdqa      xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+    movdqa      xmm1, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+
+.downsample:
+    movdqa      xmm2, xmm0
+    movdqa      xmm3, xmm1
+
+    pand        xmm0, xmm6
+    psrlw       xmm2, BYTE_BIT
+    pand        xmm1, xmm6
+    psrlw       xmm3, BYTE_BIT
+
+    paddw       xmm0, xmm2
+    paddw       xmm1, xmm3
+    paddw       xmm0, xmm7
+    paddw       xmm1, xmm7
+    psrlw       xmm0, 1
+    psrlw       xmm1, 1
+
+    packuswb    xmm0, xmm1
+
+    movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
+
+    sub         rcx, byte SIZEOF_XMMWORD    ; outcol
+    add         rsi, byte 2*SIZEOF_XMMWORD  ; inptr
+    add         rdi, byte 1*SIZEOF_XMMWORD  ; outptr
+    cmp         rcx, byte SIZEOF_XMMWORD
+    jae         short .columnloop
+    test        rcx, rcx
+    jnz         short .columnloop_r8
+
+    pop         rsi
+    pop         rdi
+    pop         rcx
+
+    add         rsi, byte SIZEOF_JSAMPROW  ; input_data
+    add         rdi, byte SIZEOF_JSAMPROW  ; output_data
+    dec         rax                        ; rowctr
+    jg          near .rowloop
+
+.return:
+    uncollect_args 6
+    pop         rbp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Downsample pixel values of a single component.
+; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
+; without smoothing.
+;
+; GLOBAL(void)
+; jsimd_h2v2_downsample_sse2(JDIMENSION image_width, int max_v_samp_factor,
+;                            JDIMENSION v_samp_factor,
+;                            JDIMENSION width_in_blocks, JSAMPARRAY input_data,
+;                            JSAMPARRAY output_data);
+;
+
+; r10d = JDIMENSION image_width
+; r11 = int max_v_samp_factor
+; r12d = JDIMENSION v_samp_factor
+; r13d = JDIMENSION width_in_blocks
+; r14 = JSAMPARRAY input_data
+; r15 = JSAMPARRAY output_data
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v2_downsample_sse2)
+
+EXTN(jsimd_h2v2_downsample_sse2):
+    push        rbp
+    mov         rax, rsp
+    mov         rbp, rsp
+    collect_args 6
+
+    mov         ecx, r13d
+    shl         rcx, 3                  ; imul rcx,DCTSIZE (rcx = output_cols)
+    jz          near .return
+
+    mov         edx, r10d
+
+    ; -- expand_right_edge
+
+    push        rcx
+    shl         rcx, 1                  ; output_cols * 2
+    sub         rcx, rdx
+    jle         short .expand_end
+
+    mov         rax, r11
+    test        rax, rax
+    jle         short .expand_end
+
+    cld
+    mov         rsi, r14                ; input_data
+.expandloop:
+    push        rax
+    push        rcx
+
+    mov         rdip, JSAMPROW [rsi]
+    add         rdi, rdx
+    mov         al, JSAMPLE [rdi-1]
+
+    rep stosb
+
+    pop         rcx
+    pop         rax
+
+    add         rsi, byte SIZEOF_JSAMPROW
+    dec         rax
+    jg          short .expandloop
+
+.expand_end:
+    pop         rcx                     ; output_cols
+
+    ; -- h2v2_downsample
+
+    mov         eax, r12d               ; rowctr
+    test        rax, rax
+    jle         near .return
+
+    mov         rdx, 0x00020001         ; bias pattern
+    movd        xmm7, edx
+    pcmpeqw     xmm6, xmm6
+    pshufd      xmm7, xmm7, 0x00        ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
+    psrlw       xmm6, BYTE_BIT          ; xmm6={0xFF 0x00 0xFF 0x00 ..}
+
+    mov         rsi, r14                ; input_data
+    mov         rdi, r15                ; output_data
+.rowloop:
+    push        rcx
+    push        rdi
+    push        rsi
+
+    mov         rdxp, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]  ; inptr0
+    mov         rsip, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]  ; inptr1
+    mov         rdip, JSAMPROW [rdi]                    ; outptr
+
+    cmp         rcx, byte SIZEOF_XMMWORD
+    jae         short .columnloop
+
+.columnloop_r8:
+    movdqa      xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
+    movdqa      xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+    pxor        xmm2, xmm2
+    pxor        xmm3, xmm3
+    mov         rcx, SIZEOF_XMMWORD
+    jmp         short .downsample
+
+.columnloop:
+    movdqa      xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
+    movdqa      xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+    movdqa      xmm2, XMMWORD [rdx+1*SIZEOF_XMMWORD]
+    movdqa      xmm3, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+
+.downsample:
+    movdqa      xmm4, xmm0
+    movdqa      xmm5, xmm1
+    pand        xmm0, xmm6
+    psrlw       xmm4, BYTE_BIT
+    pand        xmm1, xmm6
+    psrlw       xmm5, BYTE_BIT
+    paddw       xmm0, xmm4
+    paddw       xmm1, xmm5
+
+    movdqa      xmm4, xmm2
+    movdqa      xmm5, xmm3
+    pand        xmm2, xmm6
+    psrlw       xmm4, BYTE_BIT
+    pand        xmm3, xmm6
+    psrlw       xmm5, BYTE_BIT
+    paddw       xmm2, xmm4
+    paddw       xmm3, xmm5
+
+    paddw       xmm0, xmm1
+    paddw       xmm2, xmm3
+    paddw       xmm0, xmm7
+    paddw       xmm2, xmm7
+    psrlw       xmm0, 2
+    psrlw       xmm2, 2
+
+    packuswb    xmm0, xmm2
+
+    movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
+
+    sub         rcx, byte SIZEOF_XMMWORD    ; outcol
+    add         rdx, byte 2*SIZEOF_XMMWORD  ; inptr0
+    add         rsi, byte 2*SIZEOF_XMMWORD  ; inptr1
+    add         rdi, byte 1*SIZEOF_XMMWORD  ; outptr
+    cmp         rcx, byte SIZEOF_XMMWORD
+    jae         near .columnloop
+    test        rcx, rcx
+    jnz         near .columnloop_r8
+
+    pop         rsi
+    pop         rdi
+    pop         rcx
+
+    add         rsi, byte 2*SIZEOF_JSAMPROW  ; input_data
+    add         rdi, byte 1*SIZEOF_JSAMPROW  ; output_data
+    dec         rax                          ; rowctr
+    jg          near .rowloop
+
+.return:
+    uncollect_args 6
+    pop         rbp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/x86_64/jdcolext-avx2.asm b/media/libjpeg/simd/x86_64/jdcolext-avx2.asm
new file mode 100644
index 0000000000..2370fda642
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jdcolext-avx2.asm
@@ -0,0 +1,496 @@
+;
+; jdcolext.asm - colorspace conversion (64-bit AVX2)
+;
+; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2012, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_ycc_rgb_convert_avx2(JDIMENSION out_width, JSAMPIMAGE input_buf,
+;                            JDIMENSION input_row, JSAMPARRAY output_buf,
+;                            int num_rows)
+;
+
+; r10d = JDIMENSION out_width
+; r11 = JSAMPIMAGE input_buf
+; r12d = JDIMENSION input_row
+; r13 = JSAMPARRAY output_buf
+; r14d = int num_rows
+
+%define wk(i)   rbp - (WK_NUM - (i)) * SIZEOF_YMMWORD  ; ymmword wk[WK_NUM]
+%define WK_NUM  2
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_ycc_rgb_convert_avx2)
+
+EXTN(jsimd_ycc_rgb_convert_avx2):
+    push        rbp
+    mov         rax, rsp                     ; rax = original rbp
+    sub         rsp, byte 4
+    and         rsp, byte (-SIZEOF_YMMWORD)  ; align to 256 bits
+    mov         [rsp], rax
+    mov         rbp, rsp                     ; rbp = aligned rbp
+    lea         rsp, [wk(0)]
+    collect_args 5
+    push        rbx
+
+    mov         ecx, r10d               ; num_cols
+    test        rcx, rcx
+    jz          near .return
+
+    push        rcx
+
+    mov         rdi, r11
+    mov         ecx, r12d
+    mov         rsip, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
+    mov         rbxp, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
+    mov         rdxp, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
+    lea         rsi, [rsi+rcx*SIZEOF_JSAMPROW]
+    lea         rbx, [rbx+rcx*SIZEOF_JSAMPROW]
+    lea         rdx, [rdx+rcx*SIZEOF_JSAMPROW]
+
+    pop         rcx
+
+    mov         rdi, r13
+    mov         eax, r14d
+    test        rax, rax
+    jle         near .return
+.rowloop:
+    push        rax
+    push        rdi
+    push        rdx
+    push        rbx
+    push        rsi
+    push        rcx                     ; col
+
+    mov         rsip, JSAMPROW [rsi]    ; inptr0
+    mov         rbxp, JSAMPROW [rbx]    ; inptr1
+    mov         rdxp, JSAMPROW [rdx]    ; inptr2
+    mov         rdip, JSAMPROW [rdi]    ; outptr
+.columnloop:
+
+    vmovdqu     ymm5, YMMWORD [rbx]     ; ymm5=Cb(0123456789ABCDEFGHIJKLMNOPQRSTUV)
+    vmovdqu     ymm1, YMMWORD [rdx]     ; ymm1=Cr(0123456789ABCDEFGHIJKLMNOPQRSTUV)
+
+    vpcmpeqw    ymm0, ymm0, ymm0
+    vpcmpeqw    ymm7, ymm7, ymm7
+    vpsrlw      ymm0, ymm0, BYTE_BIT    ; ymm0={0xFF 0x00 0xFF 0x00 ..}
+    vpsllw      ymm7, ymm7, 7           ; ymm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+
+    vpand       ymm4, ymm0, ymm5        ; ymm4=Cb(02468ACEGIKMOQSU)=CbE
+    vpsrlw      ymm5, ymm5, BYTE_BIT    ; ymm5=Cb(13579BDFHJLNPRTV)=CbO
+    vpand       ymm0, ymm0, ymm1        ; ymm0=Cr(02468ACEGIKMOQSU)=CrE
+    vpsrlw      ymm1, ymm1, BYTE_BIT    ; ymm1=Cr(13579BDFHJLNPRTV)=CrO
+
+    vpaddw      ymm2, ymm4, ymm7
+    vpaddw      ymm3, ymm5, ymm7
+    vpaddw      ymm6, ymm0, ymm7
+    vpaddw      ymm7, ymm1, ymm7
+
+    ; (Original)
+    ; R = Y                + 1.40200 * Cr
+    ; G = Y - 0.34414 * Cb - 0.71414 * Cr
+    ; B = Y + 1.77200 * Cb
+    ;
+    ; (This implementation)
+    ; R = Y                + 0.40200 * Cr + Cr
+    ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+    ; B = Y - 0.22800 * Cb + Cb + Cb
+
+    vpaddw      ymm4, ymm2, ymm2             ; ymm4=2*CbE
+    vpaddw      ymm5, ymm3, ymm3             ; ymm5=2*CbO
+    vpaddw      ymm0, ymm6, ymm6             ; ymm0=2*CrE
+    vpaddw      ymm1, ymm7, ymm7             ; ymm1=2*CrO
+
+    vpmulhw     ymm4, ymm4, [rel PW_MF0228]  ; ymm4=(2*CbE * -FIX(0.22800))
+    vpmulhw     ymm5, ymm5, [rel PW_MF0228]  ; ymm5=(2*CbO * -FIX(0.22800))
+    vpmulhw     ymm0, ymm0, [rel PW_F0402]   ; ymm0=(2*CrE * FIX(0.40200))
+    vpmulhw     ymm1, ymm1, [rel PW_F0402]   ; ymm1=(2*CrO * FIX(0.40200))
+
+    vpaddw      ymm4, ymm4, [rel PW_ONE]
+    vpaddw      ymm5, ymm5, [rel PW_ONE]
+    vpsraw      ymm4, ymm4, 1                ; ymm4=(CbE * -FIX(0.22800))
+    vpsraw      ymm5, ymm5, 1                ; ymm5=(CbO * -FIX(0.22800))
+    vpaddw      ymm0, ymm0, [rel PW_ONE]
+    vpaddw      ymm1, ymm1, [rel PW_ONE]
+    vpsraw      ymm0, ymm0, 1                ; ymm0=(CrE * FIX(0.40200))
+    vpsraw      ymm1, ymm1, 1                ; ymm1=(CrO * FIX(0.40200))
+
+    vpaddw      ymm4, ymm4, ymm2
+    vpaddw      ymm5, ymm5, ymm3
+    vpaddw      ymm4, ymm4, ymm2             ; ymm4=(CbE * FIX(1.77200))=(B-Y)E
+    vpaddw      ymm5, ymm5, ymm3             ; ymm5=(CbO * FIX(1.77200))=(B-Y)O
+    vpaddw      ymm0, ymm0, ymm6             ; ymm0=(CrE * FIX(1.40200))=(R-Y)E
+    vpaddw      ymm1, ymm1, ymm7             ; ymm1=(CrO * FIX(1.40200))=(R-Y)O
+
+    vmovdqa     YMMWORD [wk(0)], ymm4        ; wk(0)=(B-Y)E
+    vmovdqa     YMMWORD [wk(1)], ymm5        ; wk(1)=(B-Y)O
+
+    vpunpckhwd  ymm4, ymm2, ymm6
+    vpunpcklwd  ymm2, ymm2, ymm6
+    vpmaddwd    ymm2, ymm2, [rel PW_MF0344_F0285]
+    vpmaddwd    ymm4, ymm4, [rel PW_MF0344_F0285]
+    vpunpckhwd  ymm5, ymm3, ymm7
+    vpunpcklwd  ymm3, ymm3, ymm7
+    vpmaddwd    ymm3, ymm3, [rel PW_MF0344_F0285]
+    vpmaddwd    ymm5, ymm5, [rel PW_MF0344_F0285]
+
+    vpaddd      ymm2, ymm2, [rel PD_ONEHALF]
+    vpaddd      ymm4, ymm4, [rel PD_ONEHALF]
+    vpsrad      ymm2, ymm2, SCALEBITS
+    vpsrad      ymm4, ymm4, SCALEBITS
+    vpaddd      ymm3, ymm3, [rel PD_ONEHALF]
+    vpaddd      ymm5, ymm5, [rel PD_ONEHALF]
+    vpsrad      ymm3, ymm3, SCALEBITS
+    vpsrad      ymm5, ymm5, SCALEBITS
+
+    vpackssdw   ymm2, ymm2, ymm4             ; ymm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
+    vpackssdw   ymm3, ymm3, ymm5             ; ymm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
+    vpsubw      ymm2, ymm2, ymm6             ; ymm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
+    vpsubw      ymm3, ymm3, ymm7             ; ymm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O
+
+    vmovdqu     ymm5, YMMWORD [rsi]          ; ymm5=Y(0123456789ABCDEFGHIJKLMNOPQRSTUV)
+
+    vpcmpeqw    ymm4, ymm4, ymm4
+    vpsrlw      ymm4, ymm4, BYTE_BIT         ; ymm4={0xFF 0x00 0xFF 0x00 ..}
+    vpand       ymm4, ymm4, ymm5             ; ymm4=Y(02468ACEGIKMOQSU)=YE
+    vpsrlw      ymm5, ymm5, BYTE_BIT         ; ymm5=Y(13579BDFHJLNPRTV)=YO
+
+    vpaddw      ymm0, ymm0, ymm4             ; ymm0=((R-Y)E+YE)=RE=R(02468ACEGIKMOQSU)
+    vpaddw      ymm1, ymm1, ymm5             ; ymm1=((R-Y)O+YO)=RO=R(13579BDFHJLNPRTV)
+    vpackuswb   ymm0, ymm0, ymm0             ; ymm0=R(02468ACE********GIKMOQSU********)
+    vpackuswb   ymm1, ymm1, ymm1             ; ymm1=R(13579BDF********HJLNPRTV********)
+
+    vpaddw      ymm2, ymm2, ymm4             ; ymm2=((G-Y)E+YE)=GE=G(02468ACEGIKMOQSU)
+    vpaddw      ymm3, ymm3, ymm5             ; ymm3=((G-Y)O+YO)=GO=G(13579BDFHJLNPRTV)
+    vpackuswb   ymm2, ymm2, ymm2             ; ymm2=G(02468ACE********GIKMOQSU********)
+    vpackuswb   ymm3, ymm3, ymm3             ; ymm3=G(13579BDF********HJLNPRTV********)
+
+    vpaddw      ymm4, ymm4, YMMWORD [wk(0)]  ; ymm4=(YE+(B-Y)E)=BE=B(02468ACEGIKMOQSU)
+    vpaddw      ymm5, ymm5, YMMWORD [wk(1)]  ; ymm5=(YO+(B-Y)O)=BO=B(13579BDFHJLNPRTV)
+    vpackuswb   ymm4, ymm4, ymm4             ; ymm4=B(02468ACE********GIKMOQSU********)
+    vpackuswb   ymm5, ymm5, ymm5             ; ymm5=B(13579BDF********HJLNPRTV********)
+
+%if RGB_PIXELSIZE == 3  ; ---------------
+
+    ; ymmA=(00 02 04 06 08 0A 0C 0E ** 0G 0I 0K 0M 0O 0Q 0S 0U **)
+    ; ymmB=(01 03 05 07 09 0B 0D 0F ** 0H 0J 0L 0N 0P 0R 0T 0V **)
+    ; ymmC=(10 12 14 16 18 1A 1C 1E ** 1G 1I 1K 1M 1O 1Q 1S 1U **)
+    ; ymmD=(11 13 15 17 19 1B 1D 1F ** 1H 1J 1L 1N 1P 1R 1T 1V **)
+    ; ymmE=(20 22 24 26 28 2A 2C 2E ** 2G 2I 2K 2M 2O 2Q 2S 2U **)
+    ; ymmF=(21 23 25 27 29 2B 2D 2F ** 2H 2J 2L 2N 2P 2R 2T 2V **)
+    ; ymmG=(** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **)
+    ; ymmH=(** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **)
+
+    vpunpcklbw  ymmA, ymmA, ymmC        ; ymmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E
+                                        ;       0G 1G 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U)
+    vpunpcklbw  ymmE, ymmE, ymmB        ; ymmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F
+                                        ;       2G 0H 2I 0J 2K 0L 2M 0N 2O 0P 2Q 0R 2S 0T 2U 0V)
+    vpunpcklbw  ymmD, ymmD, ymmF        ; ymmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F
+                                        ;       1H 2H 1J 2J 1L 2L 1N 2N 1P 2P 1R 2R 1T 2T 1V 2V)
+
+    vpsrldq     ymmH, ymmA, 2           ; ymmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E 0G 1G
+                                        ;       0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U -- --)
+    vpunpckhwd  ymmG, ymmA, ymmE        ; ymmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F
+                                        ;       0O 1O 2O 0P 0Q 1Q 2Q 0R 0S 1S 2S 0T 0U 1U 2U 0V)
+    vpunpcklwd  ymmA, ymmA, ymmE        ; ymmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07
+                                        ;       0G 1G 2G 0H 0I 1I 2I 0J 0K 1K 2K 0L 0M 1M 2M 0N)
+
+    vpsrldq     ymmE, ymmE, 2           ; ymmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F 2G 0H
+                                        ;       2I 0J 2K 0L 2M 0N 2O 0P 2Q 0R 2S 0T 2U 0V -- --)
+
+    vpsrldq     ymmB, ymmD, 2           ; ymmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F 1H 2H
+                                        ;       1J 2J 1L 2L 1N 2N 1P 2P 1R 2R 1T 2T 1V 2V -- --)
+    vpunpckhwd  ymmC, ymmD, ymmH        ; ymmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F 0G 1G
+                                        ;       1P 2P 0Q 1Q 1R 2R 0S 1S 1T 2T 0U 1U 1V 2V -- --)
+    vpunpcklwd  ymmD, ymmD, ymmH        ; ymmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18
+                                        ;       1H 2H 0I 1I 1J 2J 0K 1K 1L 2L 0M 1M 1N 2N 0O 1O)
+
+    vpunpckhwd  ymmF, ymmE, ymmB        ; ymmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F 2G 0H 1H 2H
+                                        ;       2Q 0R 1R 2R 2S 0T 1T 2T 2U 0V 1V 2V -- -- -- --)
+    vpunpcklwd  ymmE, ymmE, ymmB        ; ymmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29
+                                        ;       2I 0J 1J 2J 2K 0L 1L 2L 2M 0N 1N 2N 2O 0P 1P 2P)
+
+    vpshufd     ymmH, ymmA, 0x4E        ; ymmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03
+                                        ;       0K 1K 2K 0L 0M 1M 2M 0N 0G 1G 2G 0H 0I 1I 2I 0J)
+    vpunpckldq  ymmA, ymmA, ymmD        ; ymmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14
+                                        ;       0G 1G 2G 0H 1H 2H 0I 1I 0I 1I 2I 0J 1J 2J 0K 1K)
+    vpunpckhdq  ymmD, ymmD, ymmE        ; ymmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29
+                                        ;       1L 2L 0M 1M 2M 0N 1N 2N 1N 2N 0O 1O 2O 0P 1P 2P)
+    vpunpckldq  ymmE, ymmE, ymmH        ; ymmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07
+                                        ;       2I 0J 1J 2J 0K 1K 2K 0L 2K 0L 1L 2L 0M 1M 2M 0N)
+
+    vpshufd     ymmH, ymmG, 0x4E        ; ymmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B
+                                        ;       0S 1S 2S 0T 0U 1U 2U 0V 0O 1O 2O 0P 0Q 1Q 2Q 0R)
+    vpunpckldq  ymmG, ymmG, ymmC        ; ymmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C
+                                        ;       0O 1O 2O 0P 1P 2P 0Q 1Q 0Q 1Q 2Q 0R 1R 2R 0S 1S)
+    vpunpckhdq  ymmC, ymmC, ymmF        ; ymmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F 0G 1G 2G 0H 1H 2H
+                                        ;       1T 2T 0U 1U 2U 0V 1V 2V 1V 2V -- -- -- -- -- --)
+    vpunpckldq  ymmF, ymmF, ymmH        ; ymmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F
+                                        ;       2Q 0R 1R 2R 0S 1S 2S 0T 2S 0T 1T 2T 0U 1U 2U 0V)
+
+    vpunpcklqdq ymmH, ymmA, ymmE        ; ymmH=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+                                        ;       0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+    vpunpcklqdq ymmG, ymmD, ymmG        ; ymmG=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A
+                                        ;       1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q)
+    vpunpcklqdq ymmC, ymmF, ymmC        ; ymmC=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+                                        ;       2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+
+    vperm2i128  ymmA, ymmH, ymmG, 0x20  ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+                                        ;       15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+    vperm2i128  ymmD, ymmC, ymmH, 0x30  ; ymmD=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+                                        ;       0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+    vperm2i128  ymmF, ymmG, ymmC, 0x31  ; ymmF=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
+                                        ;       2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+
+    cmp         rcx, byte SIZEOF_YMMWORD
+    jb          short .column_st64
+
+    test        rdi, SIZEOF_YMMWORD-1
+    jnz         short .out1
+    ; --(aligned)-------------------
+    vmovntdq    YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+    vmovntdq    YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
+    vmovntdq    YMMWORD [rdi+2*SIZEOF_YMMWORD], ymmF
+    jmp         short .out0
+.out1:  ; --(unaligned)-----------------
+    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+    vmovdqu     YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
+    vmovdqu     YMMWORD [rdi+2*SIZEOF_YMMWORD], ymmF
+.out0:
+    add         rdi, byte RGB_PIXELSIZE*SIZEOF_YMMWORD  ; outptr
+    sub         rcx, byte SIZEOF_YMMWORD
+    jz          near .nextrow
+
+    add         rsi, byte SIZEOF_YMMWORD  ; inptr0
+    add         rbx, byte SIZEOF_YMMWORD  ; inptr1
+    add         rdx, byte SIZEOF_YMMWORD  ; inptr2
+    jmp         near .columnloop
+
+.column_st64:
+    lea         rcx, [rcx+rcx*2]            ; imul ecx, RGB_PIXELSIZE
+    cmp         rcx, byte 2*SIZEOF_YMMWORD
+    jb          short .column_st32
+    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+    vmovdqu     YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
+    add         rdi, byte 2*SIZEOF_YMMWORD  ; outptr
+    vmovdqa     ymmA, ymmF
+    sub         rcx, byte 2*SIZEOF_YMMWORD
+    jmp         short .column_st31
+.column_st32:
+    cmp         rcx, byte SIZEOF_YMMWORD
+    jb          short .column_st31
+    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+    add         rdi, byte SIZEOF_YMMWORD    ; outptr
+    vmovdqa     ymmA, ymmD
+    sub         rcx, byte SIZEOF_YMMWORD
+    jmp         short .column_st31
+.column_st31:
+    cmp         rcx, byte SIZEOF_XMMWORD
+    jb          short .column_st15
+    vmovdqu     XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+    add         rdi, byte SIZEOF_XMMWORD    ; outptr
+    vperm2i128  ymmA, ymmA, ymmA, 1
+    sub         rcx, byte SIZEOF_XMMWORD
+.column_st15:
+    ; Store the lower 8 bytes of xmmA to the output when it has enough
+    ; space.
+    cmp         rcx, byte SIZEOF_MMWORD
+    jb          short .column_st7
+    vmovq       XMM_MMWORD [rdi], xmmA
+    add         rdi, byte SIZEOF_MMWORD
+    sub         rcx, byte SIZEOF_MMWORD
+    vpsrldq     xmmA, xmmA, SIZEOF_MMWORD
+.column_st7:
+    ; Store the lower 4 bytes of xmmA to the output when it has enough
+    ; space.
+    cmp         rcx, byte SIZEOF_DWORD
+    jb          short .column_st3
+    vmovd       XMM_DWORD [rdi], xmmA
+    add         rdi, byte SIZEOF_DWORD
+    sub         rcx, byte SIZEOF_DWORD
+    vpsrldq     xmmA, xmmA, SIZEOF_DWORD
+.column_st3:
+    ; Store the lower 2 bytes of rax to the output when it has enough
+    ; space.
+    vmovd       eax, xmmA
+    cmp         rcx, byte SIZEOF_WORD
+    jb          short .column_st1
+    mov         word [rdi], ax
+    add         rdi, byte SIZEOF_WORD
+    sub         rcx, byte SIZEOF_WORD
+    shr         rax, 16
+.column_st1:
+    ; Store the lower 1 byte of rax to the output when it has enough
+    ; space.
+    test        rcx, rcx
+    jz          short .nextrow
+    mov         byte [rdi], al
+
+%else  ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+    vpcmpeqb    ymm6, ymm6, ymm6        ; ymm6=XE=X(02468ACE********GIKMOQSU********)
+    vpcmpeqb    ymm7, ymm7, ymm7        ; ymm7=XO=X(13579BDF********HJLNPRTV********)
+%else
+    vpxor       ymm6, ymm6, ymm6        ; ymm6=XE=X(02468ACE********GIKMOQSU********)
+    vpxor       ymm7, ymm7, ymm7        ; ymm7=XO=X(13579BDF********HJLNPRTV********)
+%endif
+    ; ymmA=(00 02 04 06 08 0A 0C 0E ** 0G 0I 0K 0M 0O 0Q 0S 0U **)
+    ; ymmB=(01 03 05 07 09 0B 0D 0F ** 0H 0J 0L 0N 0P 0R 0T 0V **)
+    ; ymmC=(10 12 14 16 18 1A 1C 1E ** 1G 1I 1K 1M 1O 1Q 1S 1U **)
+    ; ymmD=(11 13 15 17 19 1B 1D 1F ** 1H 1J 1L 1N 1P 1R 1T 1V **)
+    ; ymmE=(20 22 24 26 28 2A 2C 2E ** 2G 2I 2K 2M 2O 2Q 2S 2U **)
+    ; ymmF=(21 23 25 27 29 2B 2D 2F ** 2H 2J 2L 2N 2P 2R 2T 2V **)
+    ; ymmG=(30 32 34 36 38 3A 3C 3E ** 3G 3I 3K 3M 3O 3Q 3S 3U **)
+    ; ymmH=(31 33 35 37 39 3B 3D 3F ** 3H 3J 3L 3N 3P 3R 3T 3V **)
+
+    vpunpcklbw  ymmA, ymmA, ymmC        ; ymmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E
+                                        ;       0G 1G 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U)
+    vpunpcklbw  ymmE, ymmE, ymmG        ; ymmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E
+                                        ;       2G 3G 2I 3I 2K 3K 2M 3M 2O 3O 2Q 3Q 2S 3S 2U 3U)
+    vpunpcklbw  ymmB, ymmB, ymmD        ; ymmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F
+                                        ;       0H 1H 0J 1J 0L 1L 0N 1N 0P 1P 0R 1R 0T 1T 0V 1V)
+    vpunpcklbw  ymmF, ymmF, ymmH        ; ymmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F
+                                        ;       2H 3H 2J 3J 2L 3L 2N 3N 2P 3P 2R 3R 2T 3T 2V 3V)
+
+    vpunpckhwd  ymmC, ymmA, ymmE        ; ymmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E
+                                        ;       0O 1O 2O 3O 0Q 1Q 2Q 3Q 0S 1S 2S 3S 0U 1U 2U 3U)
+    vpunpcklwd  ymmA, ymmA, ymmE        ; ymmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36
+                                        ;       0G 1G 2G 3G 0I 1I 2I 3I 0K 1K 2K 3K 0M 1M 2M 3M)
+    vpunpckhwd  ymmG, ymmB, ymmF        ; ymmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F
+                                        ;       0P 1P 2P 3P 0R 1R 2R 3R 0T 1T 2T 3T 0V 1V 2V 3V)
+    vpunpcklwd  ymmB, ymmB, ymmF        ; ymmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37
+                                        ;       0H 1H 2H 3H 0J 1J 2J 3J 0L 1L 2L 3L 0N 1N 2N 3N)
+
+    vpunpckhdq  ymmE, ymmA, ymmB        ; ymmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+                                        ;       0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+    vpunpckldq  ymmB, ymmA, ymmB        ; ymmB=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+                                        ;       0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J)
+    vpunpckhdq  ymmF, ymmC, ymmG        ; ymmF=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F
+                                        ;       0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+    vpunpckldq  ymmG, ymmC, ymmG        ; ymmG=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+                                        ;       0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R)
+
+    vperm2i128  ymmA, ymmB, ymmE, 0x20  ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+                                        ;       04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+    vperm2i128  ymmD, ymmG, ymmF, 0x20  ; ymmD=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+                                        ;       0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+    vperm2i128  ymmC, ymmB, ymmE, 0x31  ; ymmC=(0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J
+                                        ;       0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+    vperm2i128  ymmH, ymmG, ymmF, 0x31  ; ymmH=(0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R
+                                        ;       0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+
+    cmp         rcx, byte SIZEOF_YMMWORD
+    jb          short .column_st64
+
+    test        rdi, SIZEOF_YMMWORD-1
+    jnz         short .out1
+    ; --(aligned)-------------------
+    vmovntdq    YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+    vmovntdq    YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
+    vmovntdq    YMMWORD [rdi+2*SIZEOF_YMMWORD], ymmC
+    vmovntdq    YMMWORD [rdi+3*SIZEOF_YMMWORD], ymmH
+    jmp         short .out0
+.out1:  ; --(unaligned)-----------------
+    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+    vmovdqu     YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
+    vmovdqu     YMMWORD [rdi+2*SIZEOF_YMMWORD], ymmC
+    vmovdqu     YMMWORD [rdi+3*SIZEOF_YMMWORD], ymmH
+.out0:
+    add         rdi, RGB_PIXELSIZE*SIZEOF_YMMWORD  ; outptr
+    sub         rcx, byte SIZEOF_YMMWORD
+    jz          near .nextrow
+
+    add         rsi, byte SIZEOF_YMMWORD  ; inptr0
+    add         rbx, byte SIZEOF_YMMWORD  ; inptr1
+    add         rdx, byte SIZEOF_YMMWORD  ; inptr2
+    jmp         near .columnloop
+
+.column_st64:
+    cmp         rcx, byte SIZEOF_YMMWORD/2
+    jb          short .column_st32
+    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+    vmovdqu     YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
+    add         rdi, byte 2*SIZEOF_YMMWORD  ; outptr
+    vmovdqa     ymmA, ymmC
+    vmovdqa     ymmD, ymmH
+    sub         rcx, byte SIZEOF_YMMWORD/2
+.column_st32:
+    cmp         rcx, byte SIZEOF_YMMWORD/4
+    jb          short .column_st16
+    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+    add         rdi, byte SIZEOF_YMMWORD    ; outptr
+    vmovdqa     ymmA, ymmD
+    sub         rcx, byte SIZEOF_YMMWORD/4
+.column_st16:
+    cmp         rcx, byte SIZEOF_YMMWORD/8
+    jb          short .column_st15
+    vmovdqu     XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+    vperm2i128  ymmA, ymmA, ymmA, 1
+    add         rdi, byte SIZEOF_XMMWORD    ; outptr
+    sub         rcx, byte SIZEOF_YMMWORD/8
+.column_st15:
+    ; Store two pixels (8 bytes) of ymmA to the output when it has enough
+    ; space.
+    cmp         rcx, byte SIZEOF_YMMWORD/16
+    jb          short .column_st7
+    vmovq       MMWORD [rdi], xmmA
+    add         rdi, byte SIZEOF_YMMWORD/16*4
+    sub         rcx, byte SIZEOF_YMMWORD/16
+    vpsrldq     xmmA, SIZEOF_YMMWORD/16*4
+.column_st7:
+    ; Store one pixel (4 bytes) of ymmA to the output when it has enough
+    ; space.
+    test        rcx, rcx
+    jz          short .nextrow
+    vmovd       XMM_DWORD [rdi], xmmA
+
+%endif  ; RGB_PIXELSIZE ; ---------------
+
+.nextrow:
+    pop         rcx
+    pop         rsi
+    pop         rbx
+    pop         rdx
+    pop         rdi
+    pop         rax
+
+    add         rsi, byte SIZEOF_JSAMPROW
+    add         rbx, byte SIZEOF_JSAMPROW
+    add         rdx, byte SIZEOF_JSAMPROW
+    add         rdi, byte SIZEOF_JSAMPROW  ; output_buf
+    dec         rax                        ; num_rows
+    jg          near .rowloop
+
+    sfence                              ; flush the write buffer
+
+.return:
+    pop         rbx
+    vzeroupper
+    uncollect_args 5
+    mov         rsp, rbp                ; rsp <- aligned rbp
+    pop         rsp                     ; rsp <- original rbp
+    pop         rbp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/x86_64/jdcolext-sse2.asm b/media/libjpeg/simd/x86_64/jdcolext-sse2.asm
new file mode 100644
index 0000000000..e07c8d7518
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jdcolext-sse2.asm
@@ -0,0 +1,439 @@
+;
+; jdcolext.asm - colorspace conversion (64-bit SSE2)
+;
+; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2012, 2016, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_ycc_rgb_convert_sse2(JDIMENSION out_width, JSAMPIMAGE input_buf,
+;                            JDIMENSION input_row, JSAMPARRAY output_buf,
+;                            int num_rows)
+;
+
+; r10d = JDIMENSION out_width
+; r11 = JSAMPIMAGE input_buf
+; r12d = JDIMENSION input_row
+; r13 = JSAMPARRAY output_buf
+; r14d = int num_rows
+
+%define wk(i)   rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD  ; xmmword wk[WK_NUM]
+%define WK_NUM  2
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_ycc_rgb_convert_sse2)
+
+EXTN(jsimd_ycc_rgb_convert_sse2):
+    push        rbp
+    mov         rax, rsp                     ; rax = original rbp
+    sub         rsp, byte 4
+    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [rsp], rax
+    mov         rbp, rsp                     ; rbp = aligned rbp
+    lea         rsp, [wk(0)]
+    collect_args 5
+    push        rbx
+
+    mov         ecx, r10d               ; num_cols
+    test        rcx, rcx
+    jz          near .return
+
+    push        rcx
+
+    mov         rdi, r11
+    mov         ecx, r12d
+    mov         rsip, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
+    mov         rbxp, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
+    mov         rdxp, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
+    lea         rsi, [rsi+rcx*SIZEOF_JSAMPROW]
+    lea         rbx, [rbx+rcx*SIZEOF_JSAMPROW]
+    lea         rdx, [rdx+rcx*SIZEOF_JSAMPROW]
+
+    pop         rcx
+
+    mov         rdi, r13
+    mov         eax, r14d
+    test        rax, rax
+    jle         near .return
+.rowloop:
+    push        rax
+    push        rdi
+    push        rdx
+    push        rbx
+    push        rsi
+    push        rcx                     ; col
+
+    mov         rsip, JSAMPROW [rsi]    ; inptr0
+    mov         rbxp, JSAMPROW [rbx]    ; inptr1
+    mov         rdxp, JSAMPROW [rdx]    ; inptr2
+    mov         rdip, JSAMPROW [rdi]    ; outptr
+.columnloop:
+
+    movdqa      xmm5, XMMWORD [rbx]     ; xmm5=Cb(0123456789ABCDEF)
+    movdqa      xmm1, XMMWORD [rdx]     ; xmm1=Cr(0123456789ABCDEF)
+
+    pcmpeqw     xmm4, xmm4
+    pcmpeqw     xmm7, xmm7
+    psrlw       xmm4, BYTE_BIT
+    psllw       xmm7, 7                 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+    movdqa      xmm0, xmm4              ; xmm0=xmm4={0xFF 0x00 0xFF 0x00 ..}
+
+    pand        xmm4, xmm5              ; xmm4=Cb(02468ACE)=CbE
+    psrlw       xmm5, BYTE_BIT          ; xmm5=Cb(13579BDF)=CbO
+    pand        xmm0, xmm1              ; xmm0=Cr(02468ACE)=CrE
+    psrlw       xmm1, BYTE_BIT          ; xmm1=Cr(13579BDF)=CrO
+
+    paddw       xmm4, xmm7
+    paddw       xmm5, xmm7
+    paddw       xmm0, xmm7
+    paddw       xmm1, xmm7
+
+    ; (Original)
+    ; R = Y                + 1.40200 * Cr
+    ; G = Y - 0.34414 * Cb - 0.71414 * Cr
+    ; B = Y + 1.77200 * Cb
+    ;
+    ; (This implementation)
+    ; R = Y                + 0.40200 * Cr + Cr
+    ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+    ; B = Y - 0.22800 * Cb + Cb + Cb
+
+    movdqa      xmm2, xmm4              ; xmm2=CbE
+    movdqa      xmm3, xmm5              ; xmm3=CbO
+    paddw       xmm4, xmm4              ; xmm4=2*CbE
+    paddw       xmm5, xmm5              ; xmm5=2*CbO
+    movdqa      xmm6, xmm0              ; xmm6=CrE
+    movdqa      xmm7, xmm1              ; xmm7=CrO
+    paddw       xmm0, xmm0              ; xmm0=2*CrE
+    paddw       xmm1, xmm1              ; xmm1=2*CrO
+
+    pmulhw      xmm4, [rel PW_MF0228]   ; xmm4=(2*CbE * -FIX(0.22800))
+    pmulhw      xmm5, [rel PW_MF0228]   ; xmm5=(2*CbO * -FIX(0.22800))
+    pmulhw      xmm0, [rel PW_F0402]    ; xmm0=(2*CrE * FIX(0.40200))
+    pmulhw      xmm1, [rel PW_F0402]    ; xmm1=(2*CrO * FIX(0.40200))
+
+    paddw       xmm4, [rel PW_ONE]
+    paddw       xmm5, [rel PW_ONE]
+    psraw       xmm4, 1                 ; xmm4=(CbE * -FIX(0.22800))
+    psraw       xmm5, 1                 ; xmm5=(CbO * -FIX(0.22800))
+    paddw       xmm0, [rel PW_ONE]
+    paddw       xmm1, [rel PW_ONE]
+    psraw       xmm0, 1                 ; xmm0=(CrE * FIX(0.40200))
+    psraw       xmm1, 1                 ; xmm1=(CrO * FIX(0.40200))
+
+    paddw       xmm4, xmm2
+    paddw       xmm5, xmm3
+    paddw       xmm4, xmm2              ; xmm4=(CbE * FIX(1.77200))=(B-Y)E
+    paddw       xmm5, xmm3              ; xmm5=(CbO * FIX(1.77200))=(B-Y)O
+    paddw       xmm0, xmm6              ; xmm0=(CrE * FIX(1.40200))=(R-Y)E
+    paddw       xmm1, xmm7              ; xmm1=(CrO * FIX(1.40200))=(R-Y)O
+
+    movdqa      XMMWORD [wk(0)], xmm4   ; wk(0)=(B-Y)E
+    movdqa      XMMWORD [wk(1)], xmm5   ; wk(1)=(B-Y)O
+
+    movdqa      xmm4, xmm2
+    movdqa      xmm5, xmm3
+    punpcklwd   xmm2, xmm6
+    punpckhwd   xmm4, xmm6
+    pmaddwd     xmm2, [rel PW_MF0344_F0285]
+    pmaddwd     xmm4, [rel PW_MF0344_F0285]
+    punpcklwd   xmm3, xmm7
+    punpckhwd   xmm5, xmm7
+    pmaddwd     xmm3, [rel PW_MF0344_F0285]
+    pmaddwd     xmm5, [rel PW_MF0344_F0285]
+
+    paddd       xmm2, [rel PD_ONEHALF]
+    paddd       xmm4, [rel PD_ONEHALF]
+    psrad       xmm2, SCALEBITS
+    psrad       xmm4, SCALEBITS
+    paddd       xmm3, [rel PD_ONEHALF]
+    paddd       xmm5, [rel PD_ONEHALF]
+    psrad       xmm3, SCALEBITS
+    psrad       xmm5, SCALEBITS
+
+    packssdw    xmm2, xmm4              ; xmm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
+    packssdw    xmm3, xmm5              ; xmm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
+    psubw       xmm2, xmm6              ; xmm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
+    psubw       xmm3, xmm7              ; xmm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O
+
+    movdqa      xmm5, XMMWORD [rsi]     ; xmm5=Y(0123456789ABCDEF)
+
+    pcmpeqw     xmm4, xmm4
+    psrlw       xmm4, BYTE_BIT          ; xmm4={0xFF 0x00 0xFF 0x00 ..}
+    pand        xmm4, xmm5              ; xmm4=Y(02468ACE)=YE
+    psrlw       xmm5, BYTE_BIT          ; xmm5=Y(13579BDF)=YO
+
+    paddw       xmm0, xmm4              ; xmm0=((R-Y)E+YE)=RE=R(02468ACE)
+    paddw       xmm1, xmm5              ; xmm1=((R-Y)O+YO)=RO=R(13579BDF)
+    packuswb    xmm0, xmm0              ; xmm0=R(02468ACE********)
+    packuswb    xmm1, xmm1              ; xmm1=R(13579BDF********)
+
+    paddw       xmm2, xmm4              ; xmm2=((G-Y)E+YE)=GE=G(02468ACE)
+    paddw       xmm3, xmm5              ; xmm3=((G-Y)O+YO)=GO=G(13579BDF)
+    packuswb    xmm2, xmm2              ; xmm2=G(02468ACE********)
+    packuswb    xmm3, xmm3              ; xmm3=G(13579BDF********)
+
+    paddw       xmm4, XMMWORD [wk(0)]   ; xmm4=(YE+(B-Y)E)=BE=B(02468ACE)
+    paddw       xmm5, XMMWORD [wk(1)]   ; xmm5=(YO+(B-Y)O)=BO=B(13579BDF)
+    packuswb    xmm4, xmm4              ; xmm4=B(02468ACE********)
+    packuswb    xmm5, xmm5              ; xmm5=B(13579BDF********)
+
+%if RGB_PIXELSIZE == 3  ; ---------------
+
+    ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+    ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+    ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+    ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
+
+    punpcklbw   xmmA, xmmC        ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+    punpcklbw   xmmE, xmmB        ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
+    punpcklbw   xmmD, xmmF        ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
+
+    movdqa      xmmG, xmmA
+    movdqa      xmmH, xmmA
+    punpcklwd   xmmA, xmmE        ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
+    punpckhwd   xmmG, xmmE        ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
+
+    psrldq      xmmH, 2           ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
+    psrldq      xmmE, 2           ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
+
+    movdqa      xmmC, xmmD
+    movdqa      xmmB, xmmD
+    punpcklwd   xmmD, xmmH        ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
+    punpckhwd   xmmC, xmmH        ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
+
+    psrldq      xmmB, 2           ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
+
+    movdqa      xmmF, xmmE
+    punpcklwd   xmmE, xmmB        ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
+    punpckhwd   xmmF, xmmB        ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
+
+    pshufd      xmmH, xmmA, 0x4E  ; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
+    movdqa      xmmB, xmmE
+    punpckldq   xmmA, xmmD        ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
+    punpckldq   xmmE, xmmH        ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
+    punpckhdq   xmmD, xmmB        ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
+
+    pshufd      xmmH, xmmG, 0x4E  ; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
+    movdqa      xmmB, xmmF
+    punpckldq   xmmG, xmmC        ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
+    punpckldq   xmmF, xmmH        ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
+    punpckhdq   xmmC, xmmB        ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
+
+    punpcklqdq  xmmA, xmmE        ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
+    punpcklqdq  xmmD, xmmG        ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+    punpcklqdq  xmmF, xmmC        ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
+
+    cmp         rcx, byte SIZEOF_XMMWORD
+    jb          short .column_st32
+
+    test        rdi, SIZEOF_XMMWORD-1
+    jnz         short .out1
+    ; --(aligned)-------------------
+    movntdq     XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+    movntdq     XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+    movntdq     XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
+    jmp         short .out0
+.out1:  ; --(unaligned)-----------------
+    movdqu      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+    movdqu      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+    movdqu      XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
+.out0:
+    add         rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
+    sub         rcx, byte SIZEOF_XMMWORD
+    jz          near .nextrow
+
+    add         rsi, byte SIZEOF_XMMWORD  ; inptr0
+    add         rbx, byte SIZEOF_XMMWORD  ; inptr1
+    add         rdx, byte SIZEOF_XMMWORD  ; inptr2
+    jmp         near .columnloop
+
+.column_st32:
+    lea         rcx, [rcx+rcx*2]            ; imul ecx, RGB_PIXELSIZE
+    cmp         rcx, byte 2*SIZEOF_XMMWORD
+    jb          short .column_st16
+    movdqu      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+    movdqu      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+    add         rdi, byte 2*SIZEOF_XMMWORD  ; outptr
+    movdqa      xmmA, xmmF
+    sub         rcx, byte 2*SIZEOF_XMMWORD
+    jmp         short .column_st15
+.column_st16:
+    cmp         rcx, byte SIZEOF_XMMWORD
+    jb          short .column_st15
+    movdqu      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+    add         rdi, byte SIZEOF_XMMWORD    ; outptr
+    movdqa      xmmA, xmmD
+    sub         rcx, byte SIZEOF_XMMWORD
+.column_st15:
+    ; Store the lower 8 bytes of xmmA to the output when it has enough
+    ; space.
+    cmp         rcx, byte SIZEOF_MMWORD
+    jb          short .column_st7
+    movq        XMM_MMWORD [rdi], xmmA
+    add         rdi, byte SIZEOF_MMWORD
+    sub         rcx, byte SIZEOF_MMWORD
+    psrldq      xmmA, SIZEOF_MMWORD
+.column_st7:
+    ; Store the lower 4 bytes of xmmA to the output when it has enough
+    ; space.
+    cmp         rcx, byte SIZEOF_DWORD
+    jb          short .column_st3
+    movd        XMM_DWORD [rdi], xmmA
+    add         rdi, byte SIZEOF_DWORD
+    sub         rcx, byte SIZEOF_DWORD
+    psrldq      xmmA, SIZEOF_DWORD
+.column_st3:
+    ; Store the lower 2 bytes of rax to the output when it has enough
+    ; space.
+    movd        eax, xmmA
+    cmp         rcx, byte SIZEOF_WORD
+    jb          short .column_st1
+    mov         word [rdi], ax
+    add         rdi, byte SIZEOF_WORD
+    sub         rcx, byte SIZEOF_WORD
+    shr         rax, 16
+.column_st1:
+    ; Store the lower 1 byte of rax to the output when it has enough
+    ; space.
+    test        rcx, rcx
+    jz          short .nextrow
+    mov         byte [rdi], al
+
+%else  ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+    pcmpeqb     xmm6, xmm6              ; xmm6=XE=X(02468ACE********)
+    pcmpeqb     xmm7, xmm7              ; xmm7=XO=X(13579BDF********)
+%else
+    pxor        xmm6, xmm6              ; xmm6=XE=X(02468ACE********)
+    pxor        xmm7, xmm7              ; xmm7=XO=X(13579BDF********)
+%endif
+    ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+    ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+    ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+    ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
+
+    punpcklbw   xmmA, xmmC  ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+    punpcklbw   xmmE, xmmG  ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
+    punpcklbw   xmmB, xmmD  ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
+    punpcklbw   xmmF, xmmH  ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
+
+    movdqa      xmmC, xmmA
+    punpcklwd   xmmA, xmmE  ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
+    punpckhwd   xmmC, xmmE  ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
+    movdqa      xmmG, xmmB
+    punpcklwd   xmmB, xmmF  ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
+    punpckhwd   xmmG, xmmF  ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
+
+    movdqa      xmmD, xmmA
+    punpckldq   xmmA, xmmB  ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
+    punpckhdq   xmmD, xmmB  ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+    movdqa      xmmH, xmmC
+    punpckldq   xmmC, xmmG  ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
+    punpckhdq   xmmH, xmmG  ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+
+    cmp         rcx, byte SIZEOF_XMMWORD
+    jb          short .column_st32
+
+    test        rdi, SIZEOF_XMMWORD-1
+    jnz         short .out1
+    ; --(aligned)-------------------
+    movntdq     XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+    movntdq     XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+    movntdq     XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
+    movntdq     XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
+    jmp         short .out0
+.out1:  ; --(unaligned)-----------------
+    movdqu      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+    movdqu      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+    movdqu      XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
+    movdqu      XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
+.out0:
+    add         rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
+    sub         rcx, byte SIZEOF_XMMWORD
+    jz          near .nextrow
+
+    add         rsi, byte SIZEOF_XMMWORD  ; inptr0
+    add         rbx, byte SIZEOF_XMMWORD  ; inptr1
+    add         rdx, byte SIZEOF_XMMWORD  ; inptr2
+    jmp         near .columnloop
+
+.column_st32:
+    cmp         rcx, byte SIZEOF_XMMWORD/2
+    jb          short .column_st16
+    movdqu      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+    movdqu      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+    add         rdi, byte 2*SIZEOF_XMMWORD  ; outptr
+    movdqa      xmmA, xmmC
+    movdqa      xmmD, xmmH
+    sub         rcx, byte SIZEOF_XMMWORD/2
+.column_st16:
+    cmp         rcx, byte SIZEOF_XMMWORD/4
+    jb          short .column_st15
+    movdqu      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+    add         rdi, byte SIZEOF_XMMWORD    ; outptr
+    movdqa      xmmA, xmmD
+    sub         rcx, byte SIZEOF_XMMWORD/4
+.column_st15:
+    ; Store two pixels (8 bytes) of xmmA to the output when it has enough
+    ; space.
+    cmp         rcx, byte SIZEOF_XMMWORD/8
+    jb          short .column_st7
+    movq        MMWORD [rdi], xmmA
+    add         rdi, byte SIZEOF_XMMWORD/8*4
+    sub         rcx, byte SIZEOF_XMMWORD/8
+    psrldq      xmmA, SIZEOF_XMMWORD/8*4
+.column_st7:
+    ; Store one pixel (4 bytes) of xmmA to the output when it has enough
+    ; space.
+    test        rcx, rcx
+    jz          short .nextrow
+    movd        XMM_DWORD [rdi], xmmA
+
+%endif  ; RGB_PIXELSIZE ; ---------------
+
+.nextrow:
+    pop         rcx
+    pop         rsi
+    pop         rbx
+    pop         rdx
+    pop         rdi
+    pop         rax
+
+    add         rsi, byte SIZEOF_JSAMPROW
+    add         rbx, byte SIZEOF_JSAMPROW
+    add         rdx, byte SIZEOF_JSAMPROW
+    add         rdi, byte SIZEOF_JSAMPROW  ; output_buf
+    dec         rax                        ; num_rows
+    jg          near .rowloop
+
+    sfence                              ; flush the write buffer
+
+.return:
+    pop         rbx
+    uncollect_args 5
+    mov         rsp, rbp                ; rsp <- aligned rbp
+    pop         rsp                     ; rsp <- original rbp
+    pop         rbp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/x86_64/jdcolor-avx2.asm b/media/libjpeg/simd/x86_64/jdcolor-avx2.asm
new file mode 100644
index 0000000000..43de9db04d
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jdcolor-avx2.asm
@@ -0,0 +1,118 @@
+;
+; jdcolor.asm - colorspace conversion (64-bit AVX2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS  16
+
+F_0_344 equ  22554              ; FIX(0.34414)
+F_0_714 equ  46802              ; FIX(0.71414)
+F_1_402 equ  91881              ; FIX(1.40200)
+F_1_772 equ 116130              ; FIX(1.77200)
+F_0_402 equ (F_1_402 - 65536)   ; FIX(1.40200) - FIX(1)
+F_0_285 equ ( 65536 - F_0_714)  ; FIX(1) - FIX(0.71414)
+F_0_228 equ (131072 - F_1_772)  ; FIX(2) - FIX(1.77200)
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_ycc_rgb_convert_avx2)
+
+EXTN(jconst_ycc_rgb_convert_avx2):
+
+PW_F0402        times 16 dw  F_0_402
+PW_MF0228       times 16 dw -F_0_228
+PW_MF0344_F0285 times 8  dw -F_0_344, F_0_285
+PW_ONE          times 16 dw  1
+PD_ONEHALF      times 8  dd  1 << (SCALEBITS - 1)
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        64
+
+%include "jdcolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGB_RED
+%define RGB_GREEN  EXT_RGB_GREEN
+%define RGB_BLUE  EXT_RGB_BLUE
+%define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+%define jsimd_ycc_rgb_convert_avx2  jsimd_ycc_extrgb_convert_avx2
+%include "jdcolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGBX_RED
+%define RGB_GREEN  EXT_RGBX_GREEN
+%define RGB_BLUE  EXT_RGBX_BLUE
+%define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+%define jsimd_ycc_rgb_convert_avx2  jsimd_ycc_extrgbx_convert_avx2
+%include "jdcolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGR_RED
+%define RGB_GREEN  EXT_BGR_GREEN
+%define RGB_BLUE  EXT_BGR_BLUE
+%define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+%define jsimd_ycc_rgb_convert_avx2  jsimd_ycc_extbgr_convert_avx2
+%include "jdcolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGRX_RED
+%define RGB_GREEN  EXT_BGRX_GREEN
+%define RGB_BLUE  EXT_BGRX_BLUE
+%define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+%define jsimd_ycc_rgb_convert_avx2  jsimd_ycc_extbgrx_convert_avx2
+%include "jdcolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XBGR_RED
+%define RGB_GREEN  EXT_XBGR_GREEN
+%define RGB_BLUE  EXT_XBGR_BLUE
+%define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+%define jsimd_ycc_rgb_convert_avx2  jsimd_ycc_extxbgr_convert_avx2
+%include "jdcolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XRGB_RED
+%define RGB_GREEN  EXT_XRGB_GREEN
+%define RGB_BLUE  EXT_XRGB_BLUE
+%define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+%define jsimd_ycc_rgb_convert_avx2  jsimd_ycc_extxrgb_convert_avx2
+%include "jdcolext-avx2.asm"
diff --git a/media/libjpeg/simd/x86_64/jdcolor-sse2.asm b/media/libjpeg/simd/x86_64/jdcolor-sse2.asm
new file mode 100644
index 0000000000..b3f1fec07e
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jdcolor-sse2.asm
@@ -0,0 +1,117 @@
+;
+; jdcolor.asm - colorspace conversion (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS  16
+
+F_0_344 equ  22554              ; FIX(0.34414)
+F_0_714 equ  46802              ; FIX(0.71414)
+F_1_402 equ  91881              ; FIX(1.40200)
+F_1_772 equ 116130              ; FIX(1.77200)
+F_0_402 equ (F_1_402 - 65536)   ; FIX(1.40200) - FIX(1)
+F_0_285 equ ( 65536 - F_0_714)  ; FIX(1) - FIX(0.71414)
+F_0_228 equ (131072 - F_1_772)  ; FIX(2) - FIX(1.77200)
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_ycc_rgb_convert_sse2)
+
+EXTN(jconst_ycc_rgb_convert_sse2):
+
+PW_F0402        times 8 dw  F_0_402
+PW_MF0228       times 8 dw -F_0_228
+PW_MF0344_F0285 times 4 dw -F_0_344, F_0_285
+PW_ONE          times 8 dw  1
+PD_ONEHALF      times 4 dd  1 << (SCALEBITS - 1)
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        64
+
+%include "jdcolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGB_RED
+%define RGB_GREEN  EXT_RGB_GREEN
+%define RGB_BLUE  EXT_RGB_BLUE
+%define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+%define jsimd_ycc_rgb_convert_sse2  jsimd_ycc_extrgb_convert_sse2
+%include "jdcolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGBX_RED
+%define RGB_GREEN  EXT_RGBX_GREEN
+%define RGB_BLUE  EXT_RGBX_BLUE
+%define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+%define jsimd_ycc_rgb_convert_sse2  jsimd_ycc_extrgbx_convert_sse2
+%include "jdcolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGR_RED
+%define RGB_GREEN  EXT_BGR_GREEN
+%define RGB_BLUE  EXT_BGR_BLUE
+%define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+%define jsimd_ycc_rgb_convert_sse2  jsimd_ycc_extbgr_convert_sse2
+%include "jdcolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGRX_RED
+%define RGB_GREEN  EXT_BGRX_GREEN
+%define RGB_BLUE  EXT_BGRX_BLUE
+%define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+%define jsimd_ycc_rgb_convert_sse2  jsimd_ycc_extbgrx_convert_sse2
+%include "jdcolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XBGR_RED
+%define RGB_GREEN  EXT_XBGR_GREEN
+%define RGB_BLUE  EXT_XBGR_BLUE
+%define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+%define jsimd_ycc_rgb_convert_sse2  jsimd_ycc_extxbgr_convert_sse2
+%include "jdcolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XRGB_RED
+%define RGB_GREEN  EXT_XRGB_GREEN
+%define RGB_BLUE  EXT_XRGB_BLUE
+%define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+%define jsimd_ycc_rgb_convert_sse2  jsimd_ycc_extxrgb_convert_sse2
+%include "jdcolext-sse2.asm"
diff --git a/media/libjpeg/simd/x86_64/jdmerge-avx2.asm b/media/libjpeg/simd/x86_64/jdmerge-avx2.asm
new file mode 100644
index 0000000000..9515a17013
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jdmerge-avx2.asm
@@ -0,0 +1,136 @@
+;
+; jdmerge.asm - merged upsampling/color conversion (64-bit AVX2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS  16
+
+F_0_344 equ  22554              ; FIX(0.34414)
+F_0_714 equ  46802              ; FIX(0.71414)
+F_1_402 equ  91881              ; FIX(1.40200)
+F_1_772 equ 116130              ; FIX(1.77200)
+F_0_402 equ (F_1_402 - 65536)   ; FIX(1.40200) - FIX(1)
+F_0_285 equ ( 65536 - F_0_714)  ; FIX(1) - FIX(0.71414)
+F_0_228 equ (131072 - F_1_772)  ; FIX(2) - FIX(1.77200)
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_merged_upsample_avx2)
+
+EXTN(jconst_merged_upsample_avx2):
+
+PW_F0402        times 16 dw  F_0_402
+PW_MF0228       times 16 dw -F_0_228
+PW_MF0344_F0285 times 8  dw -F_0_344, F_0_285
+PW_ONE          times 16 dw  1
+PD_ONEHALF      times 8  dd  1 << (SCALEBITS - 1)
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        64
+
+%include "jdmrgext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGB_RED
+%define RGB_GREEN  EXT_RGB_GREEN
+%define RGB_BLUE  EXT_RGB_BLUE
+%define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_avx2 \
+  jsimd_h2v1_extrgb_merged_upsample_avx2
+%define jsimd_h2v2_merged_upsample_avx2 \
+  jsimd_h2v2_extrgb_merged_upsample_avx2
+%include "jdmrgext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGBX_RED
+%define RGB_GREEN  EXT_RGBX_GREEN
+%define RGB_BLUE  EXT_RGBX_BLUE
+%define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_avx2 \
+  jsimd_h2v1_extrgbx_merged_upsample_avx2
+%define jsimd_h2v2_merged_upsample_avx2 \
+  jsimd_h2v2_extrgbx_merged_upsample_avx2
+%include "jdmrgext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGR_RED
+%define RGB_GREEN  EXT_BGR_GREEN
+%define RGB_BLUE  EXT_BGR_BLUE
+%define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_avx2 \
+  jsimd_h2v1_extbgr_merged_upsample_avx2
+%define jsimd_h2v2_merged_upsample_avx2 \
+  jsimd_h2v2_extbgr_merged_upsample_avx2
+%include "jdmrgext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGRX_RED
+%define RGB_GREEN  EXT_BGRX_GREEN
+%define RGB_BLUE  EXT_BGRX_BLUE
+%define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_avx2 \
+  jsimd_h2v1_extbgrx_merged_upsample_avx2
+%define jsimd_h2v2_merged_upsample_avx2 \
+  jsimd_h2v2_extbgrx_merged_upsample_avx2
+%include "jdmrgext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XBGR_RED
+%define RGB_GREEN  EXT_XBGR_GREEN
+%define RGB_BLUE  EXT_XBGR_BLUE
+%define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_avx2 \
+  jsimd_h2v1_extxbgr_merged_upsample_avx2
+%define jsimd_h2v2_merged_upsample_avx2 \
+  jsimd_h2v2_extxbgr_merged_upsample_avx2
+%include "jdmrgext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XRGB_RED
+%define RGB_GREEN  EXT_XRGB_GREEN
+%define RGB_BLUE  EXT_XRGB_BLUE
+%define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_avx2 \
+  jsimd_h2v1_extxrgb_merged_upsample_avx2
+%define jsimd_h2v2_merged_upsample_avx2 \
+  jsimd_h2v2_extxrgb_merged_upsample_avx2
+%include "jdmrgext-avx2.asm"
diff --git a/media/libjpeg/simd/x86_64/jdmerge-sse2.asm b/media/libjpeg/simd/x86_64/jdmerge-sse2.asm
new file mode 100644
index 0000000000..aedccc20f6
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jdmerge-sse2.asm
@@ -0,0 +1,135 @@
+;
+; jdmerge.asm - merged upsampling/color conversion (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS  16
+
+F_0_344 equ  22554              ; FIX(0.34414)
+F_0_714 equ  46802              ; FIX(0.71414)
+F_1_402 equ  91881              ; FIX(1.40200)
+F_1_772 equ 116130              ; FIX(1.77200)
+F_0_402 equ (F_1_402 - 65536)   ; FIX(1.40200) - FIX(1)
+F_0_285 equ ( 65536 - F_0_714)  ; FIX(1) - FIX(0.71414)
+F_0_228 equ (131072 - F_1_772)  ; FIX(2) - FIX(1.77200)
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_merged_upsample_sse2)
+
+EXTN(jconst_merged_upsample_sse2):
+
+PW_F0402        times 8 dw  F_0_402
+PW_MF0228       times 8 dw -F_0_228
+PW_MF0344_F0285 times 4 dw -F_0_344, F_0_285
+PW_ONE          times 8 dw  1
+PD_ONEHALF      times 4 dd  1 << (SCALEBITS - 1)
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        64
+
+%include "jdmrgext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGB_RED
+%define RGB_GREEN  EXT_RGB_GREEN
+%define RGB_BLUE  EXT_RGB_BLUE
+%define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_sse2 \
+  jsimd_h2v1_extrgb_merged_upsample_sse2
+%define jsimd_h2v2_merged_upsample_sse2 \
+  jsimd_h2v2_extrgb_merged_upsample_sse2
+%include "jdmrgext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGBX_RED
+%define RGB_GREEN  EXT_RGBX_GREEN
+%define RGB_BLUE  EXT_RGBX_BLUE
+%define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_sse2 \
+  jsimd_h2v1_extrgbx_merged_upsample_sse2
+%define jsimd_h2v2_merged_upsample_sse2 \
+  jsimd_h2v2_extrgbx_merged_upsample_sse2
+%include "jdmrgext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGR_RED
+%define RGB_GREEN  EXT_BGR_GREEN
+%define RGB_BLUE  EXT_BGR_BLUE
+%define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_sse2 \
+  jsimd_h2v1_extbgr_merged_upsample_sse2
+%define jsimd_h2v2_merged_upsample_sse2 \
+  jsimd_h2v2_extbgr_merged_upsample_sse2
+%include "jdmrgext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGRX_RED
+%define RGB_GREEN  EXT_BGRX_GREEN
+%define RGB_BLUE  EXT_BGRX_BLUE
+%define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_sse2 \
+  jsimd_h2v1_extbgrx_merged_upsample_sse2
+%define jsimd_h2v2_merged_upsample_sse2 \
+  jsimd_h2v2_extbgrx_merged_upsample_sse2
+%include "jdmrgext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XBGR_RED
+%define RGB_GREEN  EXT_XBGR_GREEN
+%define RGB_BLUE  EXT_XBGR_BLUE
+%define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_sse2 \
+  jsimd_h2v1_extxbgr_merged_upsample_sse2
+%define jsimd_h2v2_merged_upsample_sse2 \
+  jsimd_h2v2_extxbgr_merged_upsample_sse2
+%include "jdmrgext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XRGB_RED
+%define RGB_GREEN  EXT_XRGB_GREEN
+%define RGB_BLUE  EXT_XRGB_BLUE
+%define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_sse2 \
+  jsimd_h2v1_extxrgb_merged_upsample_sse2
+%define jsimd_h2v2_merged_upsample_sse2 \
+  jsimd_h2v2_extxrgb_merged_upsample_sse2
+%include "jdmrgext-sse2.asm"
diff --git a/media/libjpeg/simd/x86_64/jdmrgext-avx2.asm b/media/libjpeg/simd/x86_64/jdmrgext-avx2.asm
new file mode 100644
index 0000000000..8b264b4f03
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jdmrgext-avx2.asm
@@ -0,0 +1,596 @@
+;
+; jdmrgext.asm - merged upsampling/color conversion (64-bit AVX2)
+;
+; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2012, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
+;
+; GLOBAL(void)
+; jsimd_h2v1_merged_upsample_avx2(JDIMENSION output_width,
+;                                 JSAMPIMAGE input_buf,
+;                                 JDIMENSION in_row_group_ctr,
+;                                 JSAMPARRAY output_buf);
+;
+
+; r10d = JDIMENSION output_width
+; r11 = JSAMPIMAGE input_buf
+; r12d = JDIMENSION in_row_group_ctr
+; r13 = JSAMPARRAY output_buf
+
+%define wk(i)   rbp - (WK_NUM - (i)) * SIZEOF_YMMWORD  ; ymmword wk[WK_NUM]
+%define WK_NUM  3
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v1_merged_upsample_avx2)
+
+EXTN(jsimd_h2v1_merged_upsample_avx2):
+    push        rbp
+    mov         rax, rsp                     ; rax = original rbp
+    sub         rsp, byte 4
+    and         rsp, byte (-SIZEOF_YMMWORD)  ; align to 256 bits
+    mov         [rsp], rax
+    mov         rbp, rsp                     ; rbp = aligned rbp
+    lea         rsp, [wk(0)]
+    collect_args 4
+    push        rbx
+
+    mov         ecx, r10d               ; col
+    test        rcx, rcx
+    jz          near .return
+
+    push        rcx
+
+    mov         rdi, r11
+    mov         ecx, r12d
+    mov         rsip, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
+    mov         rbxp, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
+    mov         rdxp, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
+    mov         rdi, r13
+    mov         rsip, JSAMPROW [rsi+rcx*SIZEOF_JSAMPROW]  ; inptr0
+    mov         rbxp, JSAMPROW [rbx+rcx*SIZEOF_JSAMPROW]  ; inptr1
+    mov         rdxp, JSAMPROW [rdx+rcx*SIZEOF_JSAMPROW]  ; inptr2
+    mov         rdip, JSAMPROW [rdi]                      ; outptr
+
+    pop         rcx                     ; col
+
+.columnloop:
+
+    vmovdqu     ymm6, YMMWORD [rbx]     ; ymm6=Cb(0123456789ABCDEFGHIJKLMNOPQRSTUV)
+    vmovdqu     ymm7, YMMWORD [rdx]     ; ymm7=Cr(0123456789ABCDEFGHIJKLMNOPQRSTUV)
+
+    vpxor       ymm1, ymm1, ymm1        ; ymm1=(all 0's)
+    vpcmpeqw    ymm3, ymm3, ymm3
+    vpsllw      ymm3, ymm3, 7           ; ymm3={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+
+    vpermq      ymm6, ymm6, 0xd8        ; ymm6=Cb(01234567GHIJKLMN89ABCDEFOPQRSTUV)
+    vpermq      ymm7, ymm7, 0xd8        ; ymm7=Cr(01234567GHIJKLMN89ABCDEFOPQRSTUV)
+    vpunpcklbw  ymm4, ymm6, ymm1        ; ymm4=Cb(0123456789ABCDEF)=CbL
+    vpunpckhbw  ymm6, ymm6, ymm1        ; ymm6=Cb(GHIJKLMNOPQRSTUV)=CbH
+    vpunpcklbw  ymm0, ymm7, ymm1        ; ymm0=Cr(0123456789ABCDEF)=CrL
+    vpunpckhbw  ymm7, ymm7, ymm1        ; ymm7=Cr(GHIJKLMNOPQRSTUV)=CrH
+
+    vpaddw      ymm5, ymm6, ymm3
+    vpaddw      ymm2, ymm4, ymm3
+    vpaddw      ymm1, ymm7, ymm3
+    vpaddw      ymm3, ymm0, ymm3
+
+    ; (Original)
+    ; R = Y                + 1.40200 * Cr
+    ; G = Y - 0.34414 * Cb - 0.71414 * Cr
+    ; B = Y + 1.77200 * Cb
+    ;
+    ; (This implementation)
+    ; R = Y                + 0.40200 * Cr + Cr
+    ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+    ; B = Y - 0.22800 * Cb + Cb + Cb
+
+    vpaddw      ymm6, ymm5, ymm5             ; ymm6=2*CbH
+    vpaddw      ymm4, ymm2, ymm2             ; ymm4=2*CbL
+    vpaddw      ymm7, ymm1, ymm1             ; ymm7=2*CrH
+    vpaddw      ymm0, ymm3, ymm3             ; ymm0=2*CrL
+
+    vpmulhw     ymm6, ymm6, [rel PW_MF0228]  ; ymm6=(2*CbH * -FIX(0.22800))
+    vpmulhw     ymm4, ymm4, [rel PW_MF0228]  ; ymm4=(2*CbL * -FIX(0.22800))
+    vpmulhw     ymm7, ymm7, [rel PW_F0402]   ; ymm7=(2*CrH * FIX(0.40200))
+    vpmulhw     ymm0, ymm0, [rel PW_F0402]   ; ymm0=(2*CrL * FIX(0.40200))
+
+    vpaddw      ymm6, ymm6, [rel PW_ONE]
+    vpaddw      ymm4, ymm4, [rel PW_ONE]
+    vpsraw      ymm6, ymm6, 1                ; ymm6=(CbH * -FIX(0.22800))
+    vpsraw      ymm4, ymm4, 1                ; ymm4=(CbL * -FIX(0.22800))
+    vpaddw      ymm7, ymm7, [rel PW_ONE]
+    vpaddw      ymm0, ymm0, [rel PW_ONE]
+    vpsraw      ymm7, ymm7, 1                ; ymm7=(CrH * FIX(0.40200))
+    vpsraw      ymm0, ymm0, 1                ; ymm0=(CrL * FIX(0.40200))
+
+    vpaddw      ymm6, ymm6, ymm5
+    vpaddw      ymm4, ymm4, ymm2
+    vpaddw      ymm6, ymm6, ymm5             ; ymm6=(CbH * FIX(1.77200))=(B-Y)H
+    vpaddw      ymm4, ymm4, ymm2             ; ymm4=(CbL * FIX(1.77200))=(B-Y)L
+    vpaddw      ymm7, ymm7, ymm1             ; ymm7=(CrH * FIX(1.40200))=(R-Y)H
+    vpaddw      ymm0, ymm0, ymm3             ; ymm0=(CrL * FIX(1.40200))=(R-Y)L
+
+    vmovdqa     YMMWORD [wk(0)], ymm6        ; wk(0)=(B-Y)H
+    vmovdqa     YMMWORD [wk(1)], ymm7        ; wk(1)=(R-Y)H
+
+    vpunpckhwd  ymm6, ymm5, ymm1
+    vpunpcklwd  ymm5, ymm5, ymm1
+    vpmaddwd    ymm5, ymm5, [rel PW_MF0344_F0285]
+    vpmaddwd    ymm6, ymm6, [rel PW_MF0344_F0285]
+    vpunpckhwd  ymm7, ymm2, ymm3
+    vpunpcklwd  ymm2, ymm2, ymm3
+    vpmaddwd    ymm2, ymm2, [rel PW_MF0344_F0285]
+    vpmaddwd    ymm7, ymm7, [rel PW_MF0344_F0285]
+
+    vpaddd      ymm5, ymm5, [rel PD_ONEHALF]
+    vpaddd      ymm6, ymm6, [rel PD_ONEHALF]
+    vpsrad      ymm5, ymm5, SCALEBITS
+    vpsrad      ymm6, ymm6, SCALEBITS
+    vpaddd      ymm2, ymm2, [rel PD_ONEHALF]
+    vpaddd      ymm7, ymm7, [rel PD_ONEHALF]
+    vpsrad      ymm2, ymm2, SCALEBITS
+    vpsrad      ymm7, ymm7, SCALEBITS
+
+    vpackssdw   ymm5, ymm5, ymm6        ; ymm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
+    vpackssdw   ymm2, ymm2, ymm7        ; ymm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
+    vpsubw      ymm5, ymm5, ymm1        ; ymm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
+    vpsubw      ymm2, ymm2, ymm3        ; ymm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L
+
+    vmovdqa     YMMWORD [wk(2)], ymm5   ; wk(2)=(G-Y)H
+
+    mov         al, 2                   ; Yctr
+    jmp         short .Yloop_1st
+
+.Yloop_2nd:
+    vmovdqa     ymm0, YMMWORD [wk(1)]   ; ymm0=(R-Y)H
+    vmovdqa     ymm2, YMMWORD [wk(2)]   ; ymm2=(G-Y)H
+    vmovdqa     ymm4, YMMWORD [wk(0)]   ; ymm4=(B-Y)H
+
+.Yloop_1st:
+    vmovdqu     ymm7, YMMWORD [rsi]     ; ymm7=Y(0123456789ABCDEFGHIJKLMNOPQRSTUV)
+
+    vpcmpeqw    ymm6, ymm6, ymm6
+    vpsrlw      ymm6, ymm6, BYTE_BIT    ; ymm6={0xFF 0x00 0xFF 0x00 ..}
+    vpand       ymm6, ymm6, ymm7        ; ymm6=Y(02468ACEGIKMOQSU)=YE
+    vpsrlw      ymm7, ymm7, BYTE_BIT    ; ymm7=Y(13579BDFHJLNPRTV)=YO
+
+    vmovdqa     ymm1, ymm0              ; ymm1=ymm0=(R-Y)(L/H)
+    vmovdqa     ymm3, ymm2              ; ymm3=ymm2=(G-Y)(L/H)
+    vmovdqa     ymm5, ymm4              ; ymm5=ymm4=(B-Y)(L/H)
+
+    vpaddw      ymm0, ymm0, ymm6        ; ymm0=((R-Y)+YE)=RE=R(02468ACEGIKMOQSU)
+    vpaddw      ymm1, ymm1, ymm7        ; ymm1=((R-Y)+YO)=RO=R(13579BDFHJLNPRTV)
+    vpackuswb   ymm0, ymm0, ymm0        ; ymm0=R(02468ACE********GIKMOQSU********)
+    vpackuswb   ymm1, ymm1, ymm1        ; ymm1=R(13579BDF********HJLNPRTV********)
+
+    vpaddw      ymm2, ymm2, ymm6        ; ymm2=((G-Y)+YE)=GE=G(02468ACEGIKMOQSU)
+    vpaddw      ymm3, ymm3, ymm7        ; ymm3=((G-Y)+YO)=GO=G(13579BDFHJLNPRTV)
+    vpackuswb   ymm2, ymm2, ymm2        ; ymm2=G(02468ACE********GIKMOQSU********)
+    vpackuswb   ymm3, ymm3, ymm3        ; ymm3=G(13579BDF********HJLNPRTV********)
+
+    vpaddw      ymm4, ymm4, ymm6        ; ymm4=((B-Y)+YE)=BE=B(02468ACEGIKMOQSU)
+    vpaddw      ymm5, ymm5, ymm7        ; ymm5=((B-Y)+YO)=BO=B(13579BDFHJLNPRTV)
+    vpackuswb   ymm4, ymm4, ymm4        ; ymm4=B(02468ACE********GIKMOQSU********)
+    vpackuswb   ymm5, ymm5, ymm5        ; ymm5=B(13579BDF********HJLNPRTV********)
+
+%if RGB_PIXELSIZE == 3  ; ---------------
+
+    ; ymmA=(00 02 04 06 08 0A 0C 0E ** 0G 0I 0K 0M 0O 0Q 0S 0U **)
+    ; ymmB=(01 03 05 07 09 0B 0D 0F ** 0H 0J 0L 0N 0P 0R 0T 0V **)
+    ; ymmC=(10 12 14 16 18 1A 1C 1E ** 1G 1I 1K 1M 1O 1Q 1S 1U **)
+    ; ymmD=(11 13 15 17 19 1B 1D 1F ** 1H 1J 1L 1N 1P 1R 1T 1V **)
+    ; ymmE=(20 22 24 26 28 2A 2C 2E ** 2G 2I 2K 2M 2O 2Q 2S 2U **)
+    ; ymmF=(21 23 25 27 29 2B 2D 2F ** 2H 2J 2L 2N 2P 2R 2T 2V **)
+    ; ymmG=(** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **)
+    ; ymmH=(** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **)
+
+    vpunpcklbw  ymmA, ymmA, ymmC        ; ymmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E
+                                        ;       0G 1G 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U)
+    vpunpcklbw  ymmE, ymmE, ymmB        ; ymmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F
+                                        ;       2G 0H 2I 0J 2K 0L 2M 0N 2O 0P 2Q 0R 2S 0T 2U 0V)
+    vpunpcklbw  ymmD, ymmD, ymmF        ; ymmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F
+                                        ;       1H 2H 1J 2J 1L 2L 1N 2N 1P 2P 1R 2R 1T 2T 1V 2V)
+
+    vpsrldq     ymmH, ymmA, 2           ; ymmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E 0G 1G
+                                        ;       0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U -- --)
+    vpunpckhwd  ymmG, ymmA, ymmE        ; ymmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F
+                                        ;       0O 1O 2O 0P 0Q 1Q 2Q 0R 0S 1S 2S 0T 0U 1U 2U 0V)
+    vpunpcklwd  ymmA, ymmA, ymmE        ; ymmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07
+                                        ;       0G 1G 2G 0H 0I 1I 2I 0J 0K 1K 2K 0L 0M 1M 2M 0N)
+
+    vpsrldq     ymmE, ymmE, 2           ; ymmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F 2G 0H
+                                        ;       2I 0J 2K 0L 2M 0N 2O 0P 2Q 0R 2S 0T 2U 0V -- --)
+
+    vpsrldq     ymmB, ymmD, 2           ; ymmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F 1H 2H
+                                        ;       1J 2J 1L 2L 1N 2N 1P 2P 1R 2R 1T 2T 1V 2V -- --)
+    vpunpckhwd  ymmC, ymmD, ymmH        ; ymmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F 0G 1G
+                                        ;       1P 2P 0Q 1Q 1R 2R 0S 1S 1T 2T 0U 1U 1V 2V -- --)
+    vpunpcklwd  ymmD, ymmD, ymmH        ; ymmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18
+                                        ;       1H 2H 0I 1I 1J 2J 0K 1K 1L 2L 0M 1M 1N 2N 0O 1O)
+
+    vpunpckhwd  ymmF, ymmE, ymmB        ; ymmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F 2G 0H 1H 2H
+                                        ;       2Q 0R 1R 2R 2S 0T 1T 2T 2U 0V 1V 2V -- -- -- --)
+    vpunpcklwd  ymmE, ymmE, ymmB        ; ymmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29
+                                        ;       2I 0J 1J 2J 2K 0L 1L 2L 2M 0N 1N 2N 2O 0P 1P 2P)
+
+    vpshufd     ymmH, ymmA, 0x4E        ; ymmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03
+                                        ;       0K 1K 2K 0L 0M 1M 2M 0N 0G 1G 2G 0H 0I 1I 2I 0J)
+    vpunpckldq  ymmA, ymmA, ymmD        ; ymmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14
+                                        ;       0G 1G 2G 0H 1H 2H 0I 1I 0I 1I 2I 0J 1J 2J 0K 1K)
+    vpunpckhdq  ymmD, ymmD, ymmE        ; ymmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29
+                                        ;       1L 2L 0M 1M 2M 0N 1N 2N 1N 2N 0O 1O 2O 0P 1P 2P)
+    vpunpckldq  ymmE, ymmE, ymmH        ; ymmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07
+                                        ;       2I 0J 1J 2J 0K 1K 2K 0L 2K 0L 1L 2L 0M 1M 2M 0N)
+
+    vpshufd     ymmH, ymmG, 0x4E        ; ymmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B
+                                        ;       0S 1S 2S 0T 0U 1U 2U 0V 0O 1O 2O 0P 0Q 1Q 2Q 0R)
+    vpunpckldq  ymmG, ymmG, ymmC        ; ymmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C
+                                        ;       0O 1O 2O 0P 1P 2P 0Q 1Q 0Q 1Q 2Q 0R 1R 2R 0S 1S)
+    vpunpckhdq  ymmC, ymmC, ymmF        ; ymmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F 0G 1G 2G 0H 1H 2H
+                                        ;       1T 2T 0U 1U 2U 0V 1V 2V 1V 2V -- -- -- -- -- --)
+    vpunpckldq  ymmF, ymmF, ymmH        ; ymmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F
+                                        ;       2Q 0R 1R 2R 0S 1S 2S 0T 2S 0T 1T 2T 0U 1U 2U 0V)
+
+    vpunpcklqdq ymmH, ymmA, ymmE        ; ymmH=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+                                        ;       0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+    vpunpcklqdq ymmG, ymmD, ymmG        ; ymmG=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A
+                                        ;       1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q)
+    vpunpcklqdq ymmC, ymmF, ymmC        ; ymmC=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+                                        ;       2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+
+    vperm2i128  ymmA, ymmH, ymmG, 0x20  ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+                                        ;       15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+    vperm2i128  ymmD, ymmC, ymmH, 0x30  ; ymmD=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+                                        ;       0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+    vperm2i128  ymmF, ymmG, ymmC, 0x31  ; ymmF=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
+                                        ;       2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+
+    cmp         rcx, byte SIZEOF_YMMWORD
+    jb          short .column_st64
+
+    test        rdi, SIZEOF_YMMWORD-1
+    jnz         short .out1
+    ; --(aligned)-------------------
+    vmovntdq    YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+    vmovntdq    YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
+    vmovntdq    YMMWORD [rdi+2*SIZEOF_YMMWORD], ymmF
+    jmp         short .out0
+.out1:  ; --(unaligned)-----------------
+    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+    vmovdqu     YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
+    vmovdqu     YMMWORD [rdi+2*SIZEOF_YMMWORD], ymmF
+.out0:
+    add         rdi, byte RGB_PIXELSIZE*SIZEOF_YMMWORD  ; outptr
+    sub         rcx, byte SIZEOF_YMMWORD
+    jz          near .endcolumn
+
+    add         rsi, byte SIZEOF_YMMWORD  ; inptr0
+    dec         al                        ; Yctr
+    jnz         near .Yloop_2nd
+
+    add         rbx, byte SIZEOF_YMMWORD  ; inptr1
+    add         rdx, byte SIZEOF_YMMWORD  ; inptr2
+    jmp         near .columnloop
+
+.column_st64:
+    lea         rcx, [rcx+rcx*2]            ; imul ecx, RGB_PIXELSIZE
+    cmp         rcx, byte 2*SIZEOF_YMMWORD
+    jb          short .column_st32
+    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+    vmovdqu     YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
+    add         rdi, byte 2*SIZEOF_YMMWORD  ; outptr
+    vmovdqa     ymmA, ymmF
+    sub         rcx, byte 2*SIZEOF_YMMWORD
+    jmp         short .column_st31
+.column_st32:
+    cmp         rcx, byte SIZEOF_YMMWORD
+    jb          short .column_st31
+    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+    add         rdi, byte SIZEOF_YMMWORD    ; outptr
+    vmovdqa     ymmA, ymmD
+    sub         rcx, byte SIZEOF_YMMWORD
+    jmp         short .column_st31
+.column_st31:
+    cmp         rcx, byte SIZEOF_XMMWORD
+    jb          short .column_st15
+    vmovdqu     XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+    add         rdi, byte SIZEOF_XMMWORD    ; outptr
+    vperm2i128  ymmA, ymmA, ymmA, 1
+    sub         rcx, byte SIZEOF_XMMWORD
+.column_st15:
+    ; Store the lower 8 bytes of xmmA to the output when it has enough
+    ; space.
+    cmp         rcx, byte SIZEOF_MMWORD
+    jb          short .column_st7
+    vmovq       XMM_MMWORD [rdi], xmmA
+    add         rdi, byte SIZEOF_MMWORD
+    sub         rcx, byte SIZEOF_MMWORD
+    vpsrldq     xmmA, xmmA, SIZEOF_MMWORD
+.column_st7:
+    ; Store the lower 4 bytes of xmmA to the output when it has enough
+    ; space.
+    cmp         rcx, byte SIZEOF_DWORD
+    jb          short .column_st3
+    vmovd       XMM_DWORD [rdi], xmmA
+    add         rdi, byte SIZEOF_DWORD
+    sub         rcx, byte SIZEOF_DWORD
+    vpsrldq     xmmA, xmmA, SIZEOF_DWORD
+.column_st3:
+    ; Store the lower 2 bytes of rax to the output when it has enough
+    ; space.
+    vmovd       eax, xmmA
+    cmp         rcx, byte SIZEOF_WORD
+    jb          short .column_st1
+    mov         word [rdi], ax
+    add         rdi, byte SIZEOF_WORD
+    sub         rcx, byte SIZEOF_WORD
+    shr         rax, 16
+.column_st1:
+    ; Store the lower 1 byte of rax to the output when it has enough
+    ; space.
+    test        rcx, rcx
+    jz          short .endcolumn
+    mov         byte [rdi], al
+
+%else  ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+    vpcmpeqb    ymm6, ymm6, ymm6        ; ymm6=XE=X(02468ACE********GIKMOQSU********)
+    vpcmpeqb    ymm7, ymm7, ymm7        ; ymm7=XO=X(13579BDF********HJLNPRTV********)
+%else
+    vpxor       ymm6, ymm6, ymm6        ; ymm6=XE=X(02468ACE********GIKMOQSU********)
+    vpxor       ymm7, ymm7, ymm7        ; ymm7=XO=X(13579BDF********HJLNPRTV********)
+%endif
+    ; ymmA=(00 02 04 06 08 0A 0C 0E ** 0G 0I 0K 0M 0O 0Q 0S 0U **)
+    ; ymmB=(01 03 05 07 09 0B 0D 0F ** 0H 0J 0L 0N 0P 0R 0T 0V **)
+    ; ymmC=(10 12 14 16 18 1A 1C 1E ** 1G 1I 1K 1M 1O 1Q 1S 1U **)
+    ; ymmD=(11 13 15 17 19 1B 1D 1F ** 1H 1J 1L 1N 1P 1R 1T 1V **)
+    ; ymmE=(20 22 24 26 28 2A 2C 2E ** 2G 2I 2K 2M 2O 2Q 2S 2U **)
+    ; ymmF=(21 23 25 27 29 2B 2D 2F ** 2H 2J 2L 2N 2P 2R 2T 2V **)
+    ; ymmG=(30 32 34 36 38 3A 3C 3E ** 3G 3I 3K 3M 3O 3Q 3S 3U **)
+    ; ymmH=(31 33 35 37 39 3B 3D 3F ** 3H 3J 3L 3N 3P 3R 3T 3V **)
+
+    vpunpcklbw  ymmA, ymmA, ymmC        ; ymmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E
+                                        ;       0G 1G 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U)
+    vpunpcklbw  ymmE, ymmE, ymmG        ; ymmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E
+                                        ;       2G 3G 2I 3I 2K 3K 2M 3M 2O 3O 2Q 3Q 2S 3S 2U 3U)
+    vpunpcklbw  ymmB, ymmB, ymmD        ; ymmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F
+                                        ;       0H 1H 0J 1J 0L 1L 0N 1N 0P 1P 0R 1R 0T 1T 0V 1V)
+    vpunpcklbw  ymmF, ymmF, ymmH        ; ymmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F
+                                        ;       2H 3H 2J 3J 2L 3L 2N 3N 2P 3P 2R 3R 2T 3T 2V 3V)
+
+    vpunpckhwd  ymmC, ymmA, ymmE        ; ymmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E
+                                        ;       0O 1O 2O 3O 0Q 1Q 2Q 3Q 0S 1S 2S 3S 0U 1U 2U 3U)
+    vpunpcklwd  ymmA, ymmA, ymmE        ; ymmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36
+                                        ;       0G 1G 2G 3G 0I 1I 2I 3I 0K 1K 2K 3K 0M 1M 2M 3M)
+    vpunpckhwd  ymmG, ymmB, ymmF        ; ymmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F
+                                        ;       0P 1P 2P 3P 0R 1R 2R 3R 0T 1T 2T 3T 0V 1V 2V 3V)
+    vpunpcklwd  ymmB, ymmB, ymmF        ; ymmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37
+                                        ;       0H 1H 2H 3H 0J 1J 2J 3J 0L 1L 2L 3L 0N 1N 2N 3N)
+
+    vpunpckhdq  ymmE, ymmA, ymmB        ; ymmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+                                        ;       0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+    vpunpckldq  ymmB, ymmA, ymmB        ; ymmB=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+                                        ;       0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J)
+    vpunpckhdq  ymmF, ymmC, ymmG        ; ymmF=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F
+                                        ;       0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+    vpunpckldq  ymmG, ymmC, ymmG        ; ymmG=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+                                        ;       0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R)
+
+    vperm2i128  ymmA, ymmB, ymmE, 0x20  ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+                                        ;       04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+    vperm2i128  ymmD, ymmG, ymmF, 0x20  ; ymmD=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+                                        ;       0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+    vperm2i128  ymmC, ymmB, ymmE, 0x31  ; ymmC=(0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J
+                                        ;       0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+    vperm2i128  ymmH, ymmG, ymmF, 0x31  ; ymmH=(0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R
+                                        ;       0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+
+    cmp         rcx, byte SIZEOF_YMMWORD
+    jb          short .column_st64
+
+    test        rdi, SIZEOF_YMMWORD-1
+    jnz         short .out1
+    ; --(aligned)-------------------
+    vmovntdq    YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+    vmovntdq    YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
+    vmovntdq    YMMWORD [rdi+2*SIZEOF_YMMWORD], ymmC
+    vmovntdq    YMMWORD [rdi+3*SIZEOF_YMMWORD], ymmH
+    jmp         short .out0
+.out1:  ; --(unaligned)-----------------
+    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+    vmovdqu     YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
+    vmovdqu     YMMWORD [rdi+2*SIZEOF_YMMWORD], ymmC
+    vmovdqu     YMMWORD [rdi+3*SIZEOF_YMMWORD], ymmH
+.out0:
+    add         rdi, RGB_PIXELSIZE*SIZEOF_YMMWORD  ; outptr
+    sub         rcx, byte SIZEOF_YMMWORD
+    jz          near .endcolumn
+
+    add         rsi, byte SIZEOF_YMMWORD  ; inptr0
+    dec         al
+    jnz         near .Yloop_2nd
+
+    add         rbx, byte SIZEOF_YMMWORD  ; inptr1
+    add         rdx, byte SIZEOF_YMMWORD  ; inptr2
+    jmp         near .columnloop
+
+.column_st64:
+    cmp         rcx, byte SIZEOF_YMMWORD/2
+    jb          short .column_st32
+    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+    vmovdqu     YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
+    add         rdi, byte 2*SIZEOF_YMMWORD  ; outptr
+    vmovdqa     ymmA, ymmC
+    vmovdqa     ymmD, ymmH
+    sub         rcx, byte SIZEOF_YMMWORD/2
+.column_st32:
+    cmp         rcx, byte SIZEOF_YMMWORD/4
+    jb          short .column_st16
+    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+    add         rdi, byte SIZEOF_YMMWORD    ; outptr
+    vmovdqa     ymmA, ymmD
+    sub         rcx, byte SIZEOF_YMMWORD/4
+.column_st16:
+    cmp         rcx, byte SIZEOF_YMMWORD/8
+    jb          short .column_st15
+    vmovdqu     XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+    add         rdi, byte SIZEOF_XMMWORD    ; outptr
+    vperm2i128  ymmA, ymmA, ymmA, 1
+    sub         rcx, byte SIZEOF_YMMWORD/8
+.column_st15:
+    ; Store two pixels (8 bytes) of ymmA to the output when it has enough
+    ; space.
+    cmp         rcx, byte SIZEOF_YMMWORD/16
+    jb          short .column_st7
+    vmovq       MMWORD [rdi], xmmA
+    add         rdi, byte SIZEOF_YMMWORD/16*4
+    sub         rcx, byte SIZEOF_YMMWORD/16
+    vpsrldq     xmmA, SIZEOF_YMMWORD/16*4
+.column_st7:
+    ; Store one pixel (4 bytes) of ymmA to the output when it has enough
+    ; space.
+    test        rcx, rcx
+    jz          short .endcolumn
+    vmovd       XMM_DWORD [rdi], xmmA
+
+%endif  ; RGB_PIXELSIZE ; ---------------
+
+.endcolumn:
+    sfence                              ; flush the write buffer
+
+.return:
+    pop         rbx
+    vzeroupper
+    uncollect_args 4
+    mov         rsp, rbp                ; rsp <- aligned rbp
+    pop         rsp                     ; rsp <- original rbp
+    pop         rbp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
+;
+; GLOBAL(void)
+; jsimd_h2v2_merged_upsample_avx2(JDIMENSION output_width,
+;                                 JSAMPIMAGE input_buf,
+;                                 JDIMENSION in_row_group_ctr,
+;                                 JSAMPARRAY output_buf);
+;
+
+; r10d = JDIMENSION output_width
+; r11 = JSAMPIMAGE input_buf
+; r12d = JDIMENSION in_row_group_ctr
+; r13 = JSAMPARRAY output_buf
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v2_merged_upsample_avx2)
+
+EXTN(jsimd_h2v2_merged_upsample_avx2):
+    push        rbp
+    mov         rax, rsp
+    mov         rbp, rsp
+    collect_args 4
+    push        rbx
+
+    mov         eax, r10d
+
+    mov         rdi, r11
+    mov         ecx, r12d
+    mov         rsip, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
+    mov         rbxp, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
+    mov         rdxp, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
+    mov         rdi, r13
+    lea         rsi, [rsi+rcx*SIZEOF_JSAMPROW]
+
+    sub         rsp, SIZEOF_JSAMPARRAY*4
+    mov         JSAMPARRAY [rsp+0*SIZEOF_JSAMPARRAY], rsip  ; intpr00
+    mov         JSAMPARRAY [rsp+1*SIZEOF_JSAMPARRAY], rbxp  ; intpr1
+    mov         JSAMPARRAY [rsp+2*SIZEOF_JSAMPARRAY], rdxp  ; intpr2
+    mov         rbx, rsp
+
+    push        rdi
+    push        rcx
+    push        rax
+
+    %ifdef WIN64
+    mov         r8, rcx
+    mov         r9, rdi
+    mov         rcx, rax
+    mov         rdx, rbx
+    %else
+    mov         rdx, rcx
+    mov         rcx, rdi
+    mov         rdi, rax
+    mov         rsi, rbx
+    %endif
+
+    call        EXTN(jsimd_h2v1_merged_upsample_avx2)
+
+    pop         rax
+    pop         rcx
+    pop         rdi
+    mov         rsip, JSAMPARRAY [rsp+0*SIZEOF_JSAMPARRAY]
+    mov         rbxp, JSAMPARRAY [rsp+1*SIZEOF_JSAMPARRAY]
+    mov         rdxp, JSAMPARRAY [rsp+2*SIZEOF_JSAMPARRAY]
+
+    add         rdi, byte SIZEOF_JSAMPROW  ; outptr1
+    add         rsi, byte SIZEOF_JSAMPROW  ; inptr01
+
+    mov         JSAMPARRAY [rsp+0*SIZEOF_JSAMPARRAY], rsip  ; intpr00
+    mov         JSAMPARRAY [rsp+1*SIZEOF_JSAMPARRAY], rbxp  ; intpr1
+    mov         JSAMPARRAY [rsp+2*SIZEOF_JSAMPARRAY], rdxp  ; intpr2
+    mov         rbx, rsp
+
+    push        rdi
+    push        rcx
+    push        rax
+
+    %ifdef WIN64
+    mov         r8, rcx
+    mov         r9, rdi
+    mov         rcx, rax
+    mov         rdx, rbx
+    %else
+    mov         rdx, rcx
+    mov         rcx, rdi
+    mov         rdi, rax
+    mov         rsi, rbx
+    %endif
+
+    call        EXTN(jsimd_h2v1_merged_upsample_avx2)
+
+    pop         rax
+    pop         rcx
+    pop         rdi
+    mov         rsip, JSAMPARRAY [rsp+0*SIZEOF_JSAMPARRAY]
+    mov         rbxp, JSAMPARRAY [rsp+1*SIZEOF_JSAMPARRAY]
+    mov         rdxp, JSAMPARRAY [rsp+2*SIZEOF_JSAMPARRAY]
+    add         rsp, SIZEOF_JSAMPARRAY*4
+
+    pop         rbx
+    uncollect_args 4
+    pop         rbp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/x86_64/jdmrgext-sse2.asm b/media/libjpeg/simd/x86_64/jdmrgext-sse2.asm
new file mode 100644
index 0000000000..eb3ab9dbd9
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jdmrgext-sse2.asm
@@ -0,0 +1,538 @@
+;
+; jdmrgext.asm - merged upsampling/color conversion (64-bit SSE2)
+;
+; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2012, 2016, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
+;
+; GLOBAL(void)
+; jsimd_h2v1_merged_upsample_sse2(JDIMENSION output_width,
+;                                 JSAMPIMAGE input_buf,
+;                                 JDIMENSION in_row_group_ctr,
+;                                 JSAMPARRAY output_buf);
+;
+
+; r10d = JDIMENSION output_width
+; r11 = JSAMPIMAGE input_buf
+; r12d = JDIMENSION in_row_group_ctr
+; r13 = JSAMPARRAY output_buf
+
+%define wk(i)   rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD  ; xmmword wk[WK_NUM]
+%define WK_NUM  3
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v1_merged_upsample_sse2)
+
+EXTN(jsimd_h2v1_merged_upsample_sse2):
+    push        rbp
+    mov         rax, rsp                     ; rax = original rbp
+    sub         rsp, byte 4
+    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [rsp], rax
+    mov         rbp, rsp                     ; rbp = aligned rbp
+    lea         rsp, [wk(0)]
+    collect_args 4
+    push        rbx
+
+    mov         ecx, r10d               ; col
+    test        rcx, rcx
+    jz          near .return
+
+    push        rcx
+
+    mov         rdi, r11
+    mov         ecx, r12d
+    mov         rsip, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
+    mov         rbxp, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
+    mov         rdxp, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
+    mov         rdi, r13
+    mov         rsip, JSAMPROW [rsi+rcx*SIZEOF_JSAMPROW]  ; inptr0
+    mov         rbxp, JSAMPROW [rbx+rcx*SIZEOF_JSAMPROW]  ; inptr1
+    mov         rdxp, JSAMPROW [rdx+rcx*SIZEOF_JSAMPROW]  ; inptr2
+    mov         rdip, JSAMPROW [rdi]                      ; outptr
+
+    pop         rcx                     ; col
+
+.columnloop:
+
+    movdqa      xmm6, XMMWORD [rbx]     ; xmm6=Cb(0123456789ABCDEF)
+    movdqa      xmm7, XMMWORD [rdx]     ; xmm7=Cr(0123456789ABCDEF)
+
+    pxor        xmm1, xmm1              ; xmm1=(all 0's)
+    pcmpeqw     xmm3, xmm3
+    psllw       xmm3, 7                 ; xmm3={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+
+    movdqa      xmm4, xmm6
+    punpckhbw   xmm6, xmm1              ; xmm6=Cb(89ABCDEF)=CbH
+    punpcklbw   xmm4, xmm1              ; xmm4=Cb(01234567)=CbL
+    movdqa      xmm0, xmm7
+    punpckhbw   xmm7, xmm1              ; xmm7=Cr(89ABCDEF)=CrH
+    punpcklbw   xmm0, xmm1              ; xmm0=Cr(01234567)=CrL
+
+    paddw       xmm6, xmm3
+    paddw       xmm4, xmm3
+    paddw       xmm7, xmm3
+    paddw       xmm0, xmm3
+
+    ; (Original)
+    ; R = Y                + 1.40200 * Cr
+    ; G = Y - 0.34414 * Cb - 0.71414 * Cr
+    ; B = Y + 1.77200 * Cb
+    ;
+    ; (This implementation)
+    ; R = Y                + 0.40200 * Cr + Cr
+    ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+    ; B = Y - 0.22800 * Cb + Cb + Cb
+
+    movdqa      xmm5, xmm6              ; xmm5=CbH
+    movdqa      xmm2, xmm4              ; xmm2=CbL
+    paddw       xmm6, xmm6              ; xmm6=2*CbH
+    paddw       xmm4, xmm4              ; xmm4=2*CbL
+    movdqa      xmm1, xmm7              ; xmm1=CrH
+    movdqa      xmm3, xmm0              ; xmm3=CrL
+    paddw       xmm7, xmm7              ; xmm7=2*CrH
+    paddw       xmm0, xmm0              ; xmm0=2*CrL
+
+    pmulhw      xmm6, [rel PW_MF0228]   ; xmm6=(2*CbH * -FIX(0.22800))
+    pmulhw      xmm4, [rel PW_MF0228]   ; xmm4=(2*CbL * -FIX(0.22800))
+    pmulhw      xmm7, [rel PW_F0402]    ; xmm7=(2*CrH * FIX(0.40200))
+    pmulhw      xmm0, [rel PW_F0402]    ; xmm0=(2*CrL * FIX(0.40200))
+
+    paddw       xmm6, [rel PW_ONE]
+    paddw       xmm4, [rel PW_ONE]
+    psraw       xmm6, 1                 ; xmm6=(CbH * -FIX(0.22800))
+    psraw       xmm4, 1                 ; xmm4=(CbL * -FIX(0.22800))
+    paddw       xmm7, [rel PW_ONE]
+    paddw       xmm0, [rel PW_ONE]
+    psraw       xmm7, 1                 ; xmm7=(CrH * FIX(0.40200))
+    psraw       xmm0, 1                 ; xmm0=(CrL * FIX(0.40200))
+
+    paddw       xmm6, xmm5
+    paddw       xmm4, xmm2
+    paddw       xmm6, xmm5              ; xmm6=(CbH * FIX(1.77200))=(B-Y)H
+    paddw       xmm4, xmm2              ; xmm4=(CbL * FIX(1.77200))=(B-Y)L
+    paddw       xmm7, xmm1              ; xmm7=(CrH * FIX(1.40200))=(R-Y)H
+    paddw       xmm0, xmm3              ; xmm0=(CrL * FIX(1.40200))=(R-Y)L
+
+    movdqa      XMMWORD [wk(0)], xmm6   ; wk(0)=(B-Y)H
+    movdqa      XMMWORD [wk(1)], xmm7   ; wk(1)=(R-Y)H
+
+    movdqa      xmm6, xmm5
+    movdqa      xmm7, xmm2
+    punpcklwd   xmm5, xmm1
+    punpckhwd   xmm6, xmm1
+    pmaddwd     xmm5, [rel PW_MF0344_F0285]
+    pmaddwd     xmm6, [rel PW_MF0344_F0285]
+    punpcklwd   xmm2, xmm3
+    punpckhwd   xmm7, xmm3
+    pmaddwd     xmm2, [rel PW_MF0344_F0285]
+    pmaddwd     xmm7, [rel PW_MF0344_F0285]
+
+    paddd       xmm5, [rel PD_ONEHALF]
+    paddd       xmm6, [rel PD_ONEHALF]
+    psrad       xmm5, SCALEBITS
+    psrad       xmm6, SCALEBITS
+    paddd       xmm2, [rel PD_ONEHALF]
+    paddd       xmm7, [rel PD_ONEHALF]
+    psrad       xmm2, SCALEBITS
+    psrad       xmm7, SCALEBITS
+
+    packssdw    xmm5, xmm6              ; xmm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
+    packssdw    xmm2, xmm7              ; xmm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
+    psubw       xmm5, xmm1              ; xmm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
+    psubw       xmm2, xmm3              ; xmm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L
+
+    movdqa      XMMWORD [wk(2)], xmm5   ; wk(2)=(G-Y)H
+
+    mov         al, 2                   ; Yctr
+    jmp         short .Yloop_1st
+
+.Yloop_2nd:
+    movdqa      xmm0, XMMWORD [wk(1)]   ; xmm0=(R-Y)H
+    movdqa      xmm2, XMMWORD [wk(2)]   ; xmm2=(G-Y)H
+    movdqa      xmm4, XMMWORD [wk(0)]   ; xmm4=(B-Y)H
+
+.Yloop_1st:
+    movdqa      xmm7, XMMWORD [rsi]     ; xmm7=Y(0123456789ABCDEF)
+
+    pcmpeqw     xmm6, xmm6
+    psrlw       xmm6, BYTE_BIT          ; xmm6={0xFF 0x00 0xFF 0x00 ..}
+    pand        xmm6, xmm7              ; xmm6=Y(02468ACE)=YE
+    psrlw       xmm7, BYTE_BIT          ; xmm7=Y(13579BDF)=YO
+
+    movdqa      xmm1, xmm0              ; xmm1=xmm0=(R-Y)(L/H)
+    movdqa      xmm3, xmm2              ; xmm3=xmm2=(G-Y)(L/H)
+    movdqa      xmm5, xmm4              ; xmm5=xmm4=(B-Y)(L/H)
+
+    paddw       xmm0, xmm6              ; xmm0=((R-Y)+YE)=RE=R(02468ACE)
+    paddw       xmm1, xmm7              ; xmm1=((R-Y)+YO)=RO=R(13579BDF)
+    packuswb    xmm0, xmm0              ; xmm0=R(02468ACE********)
+    packuswb    xmm1, xmm1              ; xmm1=R(13579BDF********)
+
+    paddw       xmm2, xmm6              ; xmm2=((G-Y)+YE)=GE=G(02468ACE)
+    paddw       xmm3, xmm7              ; xmm3=((G-Y)+YO)=GO=G(13579BDF)
+    packuswb    xmm2, xmm2              ; xmm2=G(02468ACE********)
+    packuswb    xmm3, xmm3              ; xmm3=G(13579BDF********)
+
+    paddw       xmm4, xmm6              ; xmm4=((B-Y)+YE)=BE=B(02468ACE)
+    paddw       xmm5, xmm7              ; xmm5=((B-Y)+YO)=BO=B(13579BDF)
+    packuswb    xmm4, xmm4              ; xmm4=B(02468ACE********)
+    packuswb    xmm5, xmm5              ; xmm5=B(13579BDF********)
+
+%if RGB_PIXELSIZE == 3  ; ---------------
+
+    ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+    ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+    ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+    ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
+
+    punpcklbw   xmmA, xmmC        ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+    punpcklbw   xmmE, xmmB        ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
+    punpcklbw   xmmD, xmmF        ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
+
+    movdqa      xmmG, xmmA
+    movdqa      xmmH, xmmA
+    punpcklwd   xmmA, xmmE        ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
+    punpckhwd   xmmG, xmmE        ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
+
+    psrldq      xmmH, 2           ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
+    psrldq      xmmE, 2           ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
+
+    movdqa      xmmC, xmmD
+    movdqa      xmmB, xmmD
+    punpcklwd   xmmD, xmmH        ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
+    punpckhwd   xmmC, xmmH        ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
+
+    psrldq      xmmB, 2           ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
+
+    movdqa      xmmF, xmmE
+    punpcklwd   xmmE, xmmB        ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
+    punpckhwd   xmmF, xmmB        ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
+
+    pshufd      xmmH, xmmA, 0x4E  ; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
+    movdqa      xmmB, xmmE
+    punpckldq   xmmA, xmmD        ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
+    punpckldq   xmmE, xmmH        ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
+    punpckhdq   xmmD, xmmB        ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
+
+    pshufd      xmmH, xmmG, 0x4E  ; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
+    movdqa      xmmB, xmmF
+    punpckldq   xmmG, xmmC        ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
+    punpckldq   xmmF, xmmH        ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
+    punpckhdq   xmmC, xmmB        ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
+
+    punpcklqdq  xmmA, xmmE        ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
+    punpcklqdq  xmmD, xmmG        ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+    punpcklqdq  xmmF, xmmC        ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
+
+    cmp         rcx, byte SIZEOF_XMMWORD
+    jb          short .column_st32
+
+    test        rdi, SIZEOF_XMMWORD-1
+    jnz         short .out1
+    ; --(aligned)-------------------
+    movntdq     XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+    movntdq     XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+    movntdq     XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
+    jmp         short .out0
+.out1:  ; --(unaligned)-----------------
+    movdqu      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+    movdqu      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+    movdqu      XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
+.out0:
+    add         rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
+    sub         rcx, byte SIZEOF_XMMWORD
+    jz          near .endcolumn
+
+    add         rsi, byte SIZEOF_XMMWORD  ; inptr0
+    dec         al                        ; Yctr
+    jnz         near .Yloop_2nd
+
+    add         rbx, byte SIZEOF_XMMWORD  ; inptr1
+    add         rdx, byte SIZEOF_XMMWORD  ; inptr2
+    jmp         near .columnloop
+
+.column_st32:
+    lea         rcx, [rcx+rcx*2]            ; imul ecx, RGB_PIXELSIZE
+    cmp         rcx, byte 2*SIZEOF_XMMWORD
+    jb          short .column_st16
+    movdqu      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+    movdqu      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+    add         rdi, byte 2*SIZEOF_XMMWORD  ; outptr
+    movdqa      xmmA, xmmF
+    sub         rcx, byte 2*SIZEOF_XMMWORD
+    jmp         short .column_st15
+.column_st16:
+    cmp         rcx, byte SIZEOF_XMMWORD
+    jb          short .column_st15
+    movdqu      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+    add         rdi, byte SIZEOF_XMMWORD    ; outptr
+    movdqa      xmmA, xmmD
+    sub         rcx, byte SIZEOF_XMMWORD
+.column_st15:
+    ; Store the lower 8 bytes of xmmA to the output when it has enough
+    ; space.
+    cmp         rcx, byte SIZEOF_MMWORD
+    jb          short .column_st7
+    movq        XMM_MMWORD [rdi], xmmA
+    add         rdi, byte SIZEOF_MMWORD
+    sub         rcx, byte SIZEOF_MMWORD
+    psrldq      xmmA, SIZEOF_MMWORD
+.column_st7:
+    ; Store the lower 4 bytes of xmmA to the output when it has enough
+    ; space.
+    cmp         rcx, byte SIZEOF_DWORD
+    jb          short .column_st3
+    movd        XMM_DWORD [rdi], xmmA
+    add         rdi, byte SIZEOF_DWORD
+    sub         rcx, byte SIZEOF_DWORD
+    psrldq      xmmA, SIZEOF_DWORD
+.column_st3:
+    ; Store the lower 2 bytes of rax to the output when it has enough
+    ; space.
+    movd        eax, xmmA
+    cmp         rcx, byte SIZEOF_WORD
+    jb          short .column_st1
+    mov         word [rdi], ax
+    add         rdi, byte SIZEOF_WORD
+    sub         rcx, byte SIZEOF_WORD
+    shr         rax, 16
+.column_st1:
+    ; Store the lower 1 byte of rax to the output when it has enough
+    ; space.
+    test        rcx, rcx
+    jz          short .endcolumn
+    mov         byte [rdi], al
+
+%else  ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+    pcmpeqb     xmm6, xmm6              ; xmm6=XE=X(02468ACE********)
+    pcmpeqb     xmm7, xmm7              ; xmm7=XO=X(13579BDF********)
+%else
+    pxor        xmm6, xmm6              ; xmm6=XE=X(02468ACE********)
+    pxor        xmm7, xmm7              ; xmm7=XO=X(13579BDF********)
+%endif
+    ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+    ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+    ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+    ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
+
+    punpcklbw   xmmA, xmmC  ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+    punpcklbw   xmmE, xmmG  ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
+    punpcklbw   xmmB, xmmD  ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
+    punpcklbw   xmmF, xmmH  ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
+
+    movdqa      xmmC, xmmA
+    punpcklwd   xmmA, xmmE  ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
+    punpckhwd   xmmC, xmmE  ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
+    movdqa      xmmG, xmmB
+    punpcklwd   xmmB, xmmF  ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
+    punpckhwd   xmmG, xmmF  ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
+
+    movdqa      xmmD, xmmA
+    punpckldq   xmmA, xmmB  ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
+    punpckhdq   xmmD, xmmB  ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+    movdqa      xmmH, xmmC
+    punpckldq   xmmC, xmmG  ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
+    punpckhdq   xmmH, xmmG  ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+
+    cmp         rcx, byte SIZEOF_XMMWORD
+    jb          short .column_st32
+
+    test        rdi, SIZEOF_XMMWORD-1
+    jnz         short .out1
+    ; --(aligned)-------------------
+    movntdq     XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+    movntdq     XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+    movntdq     XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
+    movntdq     XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
+    jmp         short .out0
+.out1:  ; --(unaligned)-----------------
+    movdqu      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+    movdqu      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+    movdqu      XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
+    movdqu      XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
+.out0:
+    add         rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
+    sub         rcx, byte SIZEOF_XMMWORD
+    jz          near .endcolumn
+
+    add         rsi, byte SIZEOF_XMMWORD  ; inptr0
+    dec         al                        ; Yctr
+    jnz         near .Yloop_2nd
+
+    add         rbx, byte SIZEOF_XMMWORD  ; inptr1
+    add         rdx, byte SIZEOF_XMMWORD  ; inptr2
+    jmp         near .columnloop
+
+.column_st32:
+    cmp         rcx, byte SIZEOF_XMMWORD/2
+    jb          short .column_st16
+    movdqu      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+    movdqu      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+    add         rdi, byte 2*SIZEOF_XMMWORD  ; outptr
+    movdqa      xmmA, xmmC
+    movdqa      xmmD, xmmH
+    sub         rcx, byte SIZEOF_XMMWORD/2
+.column_st16:
+    cmp         rcx, byte SIZEOF_XMMWORD/4
+    jb          short .column_st15
+    movdqu      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+    add         rdi, byte SIZEOF_XMMWORD    ; outptr
+    movdqa      xmmA, xmmD
+    sub         rcx, byte SIZEOF_XMMWORD/4
+.column_st15:
+    ; Store two pixels (8 bytes) of xmmA to the output when it has enough
+    ; space.
+    cmp         rcx, byte SIZEOF_XMMWORD/8
+    jb          short .column_st7
+    movq        XMM_MMWORD [rdi], xmmA
+    add         rdi, byte SIZEOF_XMMWORD/8*4
+    sub         rcx, byte SIZEOF_XMMWORD/8
+    psrldq      xmmA, SIZEOF_XMMWORD/8*4
+.column_st7:
+    ; Store one pixel (4 bytes) of xmmA to the output when it has enough
+    ; space.
+    test        rcx, rcx
+    jz          short .endcolumn
+    movd        XMM_DWORD [rdi], xmmA
+
+%endif  ; RGB_PIXELSIZE ; ---------------
+
+.endcolumn:
+    sfence                              ; flush the write buffer
+
+.return:
+    pop         rbx
+    uncollect_args 4
+    mov         rsp, rbp                ; rsp <- aligned rbp
+    pop         rsp                     ; rsp <- original rbp
+    pop         rbp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
+;
+; GLOBAL(void)
+; jsimd_h2v2_merged_upsample_sse2(JDIMENSION output_width,
+;                                 JSAMPIMAGE input_buf,
+;                                 JDIMENSION in_row_group_ctr,
+;                                 JSAMPARRAY output_buf);
+;
+
+; r10d = JDIMENSION output_width
+; r11 = JSAMPIMAGE input_buf
+; r12d = JDIMENSION in_row_group_ctr
+; r13 = JSAMPARRAY output_buf
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v2_merged_upsample_sse2)
+
+EXTN(jsimd_h2v2_merged_upsample_sse2):
+    push        rbp
+    mov         rax, rsp
+    mov         rbp, rsp
+    collect_args 4
+    push        rbx
+
+    mov         eax, r10d
+
+    mov         rdi, r11
+    mov         ecx, r12d
+    mov         rsip, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
+    mov         rbxp, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
+    mov         rdxp, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
+    mov         rdi, r13
+    lea         rsi, [rsi+rcx*SIZEOF_JSAMPROW]
+
+    sub         rsp, SIZEOF_JSAMPARRAY*4
+    mov         JSAMPARRAY [rsp+0*SIZEOF_JSAMPARRAY], rsip  ; intpr00
+    mov         JSAMPARRAY [rsp+1*SIZEOF_JSAMPARRAY], rbxp  ; intpr1
+    mov         JSAMPARRAY [rsp+2*SIZEOF_JSAMPARRAY], rdxp  ; intpr2
+    mov         rbx, rsp
+
+    push        rdi
+    push        rcx
+    push        rax
+
+    %ifdef WIN64
+    mov         r8, rcx
+    mov         r9, rdi
+    mov         rcx, rax
+    mov         rdx, rbx
+    %else
+    mov         rdx, rcx
+    mov         rcx, rdi
+    mov         rdi, rax
+    mov         rsi, rbx
+    %endif
+
+    call        EXTN(jsimd_h2v1_merged_upsample_sse2)
+
+    pop         rax
+    pop         rcx
+    pop         rdi
+    mov         rsip, JSAMPARRAY [rsp+0*SIZEOF_JSAMPARRAY]
+    mov         rbxp, JSAMPARRAY [rsp+1*SIZEOF_JSAMPARRAY]
+    mov         rdxp, JSAMPARRAY [rsp+2*SIZEOF_JSAMPARRAY]
+
+    add         rdi, byte SIZEOF_JSAMPROW  ; outptr1
+    add         rsi, byte SIZEOF_JSAMPROW  ; inptr01
+
+    mov         JSAMPARRAY [rsp+0*SIZEOF_JSAMPARRAY], rsip  ; intpr00
+    mov         JSAMPARRAY [rsp+1*SIZEOF_JSAMPARRAY], rbxp  ; intpr1
+    mov         JSAMPARRAY [rsp+2*SIZEOF_JSAMPARRAY], rdxp  ; intpr2
+    mov         rbx, rsp
+
+    push        rdi
+    push        rcx
+    push        rax
+
+    %ifdef WIN64
+    mov         r8, rcx
+    mov         r9, rdi
+    mov         rcx, rax
+    mov         rdx, rbx
+    %else
+    mov         rdx, rcx
+    mov         rcx, rdi
+    mov         rdi, rax
+    mov         rsi, rbx
+    %endif
+
+    call        EXTN(jsimd_h2v1_merged_upsample_sse2)
+
+    pop         rax
+    pop         rcx
+    pop         rdi
+    mov         rsip, JSAMPARRAY [rsp+0*SIZEOF_JSAMPARRAY]
+    mov         rbxp, JSAMPARRAY [rsp+1*SIZEOF_JSAMPARRAY]
+    mov         rdxp, JSAMPARRAY [rsp+2*SIZEOF_JSAMPARRAY]
+    add         rsp, SIZEOF_JSAMPARRAY*4
+
+    pop         rbx
+    uncollect_args 4
+    pop         rbp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/x86_64/jdsample-avx2.asm b/media/libjpeg/simd/x86_64/jdsample-avx2.asm
new file mode 100644
index 0000000000..1e4979f933
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jdsample-avx2.asm
@@ -0,0 +1,696 @@
+;
+; jdsample.asm - upsampling (64-bit AVX2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_fancy_upsample_avx2)
+
+EXTN(jconst_fancy_upsample_avx2):
+
+PW_ONE   times 16 dw 1
+PW_TWO   times 16 dw 2
+PW_THREE times 16 dw 3
+PW_SEVEN times 16 dw 7
+PW_EIGHT times 16 dw 8
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        64
+;
+; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
+;
+; The upsampling algorithm is linear interpolation between pixel centers,
+; also known as a "triangle filter".  This is a good compromise between
+; speed and visual quality.  The centers of the output pixels are 1/4 and 3/4
+; of the way between input pixel centers.
+;
+; GLOBAL(void)
+; jsimd_h2v1_fancy_upsample_avx2(int max_v_samp_factor,
+;                                JDIMENSION downsampled_width,
+;                                JSAMPARRAY input_data,
+;                                JSAMPARRAY *output_data_ptr);
+;
+
+; r10 = int max_v_samp_factor
+; r11d = JDIMENSION downsampled_width
+; r12 = JSAMPARRAY input_data
+; r13 = JSAMPARRAY *output_data_ptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v1_fancy_upsample_avx2)
+
+EXTN(jsimd_h2v1_fancy_upsample_avx2):
+    push        rbp
+    mov         rax, rsp
+    mov         rbp, rsp
+    push_xmm    3
+    collect_args 4
+
+    mov         eax, r11d               ; colctr
+    test        rax, rax
+    jz          near .return
+
+    mov         rcx, r10                ; rowctr
+    test        rcx, rcx
+    jz          near .return
+
+    mov         rsi, r12                ; input_data
+    mov         rdi, r13
+    mov         rdip, JSAMPARRAY [rdi]  ; output_data
+
+    vpxor       ymm0, ymm0, ymm0                 ; ymm0=(all 0's)
+    vpcmpeqb    xmm9, xmm9, xmm9
+    vpsrldq     xmm10, xmm9, (SIZEOF_XMMWORD-1)  ; (ff -- -- -- ... -- --) LSB is ff
+
+    vpslldq     xmm9, xmm9, (SIZEOF_XMMWORD-1)
+    vperm2i128  ymm9, ymm9, ymm9, 1              ; (---- ---- ... ---- ---- ff) MSB is ff
+
+.rowloop:
+    push        rax                     ; colctr
+    push        rdi
+    push        rsi
+
+    mov         rsip, JSAMPROW [rsi]    ; inptr
+    mov         rdip, JSAMPROW [rdi]    ; outptr
+
+    test        rax, SIZEOF_YMMWORD-1
+    jz          short .skip
+    mov         dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
+    mov         JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
+.skip:
+    vpand       ymm7, ymm10, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+
+    add         rax, byte SIZEOF_YMMWORD-1
+    and         rax, byte -SIZEOF_YMMWORD
+    cmp         rax, byte SIZEOF_YMMWORD
+    ja          short .columnloop
+
+.columnloop_last:
+    vpand       ymm6, ymm9, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+    jmp         short .upsample
+
+.columnloop:
+    vmovdqu     ymm6, YMMWORD [rsi+1*SIZEOF_YMMWORD]
+    vperm2i128  ymm6, ymm0, ymm6, 0x20
+    vpslldq     ymm6, ymm6, 15
+
+.upsample:
+    vmovdqu     ymm1, YMMWORD [rsi+0*SIZEOF_YMMWORD]  ; ymm1=( 0  1  2 ... 29 30 31)
+
+    vperm2i128  ymm2, ymm0, ymm1, 0x20
+    vpalignr    ymm2, ymm1, ymm2, 15            ; ymm2=(--  0  1 ... 28 29 30)
+    vperm2i128  ymm4, ymm0, ymm1, 0x03
+    vpalignr    ymm3, ymm4, ymm1, 1             ; ymm3=( 1  2  3 ... 30 31 --)
+
+    vpor        ymm2, ymm2, ymm7                ; ymm2=(-1  0  1 ... 28 29 30)
+    vpor        ymm3, ymm3, ymm6                ; ymm3=( 1  2  3 ... 30 31 32)
+
+    vpsrldq     ymm7, ymm4, (SIZEOF_XMMWORD-1)  ; ymm7=(31 -- -- ... -- -- --)
+
+    vpunpckhbw  ymm4, ymm1, ymm0                ; ymm4=( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
+    vpunpcklbw  ymm5, ymm1, ymm0                ; ymm5=( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
+    vperm2i128  ymm1, ymm5, ymm4, 0x20          ; ymm1=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
+    vperm2i128  ymm4, ymm5, ymm4, 0x31          ; ymm4=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+    vpunpckhbw  ymm5, ymm2, ymm0                ; ymm5=( 7  8  9 10 11 12 13 14 23 24 25 26 27 28 29 30)
+    vpunpcklbw  ymm6, ymm2, ymm0                ; ymm6=(-1  0  1  2  3  4  5  6 15 16 17 18 19 20 21 22)
+    vperm2i128  ymm2, ymm6, ymm5, 0x20          ; ymm2=(-1  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14)
+    vperm2i128  ymm5, ymm6, ymm5, 0x31          ; ymm5=(15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
+
+    vpunpckhbw  ymm6, ymm3, ymm0                ; ymm6=( 1  2  3  4  5  6  7  8 17 18 19 20 21 22 23 24)
+    vpunpcklbw  ymm8, ymm3, ymm0                ; ymm8=( 9 10 11 12 13 14 15 16 25 26 27 28 29 30 31 32)
+    vperm2i128  ymm3, ymm8, ymm6, 0x20          ; ymm3=( 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16)
+    vperm2i128  ymm6, ymm8, ymm6, 0x31          ; ymm6=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32)
+
+    vpmullw     ymm1, ymm1, [rel PW_THREE]
+    vpmullw     ymm4, ymm4, [rel PW_THREE]
+    vpaddw      ymm2, ymm2, [rel PW_ONE]
+    vpaddw      ymm5, ymm5, [rel PW_ONE]
+    vpaddw      ymm3, ymm3, [rel PW_TWO]
+    vpaddw      ymm6, ymm6, [rel PW_TWO]
+
+    vpaddw      ymm2, ymm2, ymm1
+    vpaddw      ymm5, ymm5, ymm4
+    vpsrlw      ymm2, ymm2, 2                   ; ymm2=OutLE=( 0  2  4  6  8 10 12 14 16 18 20 22 24 26 28 30)
+    vpsrlw      ymm5, ymm5, 2                   ; ymm5=OutHE=(32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62)
+    vpaddw      ymm3, ymm3, ymm1
+    vpaddw      ymm6, ymm6, ymm4
+    vpsrlw      ymm3, ymm3, 2                   ; ymm3=OutLO=( 1  3  5  7  9 11 13 15 17 19 21 23 25 27 29 31)
+    vpsrlw      ymm6, ymm6, 2                   ; ymm6=OutHO=(33 35 37 39 41 43 45 47 49 51 53 55 57 59 61 63)
+
+    vpsllw      ymm3, ymm3, BYTE_BIT
+    vpsllw      ymm6, ymm6, BYTE_BIT
+    vpor        ymm2, ymm2, ymm3                ; ymm2=OutL=( 0  1  2 ... 29 30 31)
+    vpor        ymm5, ymm5, ymm6                ; ymm5=OutH=(32 33 34 ... 61 62 63)
+
+    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm2
+    vmovdqu     YMMWORD [rdi+1*SIZEOF_YMMWORD], ymm5
+
+    sub         rax, byte SIZEOF_YMMWORD
+    add         rsi, byte 1*SIZEOF_YMMWORD  ; inptr
+    add         rdi, byte 2*SIZEOF_YMMWORD  ; outptr
+    cmp         rax, byte SIZEOF_YMMWORD
+    ja          near .columnloop
+    test        eax, eax
+    jnz         near .columnloop_last
+
+    pop         rsi
+    pop         rdi
+    pop         rax
+
+    add         rsi, byte SIZEOF_JSAMPROW  ; input_data
+    add         rdi, byte SIZEOF_JSAMPROW  ; output_data
+    dec         rcx                        ; rowctr
+    jg          near .rowloop
+
+.return:
+    vzeroupper
+    uncollect_args 4
+    pop_xmm     3
+    pop         rbp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
+; Again a triangle filter; see comments for h2v1 case, above.
+;
+; GLOBAL(void)
+; jsimd_h2v2_fancy_upsample_avx2(int max_v_samp_factor,
+;                                JDIMENSION downsampled_width,
+;                                JSAMPARRAY input_data,
+;                                JSAMPARRAY *output_data_ptr);
+;
+
+; r10 = int max_v_samp_factor
+; r11d = JDIMENSION downsampled_width
+; r12 = JSAMPARRAY input_data
+; r13 = JSAMPARRAY *output_data_ptr
+
+%define wk(i)   rbp - (WK_NUM - (i)) * SIZEOF_YMMWORD  ; ymmword wk[WK_NUM]
+%define WK_NUM  4
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v2_fancy_upsample_avx2)
+
+EXTN(jsimd_h2v2_fancy_upsample_avx2):
+    push        rbp
+    mov         rax, rsp                     ; rax = original rbp
+    sub         rsp, byte 4
+    and         rsp, byte (-SIZEOF_YMMWORD)  ; align to 256 bits
+    mov         [rsp], rax
+    mov         rbp, rsp                     ; rbp = aligned rbp
+    lea         rsp, [wk(0)]
+    push_xmm    3
+    collect_args 4
+    push        rbx
+
+    mov         eax, r11d               ; colctr
+    test        rax, rax
+    jz          near .return
+
+    mov         rcx, r10                ; rowctr
+    test        rcx, rcx
+    jz          near .return
+
+    mov         rsi, r12                ; input_data
+    mov         rdi, r13
+    mov         rdip, JSAMPARRAY [rdi]  ; output_data
+.rowloop:
+    push        rax                     ; colctr
+    push        rcx
+    push        rdi
+    push        rsi
+
+    mov         rcxp, JSAMPROW [rsi-1*SIZEOF_JSAMPROW]  ; inptr1(above)
+    mov         rbxp, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]  ; inptr0
+    mov         rsip, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]  ; inptr1(below)
+    mov         rdxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]  ; outptr0
+    mov         rdip, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]  ; outptr1
+
+    vpxor       ymm8, ymm8, ymm8                 ; ymm8=(all 0's)
+    vpcmpeqb    xmm9, xmm9, xmm9
+    vpsrldq     xmm10, xmm9, (SIZEOF_XMMWORD-2)  ; (ffff ---- ---- ... ---- ----) LSB is ffff
+    vpslldq     xmm9, xmm9, (SIZEOF_XMMWORD-2)
+    vperm2i128  ymm9, ymm9, ymm9, 1              ; (---- ---- ... ---- ---- ffff) MSB is ffff
+
+    test        rax, SIZEOF_YMMWORD-1
+    jz          short .skip
+    push        rdx
+    mov         dl, JSAMPLE [rcx+(rax-1)*SIZEOF_JSAMPLE]
+    mov         JSAMPLE [rcx+rax*SIZEOF_JSAMPLE], dl
+    mov         dl, JSAMPLE [rbx+(rax-1)*SIZEOF_JSAMPLE]
+    mov         JSAMPLE [rbx+rax*SIZEOF_JSAMPLE], dl
+    mov         dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
+    mov         JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
+    pop         rdx
+.skip:
+    ; -- process the first column block
+
+    vmovdqu     ymm0, YMMWORD [rbx+0*SIZEOF_YMMWORD]  ; ymm0=row[ 0][0]
+    vmovdqu     ymm1, YMMWORD [rcx+0*SIZEOF_YMMWORD]  ; ymm1=row[-1][0]
+    vmovdqu     ymm2, YMMWORD [rsi+0*SIZEOF_YMMWORD]  ; ymm2=row[+1][0]
+
+    vpunpckhbw  ymm4, ymm0, ymm8        ; ymm4=row[ 0]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
+    vpunpcklbw  ymm5, ymm0, ymm8        ; ymm5=row[ 0]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
+    vperm2i128  ymm0, ymm5, ymm4, 0x20  ; ymm0=row[ 0]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
+    vperm2i128  ymm4, ymm5, ymm4, 0x31  ; ymm4=row[ 0](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+    vpunpckhbw  ymm5, ymm1, ymm8        ; ymm5=row[-1]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
+    vpunpcklbw  ymm6, ymm1, ymm8        ; ymm6=row[-1]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
+    vperm2i128  ymm1, ymm6, ymm5, 0x20  ; ymm1=row[-1]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
+    vperm2i128  ymm5, ymm6, ymm5, 0x31  ; ymm5=row[-1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+    vpunpckhbw  ymm6, ymm2, ymm8        ; ymm6=row[+1]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
+    vpunpcklbw  ymm3, ymm2, ymm8        ; ymm3=row[+1]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
+    vperm2i128  ymm2, ymm3, ymm6, 0x20  ; ymm2=row[+1]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
+    vperm2i128  ymm6, ymm3, ymm6, 0x31  ; ymm6=row[+1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+    vpmullw     ymm0, ymm0, [rel PW_THREE]
+    vpmullw     ymm4, ymm4, [rel PW_THREE]
+
+    vpaddw      ymm1, ymm1, ymm0        ; ymm1=Int0L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
+    vpaddw      ymm5, ymm5, ymm4        ; ymm5=Int0H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+    vpaddw      ymm2, ymm2, ymm0        ; ymm2=Int1L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
+    vpaddw      ymm6, ymm6, ymm4        ; ymm6=Int1H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+    vmovdqu     YMMWORD [rdx+0*SIZEOF_YMMWORD], ymm1  ; temporarily save
+    vmovdqu     YMMWORD [rdx+1*SIZEOF_YMMWORD], ymm5  ; the intermediate data
+    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm2
+    vmovdqu     YMMWORD [rdi+1*SIZEOF_YMMWORD], ymm6
+
+    vpand       ymm1, ymm1, ymm10       ; ymm1=( 0 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
+    vpand       ymm2, ymm2, ymm10       ; ymm2=( 0 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
+
+    vmovdqa     YMMWORD [wk(0)], ymm1
+    vmovdqa     YMMWORD [wk(1)], ymm2
+
+    add         rax, byte SIZEOF_YMMWORD-1
+    and         rax, byte -SIZEOF_YMMWORD
+    cmp         rax, byte SIZEOF_YMMWORD
+    ja          short .columnloop
+
+.columnloop_last:
+    ; -- process the last column block
+
+    vpand       ymm1, ymm9, YMMWORD [rdx+1*SIZEOF_YMMWORD]
+    vpand       ymm2, ymm9, YMMWORD [rdi+1*SIZEOF_YMMWORD]
+
+    vmovdqa     YMMWORD [wk(2)], ymm1   ; ymm1=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 31)
+    vmovdqa     YMMWORD [wk(3)], ymm2   ; ymm2=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 31)
+
+    jmp         near .upsample
+
+.columnloop:
+    ; -- process the next column block
+
+    vmovdqu     ymm0, YMMWORD [rbx+1*SIZEOF_YMMWORD]  ; ymm0=row[ 0][1]
+    vmovdqu     ymm1, YMMWORD [rcx+1*SIZEOF_YMMWORD]  ; ymm1=row[-1][1]
+    vmovdqu     ymm2, YMMWORD [rsi+1*SIZEOF_YMMWORD]  ; ymm2=row[+1][1]
+
+    vpunpckhbw  ymm4, ymm0, ymm8        ; ymm4=row[ 0]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
+    vpunpcklbw  ymm5, ymm0, ymm8        ; ymm5=row[ 0]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
+    vperm2i128  ymm0, ymm5, ymm4, 0x20  ; ymm0=row[ 0]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
+    vperm2i128  ymm4, ymm5, ymm4, 0x31  ; ymm4=row[ 0](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+    vpunpckhbw  ymm5, ymm1, ymm8        ; ymm5=row[-1]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
+    vpunpcklbw  ymm6, ymm1, ymm8        ; ymm6=row[-1]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
+    vperm2i128  ymm1, ymm6, ymm5, 0x20  ; ymm1=row[-1]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
+    vperm2i128  ymm5, ymm6, ymm5, 0x31  ; ymm5=row[-1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+    vpunpckhbw  ymm6, ymm2, ymm8        ; ymm6=row[+1]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
+    vpunpcklbw  ymm7, ymm2, ymm8        ; ymm7=row[+1]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
+    vperm2i128  ymm2, ymm7, ymm6, 0x20  ; ymm2=row[+1]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
+    vperm2i128  ymm6, ymm7, ymm6, 0x31  ; ymm6=row[+1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+    vpmullw     ymm0, ymm0, [rel PW_THREE]
+    vpmullw     ymm4, ymm4, [rel PW_THREE]
+
+    vpaddw      ymm1, ymm1, ymm0        ; ymm1=Int0L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
+    vpaddw      ymm5, ymm5, ymm4        ; ymm5=Int0H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+    vpaddw      ymm2, ymm2, ymm0        ; ymm2=Int1L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
+    vpaddw      ymm6, ymm6, ymm4        ; ymm6=Int1H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+    vmovdqu     YMMWORD [rdx+2*SIZEOF_YMMWORD], ymm1  ; temporarily save
+    vmovdqu     YMMWORD [rdx+3*SIZEOF_YMMWORD], ymm5  ; the intermediate data
+    vmovdqu     YMMWORD [rdi+2*SIZEOF_YMMWORD], ymm2
+    vmovdqu     YMMWORD [rdi+3*SIZEOF_YMMWORD], ymm6
+
+    vperm2i128  ymm1, ymm8, ymm1, 0x20
+    vpslldq     ymm1, ymm1, 14          ; ymm1=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- --  0)
+    vperm2i128  ymm2, ymm8, ymm2, 0x20
+    vpslldq     ymm2, ymm2, 14          ; ymm2=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- --  0)
+
+    vmovdqa     YMMWORD [wk(2)], ymm1
+    vmovdqa     YMMWORD [wk(3)], ymm2
+
+.upsample:
+    ; -- process the upper row
+
+    vmovdqu     ymm7, YMMWORD [rdx+0*SIZEOF_YMMWORD]  ; ymm7=Int0L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
+    vmovdqu     ymm3, YMMWORD [rdx+1*SIZEOF_YMMWORD]  ; ymm3=Int0H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+    vperm2i128  ymm0, ymm8, ymm7, 0x03
+    vpalignr    ymm0, ymm0, ymm7, 2     ; ymm0=( 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 --)
+    vperm2i128  ymm4, ymm8, ymm3, 0x20
+    vpslldq     ymm4, ymm4, 14          ; ymm4=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 16)
+
+    vperm2i128  ymm5, ymm8, ymm7, 0x03
+    vpsrldq     ymm5, ymm5, 14          ; ymm5=(15 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
+    vperm2i128  ymm6, ymm8, ymm3, 0x20
+    vpalignr    ymm6, ymm3, ymm6, 14    ; ymm6=(-- 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
+
+    vpor        ymm0, ymm0, ymm4        ; ymm0=( 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16)
+    vpor        ymm5, ymm5, ymm6        ; ymm5=(15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
+
+    vperm2i128  ymm2, ymm8, ymm3, 0x03
+    vpalignr    ymm2, ymm2, ymm3, 2     ; ymm2=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 --)
+    vperm2i128  ymm4, ymm8, ymm3, 0x03
+    vpsrldq     ymm4, ymm4, 14          ; ymm4=(31 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
+    vperm2i128  ymm1, ymm8, ymm7, 0x20
+    vpalignr    ymm1, ymm7, ymm1, 14    ; ymm1=(--  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14)
+
+    vpor        ymm1, ymm1, YMMWORD [wk(0)]  ; ymm1=(-1  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14)
+    vpor        ymm2, ymm2, YMMWORD [wk(2)]  ; ymm2=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32)
+
+    vmovdqa     YMMWORD [wk(0)], ymm4
+
+    vpmullw     ymm7, ymm7, [rel PW_THREE]
+    vpmullw     ymm3, ymm3, [rel PW_THREE]
+    vpaddw      ymm1, ymm1, [rel PW_EIGHT]
+    vpaddw      ymm5, ymm5, [rel PW_EIGHT]
+    vpaddw      ymm0, ymm0, [rel PW_SEVEN]
+    vpaddw      ymm2, [rel PW_SEVEN]
+
+    vpaddw      ymm1, ymm1, ymm7
+    vpaddw      ymm5, ymm5, ymm3
+    vpsrlw      ymm1, ymm1, 4           ; ymm1=Out0LE=( 0  2  4  6  8 10 12 14 16 18 20 22 24 26 28 30)
+    vpsrlw      ymm5, ymm5, 4           ; ymm5=Out0HE=(32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62)
+    vpaddw      ymm0, ymm0, ymm7
+    vpaddw      ymm2, ymm2, ymm3
+    vpsrlw      ymm0, ymm0, 4           ; ymm0=Out0LO=( 1  3  5  7  9 11 13 15 17 19 21 23 25 27 29 31)
+    vpsrlw      ymm2, ymm2, 4           ; ymm2=Out0HO=(33 35 37 39 41 43 45 47 49 51 53 55 57 59 61 63)
+
+    vpsllw      ymm0, ymm0, BYTE_BIT
+    vpsllw      ymm2, ymm2, BYTE_BIT
+    vpor        ymm1, ymm1, ymm0        ; ymm1=Out0L=( 0  1  2 ... 29 30 31)
+    vpor        ymm5, ymm5, ymm2        ; ymm5=Out0H=(32 33 34 ... 61 62 63)
+
+    vmovdqu     YMMWORD [rdx+0*SIZEOF_YMMWORD], ymm1
+    vmovdqu     YMMWORD [rdx+1*SIZEOF_YMMWORD], ymm5
+
+    ; -- process the lower row
+
+    vmovdqu     ymm6, YMMWORD [rdi+0*SIZEOF_YMMWORD]  ; ymm6=Int1L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
+    vmovdqu     ymm4, YMMWORD [rdi+1*SIZEOF_YMMWORD]  ; ymm4=Int1H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+    vperm2i128  ymm7, ymm8, ymm6, 0x03
+    vpalignr    ymm7, ymm7, ymm6, 2     ; ymm7=( 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 --)
+    vperm2i128  ymm3, ymm8, ymm4, 0x20
+    vpslldq     ymm3, ymm3, 14          ; ymm3=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 16)
+
+    vperm2i128  ymm0, ymm8, ymm6, 0x03
+    vpsrldq     ymm0, ymm0, 14          ; ymm0=(15 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
+    vperm2i128  ymm2, ymm8, ymm4, 0x20
+    vpalignr    ymm2, ymm4, ymm2, 14    ; ymm2=(-- 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
+
+    vpor        ymm7, ymm7, ymm3        ; ymm7=( 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16)
+    vpor        ymm0, ymm0, ymm2        ; ymm0=(15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
+
+    vperm2i128  ymm5, ymm8, ymm4, 0x03
+    vpalignr    ymm5, ymm5, ymm4, 2     ; ymm5=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 --)
+    vperm2i128  ymm3, ymm8, ymm4, 0x03
+    vpsrldq     ymm3, ymm3, 14          ; ymm3=(31 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
+    vperm2i128  ymm1, ymm8, ymm6, 0x20
+    vpalignr    ymm1, ymm6, ymm1, 14    ; ymm1=(--  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14)
+
+    vpor        ymm1, ymm1, YMMWORD [wk(1)]  ; ymm1=(-1  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14)
+    vpor        ymm5, ymm5, YMMWORD [wk(3)]  ; ymm5=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32)
+
+    vmovdqa     YMMWORD [wk(1)], ymm3
+
+    vpmullw     ymm6, ymm6, [rel PW_THREE]
+    vpmullw     ymm4, ymm4, [rel PW_THREE]
+    vpaddw      ymm1, ymm1, [rel PW_EIGHT]
+    vpaddw      ymm0, ymm0, [rel PW_EIGHT]
+    vpaddw      ymm7, ymm7, [rel PW_SEVEN]
+    vpaddw      ymm5, ymm5, [rel PW_SEVEN]
+
+    vpaddw      ymm1, ymm1, ymm6
+    vpaddw      ymm0, ymm0, ymm4
+    vpsrlw      ymm1, ymm1, 4           ; ymm1=Out1LE=( 0  2  4  6  8 10 12 14 16 18 20 22 24 26 28 30)
+    vpsrlw      ymm0, ymm0, 4           ; ymm0=Out1HE=(32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62)
+    vpaddw      ymm7, ymm7, ymm6
+    vpaddw      ymm5, ymm5, ymm4
+    vpsrlw      ymm7, ymm7, 4           ; ymm7=Out1LO=( 1  3  5  7  9 11 13 15 17 19 21 23 25 27 29 31)
+    vpsrlw      ymm5, ymm5, 4           ; ymm5=Out1HO=(33 35 37 39 41 43 45 47 49 51 53 55 57 59 61 63)
+
+    vpsllw      ymm7, ymm7, BYTE_BIT
+    vpsllw      ymm5, ymm5, BYTE_BIT
+    vpor        ymm1, ymm1, ymm7        ; ymm1=Out1L=( 0  1  2 ... 29 30 31)
+    vpor        ymm0, ymm0, ymm5        ; ymm0=Out1H=(32 33 34 ... 61 62 63)
+
+    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm1
+    vmovdqu     YMMWORD [rdi+1*SIZEOF_YMMWORD], ymm0
+
+    sub         rax, byte SIZEOF_YMMWORD
+    add         rcx, byte 1*SIZEOF_YMMWORD  ; inptr1(above)
+    add         rbx, byte 1*SIZEOF_YMMWORD  ; inptr0
+    add         rsi, byte 1*SIZEOF_YMMWORD  ; inptr1(below)
+    add         rdx, byte 2*SIZEOF_YMMWORD  ; outptr0
+    add         rdi, byte 2*SIZEOF_YMMWORD  ; outptr1
+    cmp         rax, byte SIZEOF_YMMWORD
+    ja          near .columnloop
+    test        rax, rax
+    jnz         near .columnloop_last
+
+    pop         rsi
+    pop         rdi
+    pop         rcx
+    pop         rax
+
+    add         rsi, byte 1*SIZEOF_JSAMPROW  ; input_data
+    add         rdi, byte 2*SIZEOF_JSAMPROW  ; output_data
+    sub         rcx, byte 2                  ; rowctr
+    jg          near .rowloop
+
+.return:
+    pop         rbx
+    vzeroupper
+    uncollect_args 4
+    pop_xmm     3
+    mov         rsp, rbp                ; rsp <- aligned rbp
+    pop         rsp                     ; rsp <- original rbp
+    pop         rbp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
+; It's still a box filter.
+;
+; GLOBAL(void)
+; jsimd_h2v1_upsample_avx2(int max_v_samp_factor, JDIMENSION output_width,
+;                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+;
+
+; r10 = int max_v_samp_factor
+; r11d = JDIMENSION output_width
+; r12 = JSAMPARRAY input_data
+; r13 = JSAMPARRAY *output_data_ptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v1_upsample_avx2)
+
+EXTN(jsimd_h2v1_upsample_avx2):
+    push        rbp
+    mov         rax, rsp
+    mov         rbp, rsp
+    collect_args 4
+
+    mov         edx, r11d
+    add         rdx, byte (SIZEOF_YMMWORD-1)
+    and         rdx, -SIZEOF_YMMWORD
+    jz          near .return
+
+    mov         rcx, r10                ; rowctr
+    test        rcx, rcx
+    jz          short .return
+
+    mov         rsi, r12                ; input_data
+    mov         rdi, r13
+    mov         rdip, JSAMPARRAY [rdi]  ; output_data
+.rowloop:
+    push        rdi
+    push        rsi
+
+    mov         rsip, JSAMPROW [rsi]    ; inptr
+    mov         rdip, JSAMPROW [rdi]    ; outptr
+    mov         rax, rdx                ; colctr
+.columnloop:
+
+    cmp         rax, byte SIZEOF_YMMWORD
+    ja          near .above_16
+
+    vmovdqu     xmm0, XMMWORD [rsi+0*SIZEOF_YMMWORD]
+    vpunpckhbw  xmm1, xmm0, xmm0
+    vpunpcklbw  xmm0, xmm0, xmm0
+
+    vmovdqu     XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
+    vmovdqu     XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1
+
+    jmp         short .nextrow
+
+.above_16:
+    vmovdqu     ymm0, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+
+    vpermq      ymm0, ymm0, 0xd8
+    vpunpckhbw  ymm1, ymm0, ymm0
+    vpunpcklbw  ymm0, ymm0, ymm0
+
+    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm0
+    vmovdqu     YMMWORD [rdi+1*SIZEOF_YMMWORD], ymm1
+
+    sub         rax, byte 2*SIZEOF_YMMWORD
+    jz          short .nextrow
+
+    add         rsi, byte SIZEOF_YMMWORD    ; inptr
+    add         rdi, byte 2*SIZEOF_YMMWORD  ; outptr
+    jmp         short .columnloop
+
+.nextrow:
+    pop         rsi
+    pop         rdi
+
+    add         rsi, byte SIZEOF_JSAMPROW  ; input_data
+    add         rdi, byte SIZEOF_JSAMPROW  ; output_data
+    dec         rcx                        ; rowctr
+    jg          short .rowloop
+
+.return:
+    vzeroupper
+    uncollect_args 4
+    pop         rbp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
+; It's still a box filter.
+;
+; GLOBAL(void)
+; jsimd_h2v2_upsample_avx2(int max_v_samp_factor, JDIMENSION output_width,
+;                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+;
+
+; r10 = int max_v_samp_factor
+; r11d = JDIMENSION output_width
+; r12 = JSAMPARRAY input_data
+; r13 = JSAMPARRAY *output_data_ptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v2_upsample_avx2)
+
+EXTN(jsimd_h2v2_upsample_avx2):
+    push        rbp
+    mov         rax, rsp
+    mov         rbp, rsp
+    collect_args 4
+    push        rbx
+
+    mov         edx, r11d
+    add         rdx, byte (SIZEOF_YMMWORD-1)
+    and         rdx, -SIZEOF_YMMWORD
+    jz          near .return
+
+    mov         rcx, r10                ; rowctr
+    test        rcx, rcx
+    jz          near .return
+
+    mov         rsi, r12                ; input_data
+    mov         rdi, r13
+    mov         rdip, JSAMPARRAY [rdi]  ; output_data
+.rowloop:
+    push        rdi
+    push        rsi
+
+    mov         rsip, JSAMPROW [rsi]                   ; inptr
+    mov         rbxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] ; outptr0
+    mov         rdip, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] ; outptr1
+    mov         rax, rdx                               ; colctr
+.columnloop:
+
+    cmp         rax, byte SIZEOF_YMMWORD
+    ja          short .above_16
+
+    vmovdqu     xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+    vpunpckhbw  xmm1, xmm0, xmm0
+    vpunpcklbw  xmm0, xmm0, xmm0
+
+    vmovdqu     XMMWORD [rbx+0*SIZEOF_XMMWORD], xmm0
+    vmovdqu     XMMWORD [rbx+1*SIZEOF_XMMWORD], xmm1
+    vmovdqu     XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
+    vmovdqu     XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1
+
+    jmp         near .nextrow
+
+.above_16:
+    vmovdqu     ymm0, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+
+    vpermq      ymm0, ymm0, 0xd8
+    vpunpckhbw  ymm1, ymm0, ymm0
+    vpunpcklbw  ymm0, ymm0, ymm0
+
+    vmovdqu     YMMWORD [rbx+0*SIZEOF_YMMWORD], ymm0
+    vmovdqu     YMMWORD [rbx+1*SIZEOF_YMMWORD], ymm1
+    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm0
+    vmovdqu     YMMWORD [rdi+1*SIZEOF_YMMWORD], ymm1
+
+    sub         rax, byte 2*SIZEOF_YMMWORD
+    jz          short .nextrow
+
+    add         rsi, byte SIZEOF_YMMWORD  ; inptr
+    add         rbx, 2*SIZEOF_YMMWORD     ; outptr0
+    add         rdi, 2*SIZEOF_YMMWORD     ; outptr1
+    jmp         short .columnloop
+
+.nextrow:
+    pop         rsi
+    pop         rdi
+
+    add         rsi, byte 1*SIZEOF_JSAMPROW  ; input_data
+    add         rdi, byte 2*SIZEOF_JSAMPROW  ; output_data
+    sub         rcx, byte 2                  ; rowctr
+    jg          near .rowloop
+
+.return:
+    pop         rbx
+    vzeroupper
+    uncollect_args 4
+    pop         rbp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/x86_64/jdsample-sse2.asm b/media/libjpeg/simd/x86_64/jdsample-sse2.asm
new file mode 100644
index 0000000000..38dbceec26
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jdsample-sse2.asm
@@ -0,0 +1,665 @@
+;
+; jdsample.asm - upsampling (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_fancy_upsample_sse2)
+
+EXTN(jconst_fancy_upsample_sse2):
+
+PW_ONE   times 8 dw 1
+PW_TWO   times 8 dw 2
+PW_THREE times 8 dw 3
+PW_SEVEN times 8 dw 7
+PW_EIGHT times 8 dw 8
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        64
+;
+; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
+;
+; The upsampling algorithm is linear interpolation between pixel centers,
+; also known as a "triangle filter".  This is a good compromise between
+; speed and visual quality.  The centers of the output pixels are 1/4 and 3/4
+; of the way between input pixel centers.
+;
+; GLOBAL(void)
+; jsimd_h2v1_fancy_upsample_sse2(int max_v_samp_factor,
+;                                JDIMENSION downsampled_width,
+;                                JSAMPARRAY input_data,
+;                                JSAMPARRAY *output_data_ptr);
+;
+
+; r10 = int max_v_samp_factor
+; r11d = JDIMENSION downsampled_width
+; r12 = JSAMPARRAY input_data
+; r13 = JSAMPARRAY *output_data_ptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v1_fancy_upsample_sse2)
+
+EXTN(jsimd_h2v1_fancy_upsample_sse2):
+    push        rbp
+    mov         rax, rsp
+    mov         rbp, rsp
+    collect_args 4
+
+    mov         eax, r11d               ; colctr
+    test        rax, rax
+    jz          near .return
+
+    mov         rcx, r10                ; rowctr
+    test        rcx, rcx
+    jz          near .return
+
+    mov         rsi, r12                ; input_data
+    mov         rdi, r13
+    mov         rdip, JSAMPARRAY [rdi]  ; output_data
+.rowloop:
+    push        rax                     ; colctr
+    push        rdi
+    push        rsi
+
+    mov         rsip, JSAMPROW [rsi]    ; inptr
+    mov         rdip, JSAMPROW [rdi]    ; outptr
+
+    test        rax, SIZEOF_XMMWORD-1
+    jz          short .skip
+    mov         dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
+    mov         JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
+.skip:
+    pxor        xmm0, xmm0              ; xmm0=(all 0's)
+    pcmpeqb     xmm7, xmm7
+    psrldq      xmm7, (SIZEOF_XMMWORD-1)
+    pand        xmm7, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+
+    add         rax, byte SIZEOF_XMMWORD-1
+    and         rax, byte -SIZEOF_XMMWORD
+    cmp         rax, byte SIZEOF_XMMWORD
+    ja          short .columnloop
+
+.columnloop_last:
+    pcmpeqb     xmm6, xmm6
+    pslldq      xmm6, (SIZEOF_XMMWORD-1)
+    pand        xmm6, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+    jmp         short .upsample
+
+.columnloop:
+    movdqa      xmm6, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+    pslldq      xmm6, (SIZEOF_XMMWORD-1)
+
+.upsample:
+    movdqa      xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+    movdqa      xmm2, xmm1
+    movdqa      xmm3, xmm1                ; xmm1=( 0  1  2 ... 13 14 15)
+    pslldq      xmm2, 1                   ; xmm2=(--  0  1 ... 12 13 14)
+    psrldq      xmm3, 1                   ; xmm3=( 1  2  3 ... 14 15 --)
+
+    por         xmm2, xmm7                ; xmm2=(-1  0  1 ... 12 13 14)
+    por         xmm3, xmm6                ; xmm3=( 1  2  3 ... 14 15 16)
+
+    movdqa      xmm7, xmm1
+    psrldq      xmm7, (SIZEOF_XMMWORD-1)  ; xmm7=(15 -- -- ... -- -- --)
+
+    movdqa      xmm4, xmm1
+    punpcklbw   xmm1, xmm0                ; xmm1=( 0  1  2  3  4  5  6  7)
+    punpckhbw   xmm4, xmm0                ; xmm4=( 8  9 10 11 12 13 14 15)
+    movdqa      xmm5, xmm2
+    punpcklbw   xmm2, xmm0                ; xmm2=(-1  0  1  2  3  4  5  6)
+    punpckhbw   xmm5, xmm0                ; xmm5=( 7  8  9 10 11 12 13 14)
+    movdqa      xmm6, xmm3
+    punpcklbw   xmm3, xmm0                ; xmm3=( 1  2  3  4  5  6  7  8)
+    punpckhbw   xmm6, xmm0                ; xmm6=( 9 10 11 12 13 14 15 16)
+
+    pmullw      xmm1, [rel PW_THREE]
+    pmullw      xmm4, [rel PW_THREE]
+    paddw       xmm2, [rel PW_ONE]
+    paddw       xmm5, [rel PW_ONE]
+    paddw       xmm3, [rel PW_TWO]
+    paddw       xmm6, [rel PW_TWO]
+
+    paddw       xmm2, xmm1
+    paddw       xmm5, xmm4
+    psrlw       xmm2, 2                 ; xmm2=OutLE=( 0  2  4  6  8 10 12 14)
+    psrlw       xmm5, 2                 ; xmm5=OutHE=(16 18 20 22 24 26 28 30)
+    paddw       xmm3, xmm1
+    paddw       xmm6, xmm4
+    psrlw       xmm3, 2                 ; xmm3=OutLO=( 1  3  5  7  9 11 13 15)
+    psrlw       xmm6, 2                 ; xmm6=OutHO=(17 19 21 23 25 27 29 31)
+
+    psllw       xmm3, BYTE_BIT
+    psllw       xmm6, BYTE_BIT
+    por         xmm2, xmm3              ; xmm2=OutL=( 0  1  2 ... 13 14 15)
+    por         xmm5, xmm6              ; xmm5=OutH=(16 17 18 ... 29 30 31)
+
+    movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2
+    movdqa      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm5
+
+    sub         rax, byte SIZEOF_XMMWORD
+    add         rsi, byte 1*SIZEOF_XMMWORD  ; inptr
+    add         rdi, byte 2*SIZEOF_XMMWORD  ; outptr
+    cmp         rax, byte SIZEOF_XMMWORD
+    ja          near .columnloop
+    test        eax, eax
+    jnz         near .columnloop_last
+
+    pop         rsi
+    pop         rdi
+    pop         rax
+
+    add         rsi, byte SIZEOF_JSAMPROW  ; input_data
+    add         rdi, byte SIZEOF_JSAMPROW  ; output_data
+    dec         rcx                        ; rowctr
+    jg          near .rowloop
+
+.return:
+    uncollect_args 4
+    pop         rbp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
+; Again a triangle filter; see comments for h2v1 case, above.
+;
+; GLOBAL(void)
+; jsimd_h2v2_fancy_upsample_sse2(int max_v_samp_factor,
+;                                JDIMENSION downsampled_width,
+;                                JSAMPARRAY input_data,
+;                                JSAMPARRAY *output_data_ptr);
+;
+
+; r10 = int max_v_samp_factor
+; r11d = JDIMENSION downsampled_width
+; r12 = JSAMPARRAY input_data
+; r13 = JSAMPARRAY *output_data_ptr
+
+%define wk(i)   rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD  ; xmmword wk[WK_NUM]
+%define WK_NUM  4
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v2_fancy_upsample_sse2)
+
+EXTN(jsimd_h2v2_fancy_upsample_sse2):
+    push        rbp
+    mov         rax, rsp                     ; rax = original rbp
+    sub         rsp, byte 4
+    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [rsp], rax
+    mov         rbp, rsp                     ; rbp = aligned rbp
+    lea         rsp, [wk(0)]
+    collect_args 4
+    push        rbx
+
+    mov         eax, r11d               ; colctr
+    test        rax, rax
+    jz          near .return
+
+    mov         rcx, r10                ; rowctr
+    test        rcx, rcx
+    jz          near .return
+
+    mov         rsi, r12                ; input_data
+    mov         rdi, r13
+    mov         rdip, JSAMPARRAY [rdi]  ; output_data
+.rowloop:
+    push        rax                     ; colctr
+    push        rcx
+    push        rdi
+    push        rsi
+
+    mov         rcxp, JSAMPROW [rsi-1*SIZEOF_JSAMPROW]  ; inptr1(above)
+    mov         rbxp, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]  ; inptr0
+    mov         rsip, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]  ; inptr1(below)
+    mov         rdxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]  ; outptr0
+    mov         rdip, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]  ; outptr1
+
+    test        rax, SIZEOF_XMMWORD-1
+    jz          short .skip
+    push        rdx
+    mov         dl, JSAMPLE [rcx+(rax-1)*SIZEOF_JSAMPLE]
+    mov         JSAMPLE [rcx+rax*SIZEOF_JSAMPLE], dl
+    mov         dl, JSAMPLE [rbx+(rax-1)*SIZEOF_JSAMPLE]
+    mov         JSAMPLE [rbx+rax*SIZEOF_JSAMPLE], dl
+    mov         dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
+    mov         JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
+    pop         rdx
+.skip:
+    ; -- process the first column block
+
+    movdqa      xmm0, XMMWORD [rbx+0*SIZEOF_XMMWORD]  ; xmm0=row[ 0][0]
+    movdqa      xmm1, XMMWORD [rcx+0*SIZEOF_XMMWORD]  ; xmm1=row[-1][0]
+    movdqa      xmm2, XMMWORD [rsi+0*SIZEOF_XMMWORD]  ; xmm2=row[+1][0]
+
+    pxor        xmm3, xmm3              ; xmm3=(all 0's)
+    movdqa      xmm4, xmm0
+    punpcklbw   xmm0, xmm3              ; xmm0=row[ 0]( 0  1  2  3  4  5  6  7)
+    punpckhbw   xmm4, xmm3              ; xmm4=row[ 0]( 8  9 10 11 12 13 14 15)
+    movdqa      xmm5, xmm1
+    punpcklbw   xmm1, xmm3              ; xmm1=row[-1]( 0  1  2  3  4  5  6  7)
+    punpckhbw   xmm5, xmm3              ; xmm5=row[-1]( 8  9 10 11 12 13 14 15)
+    movdqa      xmm6, xmm2
+    punpcklbw   xmm2, xmm3              ; xmm2=row[+1]( 0  1  2  3  4  5  6  7)
+    punpckhbw   xmm6, xmm3              ; xmm6=row[+1]( 8  9 10 11 12 13 14 15)
+
+    pmullw      xmm0, [rel PW_THREE]
+    pmullw      xmm4, [rel PW_THREE]
+
+    pcmpeqb     xmm7, xmm7
+    psrldq      xmm7, (SIZEOF_XMMWORD-2)
+
+    paddw       xmm1, xmm0              ; xmm1=Int0L=( 0  1  2  3  4  5  6  7)
+    paddw       xmm5, xmm4              ; xmm5=Int0H=( 8  9 10 11 12 13 14 15)
+    paddw       xmm2, xmm0              ; xmm2=Int1L=( 0  1  2  3  4  5  6  7)
+    paddw       xmm6, xmm4              ; xmm6=Int1H=( 8  9 10 11 12 13 14 15)
+
+    movdqa      XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1  ; temporarily save
+    movdqa      XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5  ; the intermediate data
+    movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2
+    movdqa      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm6
+
+    pand        xmm1, xmm7              ; xmm1=( 0 -- -- -- -- -- -- --)
+    pand        xmm2, xmm7              ; xmm2=( 0 -- -- -- -- -- -- --)
+
+    movdqa      XMMWORD [wk(0)], xmm1
+    movdqa      XMMWORD [wk(1)], xmm2
+
+    add         rax, byte SIZEOF_XMMWORD-1
+    and         rax, byte -SIZEOF_XMMWORD
+    cmp         rax, byte SIZEOF_XMMWORD
+    ja          short .columnloop
+
+.columnloop_last:
+    ; -- process the last column block
+
+    pcmpeqb     xmm1, xmm1
+    pslldq      xmm1, (SIZEOF_XMMWORD-2)
+    movdqa      xmm2, xmm1
+
+    pand        xmm1, XMMWORD [rdx+1*SIZEOF_XMMWORD]
+    pand        xmm2, XMMWORD [rdi+1*SIZEOF_XMMWORD]
+
+    movdqa      XMMWORD [wk(2)], xmm1   ; xmm1=(-- -- -- -- -- -- -- 15)
+    movdqa      XMMWORD [wk(3)], xmm2   ; xmm2=(-- -- -- -- -- -- -- 15)
+
+    jmp         near .upsample
+
+.columnloop:
+    ; -- process the next column block
+
+    movdqa      xmm0, XMMWORD [rbx+1*SIZEOF_XMMWORD]  ; xmm0=row[ 0][1]
+    movdqa      xmm1, XMMWORD [rcx+1*SIZEOF_XMMWORD]  ; xmm1=row[-1][1]
+    movdqa      xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]  ; xmm2=row[+1][1]
+
+    pxor        xmm3, xmm3              ; xmm3=(all 0's)
+    movdqa      xmm4, xmm0
+    punpcklbw   xmm0, xmm3              ; xmm0=row[ 0]( 0  1  2  3  4  5  6  7)
+    punpckhbw   xmm4, xmm3              ; xmm4=row[ 0]( 8  9 10 11 12 13 14 15)
+    movdqa      xmm5, xmm1
+    punpcklbw   xmm1, xmm3              ; xmm1=row[-1]( 0  1  2  3  4  5  6  7)
+    punpckhbw   xmm5, xmm3              ; xmm5=row[-1]( 8  9 10 11 12 13 14 15)
+    movdqa      xmm6, xmm2
+    punpcklbw   xmm2, xmm3              ; xmm2=row[+1]( 0  1  2  3  4  5  6  7)
+    punpckhbw   xmm6, xmm3              ; xmm6=row[+1]( 8  9 10 11 12 13 14 15)
+
+    pmullw      xmm0, [rel PW_THREE]
+    pmullw      xmm4, [rel PW_THREE]
+
+    paddw       xmm1, xmm0              ; xmm1=Int0L=( 0  1  2  3  4  5  6  7)
+    paddw       xmm5, xmm4              ; xmm5=Int0H=( 8  9 10 11 12 13 14 15)
+    paddw       xmm2, xmm0              ; xmm2=Int1L=( 0  1  2  3  4  5  6  7)
+    paddw       xmm6, xmm4              ; xmm6=Int1H=( 8  9 10 11 12 13 14 15)
+
+    movdqa      XMMWORD [rdx+2*SIZEOF_XMMWORD], xmm1  ; temporarily save
+    movdqa      XMMWORD [rdx+3*SIZEOF_XMMWORD], xmm5  ; the intermediate data
+    movdqa      XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
+    movdqa      XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm6
+
+    pslldq      xmm1, (SIZEOF_XMMWORD-2)  ; xmm1=(-- -- -- -- -- -- --  0)
+    pslldq      xmm2, (SIZEOF_XMMWORD-2)  ; xmm2=(-- -- -- -- -- -- --  0)
+
+    movdqa      XMMWORD [wk(2)], xmm1
+    movdqa      XMMWORD [wk(3)], xmm2
+
+.upsample:
+    ; -- process the upper row
+
+    movdqa      xmm7, XMMWORD [rdx+0*SIZEOF_XMMWORD]
+    movdqa      xmm3, XMMWORD [rdx+1*SIZEOF_XMMWORD]
+
+    movdqa      xmm0, xmm7                ; xmm7=Int0L=( 0  1  2  3  4  5  6  7)
+    movdqa      xmm4, xmm3                ; xmm3=Int0H=( 8  9 10 11 12 13 14 15)
+    psrldq      xmm0, 2                   ; xmm0=( 1  2  3  4  5  6  7 --)
+    pslldq      xmm4, (SIZEOF_XMMWORD-2)  ; xmm4=(-- -- -- -- -- -- --  8)
+    movdqa      xmm5, xmm7
+    movdqa      xmm6, xmm3
+    psrldq      xmm5, (SIZEOF_XMMWORD-2)  ; xmm5=( 7 -- -- -- -- -- -- --)
+    pslldq      xmm6, 2                   ; xmm6=(--  8  9 10 11 12 13 14)
+
+    por         xmm0, xmm4                ; xmm0=( 1  2  3  4  5  6  7  8)
+    por         xmm5, xmm6                ; xmm5=( 7  8  9 10 11 12 13 14)
+
+    movdqa      xmm1, xmm7
+    movdqa      xmm2, xmm3
+    pslldq      xmm1, 2                   ; xmm1=(--  0  1  2  3  4  5  6)
+    psrldq      xmm2, 2                   ; xmm2=( 9 10 11 12 13 14 15 --)
+    movdqa      xmm4, xmm3
+    psrldq      xmm4, (SIZEOF_XMMWORD-2)  ; xmm4=(15 -- -- -- -- -- -- --)
+
+    por         xmm1, XMMWORD [wk(0)]     ; xmm1=(-1  0  1  2  3  4  5  6)
+    por         xmm2, XMMWORD [wk(2)]     ; xmm2=( 9 10 11 12 13 14 15 16)
+
+    movdqa      XMMWORD [wk(0)], xmm4
+
+    pmullw      xmm7, [rel PW_THREE]
+    pmullw      xmm3, [rel PW_THREE]
+    paddw       xmm1, [rel PW_EIGHT]
+    paddw       xmm5, [rel PW_EIGHT]
+    paddw       xmm0, [rel PW_SEVEN]
+    paddw       xmm2, [rel PW_SEVEN]
+
+    paddw       xmm1, xmm7
+    paddw       xmm5, xmm3
+    psrlw       xmm1, 4                 ; xmm1=Out0LE=( 0  2  4  6  8 10 12 14)
+    psrlw       xmm5, 4                 ; xmm5=Out0HE=(16 18 20 22 24 26 28 30)
+    paddw       xmm0, xmm7
+    paddw       xmm2, xmm3
+    psrlw       xmm0, 4                 ; xmm0=Out0LO=( 1  3  5  7  9 11 13 15)
+    psrlw       xmm2, 4                 ; xmm2=Out0HO=(17 19 21 23 25 27 29 31)
+
+    psllw       xmm0, BYTE_BIT
+    psllw       xmm2, BYTE_BIT
+    por         xmm1, xmm0              ; xmm1=Out0L=( 0  1  2 ... 13 14 15)
+    por         xmm5, xmm2              ; xmm5=Out0H=(16 17 18 ... 29 30 31)
+
+    movdqa      XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1
+    movdqa      XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5
+
+    ; -- process the lower row
+
+    movdqa      xmm6, XMMWORD [rdi+0*SIZEOF_XMMWORD]
+    movdqa      xmm4, XMMWORD [rdi+1*SIZEOF_XMMWORD]
+
+    movdqa      xmm7, xmm6                ; xmm6=Int1L=( 0  1  2  3  4  5  6  7)
+    movdqa      xmm3, xmm4                ; xmm4=Int1H=( 8  9 10 11 12 13 14 15)
+    psrldq      xmm7, 2                   ; xmm7=( 1  2  3  4  5  6  7 --)
+    pslldq      xmm3, (SIZEOF_XMMWORD-2)  ; xmm3=(-- -- -- -- -- -- --  8)
+    movdqa      xmm0, xmm6
+    movdqa      xmm2, xmm4
+    psrldq      xmm0, (SIZEOF_XMMWORD-2)  ; xmm0=( 7 -- -- -- -- -- -- --)
+    pslldq      xmm2, 2                   ; xmm2=(--  8  9 10 11 12 13 14)
+
+    por         xmm7, xmm3                ; xmm7=( 1  2  3  4  5  6  7  8)
+    por         xmm0, xmm2                ; xmm0=( 7  8  9 10 11 12 13 14)
+
+    movdqa      xmm1, xmm6
+    movdqa      xmm5, xmm4
+    pslldq      xmm1, 2                   ; xmm1=(--  0  1  2  3  4  5  6)
+    psrldq      xmm5, 2                   ; xmm5=( 9 10 11 12 13 14 15 --)
+    movdqa      xmm3, xmm4
+    psrldq      xmm3, (SIZEOF_XMMWORD-2)  ; xmm3=(15 -- -- -- -- -- -- --)
+
+    por         xmm1, XMMWORD [wk(1)]     ; xmm1=(-1  0  1  2  3  4  5  6)
+    por         xmm5, XMMWORD [wk(3)]     ; xmm5=( 9 10 11 12 13 14 15 16)
+
+    movdqa      XMMWORD [wk(1)], xmm3
+
+    pmullw      xmm6, [rel PW_THREE]
+    pmullw      xmm4, [rel PW_THREE]
+    paddw       xmm1, [rel PW_EIGHT]
+    paddw       xmm0, [rel PW_EIGHT]
+    paddw       xmm7, [rel PW_SEVEN]
+    paddw       xmm5, [rel PW_SEVEN]
+
+    paddw       xmm1, xmm6
+    paddw       xmm0, xmm4
+    psrlw       xmm1, 4                 ; xmm1=Out1LE=( 0  2  4  6  8 10 12 14)
+    psrlw       xmm0, 4                 ; xmm0=Out1HE=(16 18 20 22 24 26 28 30)
+    paddw       xmm7, xmm6
+    paddw       xmm5, xmm4
+    psrlw       xmm7, 4                 ; xmm7=Out1LO=( 1  3  5  7  9 11 13 15)
+    psrlw       xmm5, 4                 ; xmm5=Out1HO=(17 19 21 23 25 27 29 31)
+
+    psllw       xmm7, BYTE_BIT
+    psllw       xmm5, BYTE_BIT
+    por         xmm1, xmm7              ; xmm1=Out1L=( 0  1  2 ... 13 14 15)
+    por         xmm0, xmm5              ; xmm0=Out1H=(16 17 18 ... 29 30 31)
+
+    movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm1
+    movdqa      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm0
+
+    sub         rax, byte SIZEOF_XMMWORD
+    add         rcx, byte 1*SIZEOF_XMMWORD  ; inptr1(above)
+    add         rbx, byte 1*SIZEOF_XMMWORD  ; inptr0
+    add         rsi, byte 1*SIZEOF_XMMWORD  ; inptr1(below)
+    add         rdx, byte 2*SIZEOF_XMMWORD  ; outptr0
+    add         rdi, byte 2*SIZEOF_XMMWORD  ; outptr1
+    cmp         rax, byte SIZEOF_XMMWORD
+    ja          near .columnloop
+    test        rax, rax
+    jnz         near .columnloop_last
+
+    pop         rsi
+    pop         rdi
+    pop         rcx
+    pop         rax
+
+    add         rsi, byte 1*SIZEOF_JSAMPROW  ; input_data
+    add         rdi, byte 2*SIZEOF_JSAMPROW  ; output_data
+    sub         rcx, byte 2                  ; rowctr
+    jg          near .rowloop
+
+.return:
+    pop         rbx
+    uncollect_args 4
+    mov         rsp, rbp                ; rsp <- aligned rbp
+    pop         rsp                     ; rsp <- original rbp
+    pop         rbp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
+; It's still a box filter.
+;
+; GLOBAL(void)
+; jsimd_h2v1_upsample_sse2(int max_v_samp_factor, JDIMENSION output_width,
+;                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+;
+
+; r10 = int max_v_samp_factor
+; r11d = JDIMENSION output_width
+; r12 = JSAMPARRAY input_data
+; r13 = JSAMPARRAY *output_data_ptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v1_upsample_sse2)
+
+EXTN(jsimd_h2v1_upsample_sse2):
+    push        rbp
+    mov         rax, rsp
+    mov         rbp, rsp
+    collect_args 4
+
+    mov         edx, r11d
+    add         rdx, byte (2*SIZEOF_XMMWORD)-1
+    and         rdx, byte -(2*SIZEOF_XMMWORD)
+    jz          near .return
+
+    mov         rcx, r10                ; rowctr
+    test        rcx, rcx
+    jz          short .return
+
+    mov         rsi, r12                ; input_data
+    mov         rdi, r13
+    mov         rdip, JSAMPARRAY [rdi]  ; output_data
+.rowloop:
+    push        rdi
+    push        rsi
+
+    mov         rsip, JSAMPROW [rsi]    ; inptr
+    mov         rdip, JSAMPROW [rdi]    ; outptr
+    mov         rax, rdx                ; colctr
+.columnloop:
+
+    movdqa      xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+
+    movdqa      xmm1, xmm0
+    punpcklbw   xmm0, xmm0
+    punpckhbw   xmm1, xmm1
+
+    movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
+    movdqa      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1
+
+    sub         rax, byte 2*SIZEOF_XMMWORD
+    jz          short .nextrow
+
+    movdqa      xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+
+    movdqa      xmm3, xmm2
+    punpcklbw   xmm2, xmm2
+    punpckhbw   xmm3, xmm3
+
+    movdqa      XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
+    movdqa      XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm3
+
+    sub         rax, byte 2*SIZEOF_XMMWORD
+    jz          short .nextrow
+
+    add         rsi, byte 2*SIZEOF_XMMWORD  ; inptr
+    add         rdi, byte 4*SIZEOF_XMMWORD  ; outptr
+    jmp         short .columnloop
+
+.nextrow:
+    pop         rsi
+    pop         rdi
+
+    add         rsi, byte SIZEOF_JSAMPROW  ; input_data
+    add         rdi, byte SIZEOF_JSAMPROW  ; output_data
+    dec         rcx                        ; rowctr
+    jg          short .rowloop
+
+.return:
+    uncollect_args 4
+    pop         rbp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
+; It's still a box filter.
+;
+; GLOBAL(void)
+; jsimd_h2v2_upsample_sse2(int max_v_samp_factor, JDIMENSION output_width,
+;                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+;
+
+; r10 = int max_v_samp_factor
+; r11d = JDIMENSION output_width
+; r12 = JSAMPARRAY input_data
+; r13 = JSAMPARRAY *output_data_ptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v2_upsample_sse2)
+
+EXTN(jsimd_h2v2_upsample_sse2):
+    push        rbp
+    mov         rax, rsp
+    mov         rbp, rsp
+    collect_args 4
+    push        rbx
+
+    mov         edx, r11d
+    add         rdx, byte (2*SIZEOF_XMMWORD)-1
+    and         rdx, byte -(2*SIZEOF_XMMWORD)
+    jz          near .return
+
+    mov         rcx, r10                ; rowctr
+    test        rcx, rcx
+    jz          near .return
+
+    mov         rsi, r12                ; input_data
+    mov         rdi, r13
+    mov         rdip, JSAMPARRAY [rdi]  ; output_data
+.rowloop:
+    push        rdi
+    push        rsi
+
+    mov         rsip, JSAMPROW [rsi]                   ; inptr
+    mov         rbxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] ; outptr0
+    mov         rdip, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] ; outptr1
+    mov         rax, rdx                               ; colctr
+.columnloop:
+
+    movdqa      xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+
+    movdqa      xmm1, xmm0
+    punpcklbw   xmm0, xmm0
+    punpckhbw   xmm1, xmm1
+
+    movdqa      XMMWORD [rbx+0*SIZEOF_XMMWORD], xmm0
+    movdqa      XMMWORD [rbx+1*SIZEOF_XMMWORD], xmm1
+    movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
+    movdqa      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1
+
+    sub         rax, byte 2*SIZEOF_XMMWORD
+    jz          short .nextrow
+
+    movdqa      xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+
+    movdqa      xmm3, xmm2
+    punpcklbw   xmm2, xmm2
+    punpckhbw   xmm3, xmm3
+
+    movdqa      XMMWORD [rbx+2*SIZEOF_XMMWORD], xmm2
+    movdqa      XMMWORD [rbx+3*SIZEOF_XMMWORD], xmm3
+    movdqa      XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
+    movdqa      XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm3
+
+    sub         rax, byte 2*SIZEOF_XMMWORD
+    jz          short .nextrow
+
+    add         rsi, byte 2*SIZEOF_XMMWORD  ; inptr
+    add         rbx, byte 4*SIZEOF_XMMWORD  ; outptr0
+    add         rdi, byte 4*SIZEOF_XMMWORD  ; outptr1
+    jmp         short .columnloop
+
+.nextrow:
+    pop         rsi
+    pop         rdi
+
+    add         rsi, byte 1*SIZEOF_JSAMPROW  ; input_data
+    add         rdi, byte 2*SIZEOF_JSAMPROW  ; output_data
+    sub         rcx, byte 2                  ; rowctr
+    jg          near .rowloop
+
+.return:
+    pop         rbx
+    uncollect_args 4
+    pop         rbp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/x86_64/jfdctflt-sse.asm b/media/libjpeg/simd/x86_64/jfdctflt-sse.asm
new file mode 100644
index 0000000000..ef2796649b
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jfdctflt-sse.asm
@@ -0,0 +1,355 @@
+;
+; jfdctflt.asm - floating-point FDCT (64-bit SSE)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a floating-point implementation of the forward DCT
+; (Discrete Cosine Transform). The following code is based directly on
+; the IJG's original jfdctflt.c; see the jfdctflt.c for more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%macro  unpcklps2 2  ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
+    shufps      %1, %2, 0x44
+%endmacro
+
+%macro  unpckhps2 2  ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
+    shufps      %1, %2, 0xEE
+%endmacro
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_fdct_float_sse)
+
+EXTN(jconst_fdct_float_sse):
+
+PD_0_382 times 4 dd 0.382683432365089771728460
+PD_0_707 times 4 dd 0.707106781186547524400844
+PD_0_541 times 4 dd 0.541196100146196984399723
+PD_1_306 times 4 dd 1.306562964876376527856643
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        64
+;
+; Perform the forward DCT on one block of samples.
+;
+; GLOBAL(void)
+; jsimd_fdct_float_sse(FAST_FLOAT *data)
+;
+
+; r10 = FAST_FLOAT *data
+
+%define wk(i)   rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD  ; xmmword wk[WK_NUM]
+%define WK_NUM  2
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_fdct_float_sse)
+
+EXTN(jsimd_fdct_float_sse):
+    push        rbp
+    mov         rax, rsp                     ; rax = original rbp
+    sub         rsp, byte 4
+    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [rsp], rax
+    mov         rbp, rsp                     ; rbp = aligned rbp
+    lea         rsp, [wk(0)]
+    collect_args 1
+
+    ; ---- Pass 1: process rows.
+
+    mov         rdx, r10                ; (FAST_FLOAT *)
+    mov         rcx, DCTSIZE/4
+.rowloop:
+
+    movaps      xmm0, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)]
+    movaps      xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)]
+    movaps      xmm2, XMMWORD [XMMBLOCK(2,1,rdx,SIZEOF_FAST_FLOAT)]
+    movaps      xmm3, XMMWORD [XMMBLOCK(3,1,rdx,SIZEOF_FAST_FLOAT)]
+
+    ; xmm0=(20 21 22 23), xmm2=(24 25 26 27)
+    ; xmm1=(30 31 32 33), xmm3=(34 35 36 37)
+
+    movaps      xmm4, xmm0              ; transpose coefficients(phase 1)
+    unpcklps    xmm0, xmm1              ; xmm0=(20 30 21 31)
+    unpckhps    xmm4, xmm1              ; xmm4=(22 32 23 33)
+    movaps      xmm5, xmm2              ; transpose coefficients(phase 1)
+    unpcklps    xmm2, xmm3              ; xmm2=(24 34 25 35)
+    unpckhps    xmm5, xmm3              ; xmm5=(26 36 27 37)
+
+    movaps      xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)]
+    movaps      xmm7, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)]
+    movaps      xmm1, XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)]
+    movaps      xmm3, XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)]
+
+    ; xmm6=(00 01 02 03), xmm1=(04 05 06 07)
+    ; xmm7=(10 11 12 13), xmm3=(14 15 16 17)
+
+    movaps      XMMWORD [wk(0)], xmm4   ; wk(0)=(22 32 23 33)
+    movaps      XMMWORD [wk(1)], xmm2   ; wk(1)=(24 34 25 35)
+
+    movaps      xmm4, xmm6              ; transpose coefficients(phase 1)
+    unpcklps    xmm6, xmm7              ; xmm6=(00 10 01 11)
+    unpckhps    xmm4, xmm7              ; xmm4=(02 12 03 13)
+    movaps      xmm2, xmm1              ; transpose coefficients(phase 1)
+    unpcklps    xmm1, xmm3              ; xmm1=(04 14 05 15)
+    unpckhps    xmm2, xmm3              ; xmm2=(06 16 07 17)
+
+    movaps      xmm7, xmm6              ; transpose coefficients(phase 2)
+    unpcklps2   xmm6, xmm0              ; xmm6=(00 10 20 30)=data0
+    unpckhps2   xmm7, xmm0              ; xmm7=(01 11 21 31)=data1
+    movaps      xmm3, xmm2              ; transpose coefficients(phase 2)
+    unpcklps2   xmm2, xmm5              ; xmm2=(06 16 26 36)=data6
+    unpckhps2   xmm3, xmm5              ; xmm3=(07 17 27 37)=data7
+
+    movaps      xmm0, xmm7
+    movaps      xmm5, xmm6
+    subps       xmm7, xmm2              ; xmm7=data1-data6=tmp6
+    subps       xmm6, xmm3              ; xmm6=data0-data7=tmp7
+    addps       xmm0, xmm2              ; xmm0=data1+data6=tmp1
+    addps       xmm5, xmm3              ; xmm5=data0+data7=tmp0
+
+    movaps      xmm2, XMMWORD [wk(0)]   ; xmm2=(22 32 23 33)
+    movaps      xmm3, XMMWORD [wk(1)]   ; xmm3=(24 34 25 35)
+    movaps      XMMWORD [wk(0)], xmm7   ; wk(0)=tmp6
+    movaps      XMMWORD [wk(1)], xmm6   ; wk(1)=tmp7
+
+    movaps      xmm7, xmm4              ; transpose coefficients(phase 2)
+    unpcklps2   xmm4, xmm2              ; xmm4=(02 12 22 32)=data2
+    unpckhps2   xmm7, xmm2              ; xmm7=(03 13 23 33)=data3
+    movaps      xmm6, xmm1              ; transpose coefficients(phase 2)
+    unpcklps2   xmm1, xmm3              ; xmm1=(04 14 24 34)=data4
+    unpckhps2   xmm6, xmm3              ; xmm6=(05 15 25 35)=data5
+
+    movaps      xmm2, xmm7
+    movaps      xmm3, xmm4
+    addps       xmm7, xmm1              ; xmm7=data3+data4=tmp3
+    addps       xmm4, xmm6              ; xmm4=data2+data5=tmp2
+    subps       xmm2, xmm1              ; xmm2=data3-data4=tmp4
+    subps       xmm3, xmm6              ; xmm3=data2-data5=tmp5
+
+    ; -- Even part
+
+    movaps      xmm1, xmm5
+    movaps      xmm6, xmm0
+    subps       xmm5, xmm7              ; xmm5=tmp13
+    subps       xmm0, xmm4              ; xmm0=tmp12
+    addps       xmm1, xmm7              ; xmm1=tmp10
+    addps       xmm6, xmm4              ; xmm6=tmp11
+
+    addps       xmm0, xmm5
+    mulps       xmm0, [rel PD_0_707]    ; xmm0=z1
+
+    movaps      xmm7, xmm1
+    movaps      xmm4, xmm5
+    subps       xmm1, xmm6              ; xmm1=data4
+    subps       xmm5, xmm0              ; xmm5=data6
+    addps       xmm7, xmm6              ; xmm7=data0
+    addps       xmm4, xmm0              ; xmm4=data2
+
+    movaps      XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)], xmm1
+    movaps      XMMWORD [XMMBLOCK(2,1,rdx,SIZEOF_FAST_FLOAT)], xmm5
+    movaps      XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)], xmm7
+    movaps      XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)], xmm4
+
+    ; -- Odd part
+
+    movaps      xmm6, XMMWORD [wk(0)]   ; xmm6=tmp6
+    movaps      xmm0, XMMWORD [wk(1)]   ; xmm0=tmp7
+
+    addps       xmm2, xmm3              ; xmm2=tmp10
+    addps       xmm3, xmm6              ; xmm3=tmp11
+    addps       xmm6, xmm0              ; xmm6=tmp12, xmm0=tmp7
+
+    mulps       xmm3, [rel PD_0_707]    ; xmm3=z3
+
+    movaps      xmm1, xmm2              ; xmm1=tmp10
+    subps       xmm2, xmm6
+    mulps       xmm2, [rel PD_0_382]    ; xmm2=z5
+    mulps       xmm1, [rel PD_0_541]    ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
+    mulps       xmm6, [rel PD_1_306]    ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
+    addps       xmm1, xmm2              ; xmm1=z2
+    addps       xmm6, xmm2              ; xmm6=z4
+
+    movaps      xmm5, xmm0
+    subps       xmm0, xmm3              ; xmm0=z13
+    addps       xmm5, xmm3              ; xmm5=z11
+
+    movaps      xmm7, xmm0
+    movaps      xmm4, xmm5
+    subps       xmm0, xmm1              ; xmm0=data3
+    subps       xmm5, xmm6              ; xmm5=data7
+    addps       xmm7, xmm1              ; xmm7=data5
+    addps       xmm4, xmm6              ; xmm4=data1
+
+    movaps      XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)], xmm0
+    movaps      XMMWORD [XMMBLOCK(3,1,rdx,SIZEOF_FAST_FLOAT)], xmm5
+    movaps      XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)], xmm7
+    movaps      XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)], xmm4
+
+    add         rdx, 4*DCTSIZE*SIZEOF_FAST_FLOAT
+    dec         rcx
+    jnz         near .rowloop
+
+    ; ---- Pass 2: process columns.
+
+    mov         rdx, r10                ; (FAST_FLOAT *)
+    mov         rcx, DCTSIZE/4
+.columnloop:
+
+    movaps      xmm0, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)]
+    movaps      xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)]
+    movaps      xmm2, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FAST_FLOAT)]
+    movaps      xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FAST_FLOAT)]
+
+    ; xmm0=(02 12 22 32), xmm2=(42 52 62 72)
+    ; xmm1=(03 13 23 33), xmm3=(43 53 63 73)
+
+    movaps      xmm4, xmm0              ; transpose coefficients(phase 1)
+    unpcklps    xmm0, xmm1              ; xmm0=(02 03 12 13)
+    unpckhps    xmm4, xmm1              ; xmm4=(22 23 32 33)
+    movaps      xmm5, xmm2              ; transpose coefficients(phase 1)
+    unpcklps    xmm2, xmm3              ; xmm2=(42 43 52 53)
+    unpckhps    xmm5, xmm3              ; xmm5=(62 63 72 73)
+
+    movaps      xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)]
+    movaps      xmm7, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)]
+    movaps      xmm1, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FAST_FLOAT)]
+    movaps      xmm3, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FAST_FLOAT)]
+
+    ; xmm6=(00 10 20 30), xmm1=(40 50 60 70)
+    ; xmm7=(01 11 21 31), xmm3=(41 51 61 71)
+
+    movaps      XMMWORD [wk(0)], xmm4   ; wk(0)=(22 23 32 33)
+    movaps      XMMWORD [wk(1)], xmm2   ; wk(1)=(42 43 52 53)
+
+    movaps      xmm4, xmm6              ; transpose coefficients(phase 1)
+    unpcklps    xmm6, xmm7              ; xmm6=(00 01 10 11)
+    unpckhps    xmm4, xmm7              ; xmm4=(20 21 30 31)
+    movaps      xmm2, xmm1              ; transpose coefficients(phase 1)
+    unpcklps    xmm1, xmm3              ; xmm1=(40 41 50 51)
+    unpckhps    xmm2, xmm3              ; xmm2=(60 61 70 71)
+
+    movaps      xmm7, xmm6              ; transpose coefficients(phase 2)
+    unpcklps2   xmm6, xmm0              ; xmm6=(00 01 02 03)=data0
+    unpckhps2   xmm7, xmm0              ; xmm7=(10 11 12 13)=data1
+    movaps      xmm3, xmm2              ; transpose coefficients(phase 2)
+    unpcklps2   xmm2, xmm5              ; xmm2=(60 61 62 63)=data6
+    unpckhps2   xmm3, xmm5              ; xmm3=(70 71 72 73)=data7
+
+    movaps      xmm0, xmm7
+    movaps      xmm5, xmm6
+    subps       xmm7, xmm2              ; xmm7=data1-data6=tmp6
+    subps       xmm6, xmm3              ; xmm6=data0-data7=tmp7
+    addps       xmm0, xmm2              ; xmm0=data1+data6=tmp1
+    addps       xmm5, xmm3              ; xmm5=data0+data7=tmp0
+
+    movaps      xmm2, XMMWORD [wk(0)]   ; xmm2=(22 23 32 33)
+    movaps      xmm3, XMMWORD [wk(1)]   ; xmm3=(42 43 52 53)
+    movaps      XMMWORD [wk(0)], xmm7   ; wk(0)=tmp6
+    movaps      XMMWORD [wk(1)], xmm6   ; wk(1)=tmp7
+
+    movaps      xmm7, xmm4              ; transpose coefficients(phase 2)
+    unpcklps2   xmm4, xmm2              ; xmm4=(20 21 22 23)=data2
+    unpckhps2   xmm7, xmm2              ; xmm7=(30 31 32 33)=data3
+    movaps      xmm6, xmm1              ; transpose coefficients(phase 2)
+    unpcklps2   xmm1, xmm3              ; xmm1=(40 41 42 43)=data4
+    unpckhps2   xmm6, xmm3              ; xmm6=(50 51 52 53)=data5
+
+    movaps      xmm2, xmm7
+    movaps      xmm3, xmm4
+    addps       xmm7, xmm1              ; xmm7=data3+data4=tmp3
+    addps       xmm4, xmm6              ; xmm4=data2+data5=tmp2
+    subps       xmm2, xmm1              ; xmm2=data3-data4=tmp4
+    subps       xmm3, xmm6              ; xmm3=data2-data5=tmp5
+
+    ; -- Even part
+
+    movaps      xmm1, xmm5
+    movaps      xmm6, xmm0
+    subps       xmm5, xmm7              ; xmm5=tmp13
+    subps       xmm0, xmm4              ; xmm0=tmp12
+    addps       xmm1, xmm7              ; xmm1=tmp10
+    addps       xmm6, xmm4              ; xmm6=tmp11
+
+    addps       xmm0, xmm5
+    mulps       xmm0, [rel PD_0_707]    ; xmm0=z1
+
+    movaps      xmm7, xmm1
+    movaps      xmm4, xmm5
+    subps       xmm1, xmm6              ; xmm1=data4
+    subps       xmm5, xmm0              ; xmm5=data6
+    addps       xmm7, xmm6              ; xmm7=data0
+    addps       xmm4, xmm0              ; xmm4=data2
+
+    movaps      XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FAST_FLOAT)], xmm1
+    movaps      XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FAST_FLOAT)], xmm5
+    movaps      XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)], xmm7
+    movaps      XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)], xmm4
+
+    ; -- Odd part
+
+    movaps      xmm6, XMMWORD [wk(0)]   ; xmm6=tmp6
+    movaps      xmm0, XMMWORD [wk(1)]   ; xmm0=tmp7
+
+    addps       xmm2, xmm3              ; xmm2=tmp10
+    addps       xmm3, xmm6              ; xmm3=tmp11
+    addps       xmm6, xmm0              ; xmm6=tmp12, xmm0=tmp7
+
+    mulps       xmm3, [rel PD_0_707]    ; xmm3=z3
+
+    movaps      xmm1, xmm2              ; xmm1=tmp10
+    subps       xmm2, xmm6
+    mulps       xmm2, [rel PD_0_382]    ; xmm2=z5
+    mulps       xmm1, [rel PD_0_541]    ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
+    mulps       xmm6, [rel PD_1_306]    ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
+    addps       xmm1, xmm2              ; xmm1=z2
+    addps       xmm6, xmm2              ; xmm6=z4
+
+    movaps      xmm5, xmm0
+    subps       xmm0, xmm3              ; xmm0=z13
+    addps       xmm5, xmm3              ; xmm5=z11
+
+    movaps      xmm7, xmm0
+    movaps      xmm4, xmm5
+    subps       xmm0, xmm1              ; xmm0=data3
+    subps       xmm5, xmm6              ; xmm5=data7
+    addps       xmm7, xmm1              ; xmm7=data5
+    addps       xmm4, xmm6              ; xmm4=data1
+
+    movaps      XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)], xmm0
+    movaps      XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FAST_FLOAT)], xmm5
+    movaps      XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FAST_FLOAT)], xmm7
+    movaps      XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)], xmm4
+
+    add         rdx, byte 4*SIZEOF_FAST_FLOAT
+    dec         rcx
+    jnz         near .columnloop
+
+    uncollect_args 1
+    mov         rsp, rbp                ; rsp <- aligned rbp
+    pop         rsp                     ; rsp <- original rbp
+    pop         rbp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/x86_64/jfdctfst-sse2.asm b/media/libjpeg/simd/x86_64/jfdctfst-sse2.asm
new file mode 100644
index 0000000000..2e1bfe6e8c
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jfdctfst-sse2.asm
@@ -0,0 +1,389 @@
+;
+; jfdctfst.asm - fast integer FDCT (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a fast, not so accurate integer implementation of
+; the forward DCT (Discrete Cosine Transform). The following code is
+; based directly on the IJG's original jfdctfst.c; see the jfdctfst.c
+; for more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS  8  ; 14 is also OK.
+
+%if CONST_BITS == 8
+F_0_382 equ  98  ; FIX(0.382683433)
+F_0_541 equ 139  ; FIX(0.541196100)
+F_0_707 equ 181  ; FIX(0.707106781)
+F_1_306 equ 334  ; FIX(1.306562965)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x, n)  (((x) + (1 << ((n) - 1))) >> (n))
+F_0_382 equ DESCALE( 410903207, 30 - CONST_BITS)  ; FIX(0.382683433)
+F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS)  ; FIX(0.541196100)
+F_0_707 equ DESCALE( 759250124, 30 - CONST_BITS)  ; FIX(0.707106781)
+F_1_306 equ DESCALE(1402911301, 30 - CONST_BITS)  ; FIX(1.306562965)
+%endif
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
+; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
+
+%define PRE_MULTIPLY_SCALE_BITS  2
+%define CONST_SHIFT              (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
+
+    alignz      32
+    GLOBAL_DATA(jconst_fdct_ifast_sse2)
+
+EXTN(jconst_fdct_ifast_sse2):
+
+PW_F0707 times 8 dw F_0_707 << CONST_SHIFT
+PW_F0382 times 8 dw F_0_382 << CONST_SHIFT
+PW_F0541 times 8 dw F_0_541 << CONST_SHIFT
+PW_F1306 times 8 dw F_1_306 << CONST_SHIFT
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        64
+;
+; Perform the forward DCT on one block of samples.
+;
+; GLOBAL(void)
+; jsimd_fdct_ifast_sse2(DCTELEM *data)
+;
+
+; r10 = DCTELEM *data
+
+%define wk(i)   rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD  ; xmmword wk[WK_NUM]
+%define WK_NUM  2
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_fdct_ifast_sse2)
+
+EXTN(jsimd_fdct_ifast_sse2):
+    push        rbp
+    mov         rax, rsp                     ; rax = original rbp
+    sub         rsp, byte 4
+    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [rsp], rax
+    mov         rbp, rsp                     ; rbp = aligned rbp
+    lea         rsp, [wk(0)]
+    collect_args 1
+
+    ; ---- Pass 1: process rows.
+
+    mov         rdx, r10                ; (DCTELEM *)
+
+    movdqa      xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)]
+    movdqa      xmm2, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)]
+    movdqa      xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)]
+
+    ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
+    ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)
+
+    movdqa      xmm4, xmm0              ; transpose coefficients(phase 1)
+    punpcklwd   xmm0, xmm1              ; xmm0=(00 10 01 11 02 12 03 13)
+    punpckhwd   xmm4, xmm1              ; xmm4=(04 14 05 15 06 16 07 17)
+    movdqa      xmm5, xmm2              ; transpose coefficients(phase 1)
+    punpcklwd   xmm2, xmm3              ; xmm2=(20 30 21 31 22 32 23 33)
+    punpckhwd   xmm5, xmm3              ; xmm5=(24 34 25 35 26 36 27 37)
+
+    movdqa      xmm6, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)]
+    movdqa      xmm7, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)]
+    movdqa      xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)]
+
+    ; xmm6=( 4 12 20 28 36 44 52 60), xmm1=( 6 14 22 30 38 46 54 62)
+    ; xmm7=( 5 13 21 29 37 45 53 61), xmm3=( 7 15 23 31 39 47 55 63)
+
+    movdqa      XMMWORD [wk(0)], xmm2   ; wk(0)=(20 30 21 31 22 32 23 33)
+    movdqa      XMMWORD [wk(1)], xmm5   ; wk(1)=(24 34 25 35 26 36 27 37)
+
+    movdqa      xmm2, xmm6              ; transpose coefficients(phase 1)
+    punpcklwd   xmm6, xmm7              ; xmm6=(40 50 41 51 42 52 43 53)
+    punpckhwd   xmm2, xmm7              ; xmm2=(44 54 45 55 46 56 47 57)
+    movdqa      xmm5, xmm1              ; transpose coefficients(phase 1)
+    punpcklwd   xmm1, xmm3              ; xmm1=(60 70 61 71 62 72 63 73)
+    punpckhwd   xmm5, xmm3              ; xmm5=(64 74 65 75 66 76 67 77)
+
+    movdqa      xmm7, xmm6              ; transpose coefficients(phase 2)
+    punpckldq   xmm6, xmm1              ; xmm6=(40 50 60 70 41 51 61 71)
+    punpckhdq   xmm7, xmm1              ; xmm7=(42 52 62 72 43 53 63 73)
+    movdqa      xmm3, xmm2              ; transpose coefficients(phase 2)
+    punpckldq   xmm2, xmm5              ; xmm2=(44 54 64 74 45 55 65 75)
+    punpckhdq   xmm3, xmm5              ; xmm3=(46 56 66 76 47 57 67 77)
+
+    movdqa      xmm1, XMMWORD [wk(0)]   ; xmm1=(20 30 21 31 22 32 23 33)
+    movdqa      xmm5, XMMWORD [wk(1)]   ; xmm5=(24 34 25 35 26 36 27 37)
+    movdqa      XMMWORD [wk(0)], xmm7   ; wk(0)=(42 52 62 72 43 53 63 73)
+    movdqa      XMMWORD [wk(1)], xmm2   ; wk(1)=(44 54 64 74 45 55 65 75)
+
+    movdqa      xmm7, xmm0              ; transpose coefficients(phase 2)
+    punpckldq   xmm0, xmm1              ; xmm0=(00 10 20 30 01 11 21 31)
+    punpckhdq   xmm7, xmm1              ; xmm7=(02 12 22 32 03 13 23 33)
+    movdqa      xmm2, xmm4              ; transpose coefficients(phase 2)
+    punpckldq   xmm4, xmm5              ; xmm4=(04 14 24 34 05 15 25 35)
+    punpckhdq   xmm2, xmm5              ; xmm2=(06 16 26 36 07 17 27 37)
+
+    movdqa      xmm1, xmm0              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm0, xmm6              ; xmm0=(00 10 20 30 40 50 60 70)=data0
+    punpckhqdq  xmm1, xmm6              ; xmm1=(01 11 21 31 41 51 61 71)=data1
+    movdqa      xmm5, xmm2              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm2, xmm3              ; xmm2=(06 16 26 36 46 56 66 76)=data6
+    punpckhqdq  xmm5, xmm3              ; xmm5=(07 17 27 37 47 57 67 77)=data7
+
+    movdqa      xmm6, xmm1
+    movdqa      xmm3, xmm0
+    psubw       xmm1, xmm2              ; xmm1=data1-data6=tmp6
+    psubw       xmm0, xmm5              ; xmm0=data0-data7=tmp7
+    paddw       xmm6, xmm2              ; xmm6=data1+data6=tmp1
+    paddw       xmm3, xmm5              ; xmm3=data0+data7=tmp0
+
+    movdqa      xmm2, XMMWORD [wk(0)]   ; xmm2=(42 52 62 72 43 53 63 73)
+    movdqa      xmm5, XMMWORD [wk(1)]   ; xmm5=(44 54 64 74 45 55 65 75)
+    movdqa      XMMWORD [wk(0)], xmm1   ; wk(0)=tmp6
+    movdqa      XMMWORD [wk(1)], xmm0   ; wk(1)=tmp7
+
+    movdqa      xmm1, xmm7              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm7, xmm2              ; xmm7=(02 12 22 32 42 52 62 72)=data2
+    punpckhqdq  xmm1, xmm2              ; xmm1=(03 13 23 33 43 53 63 73)=data3
+    movdqa      xmm0, xmm4              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm4, xmm5              ; xmm4=(04 14 24 34 44 54 64 74)=data4
+    punpckhqdq  xmm0, xmm5              ; xmm0=(05 15 25 35 45 55 65 75)=data5
+
+    movdqa      xmm2, xmm1
+    movdqa      xmm5, xmm7
+    paddw       xmm1, xmm4              ; xmm1=data3+data4=tmp3
+    paddw       xmm7, xmm0              ; xmm7=data2+data5=tmp2
+    psubw       xmm2, xmm4              ; xmm2=data3-data4=tmp4
+    psubw       xmm5, xmm0              ; xmm5=data2-data5=tmp5
+
+    ; -- Even part
+
+    movdqa      xmm4, xmm3
+    movdqa      xmm0, xmm6
+    psubw       xmm3, xmm1              ; xmm3=tmp13
+    psubw       xmm6, xmm7              ; xmm6=tmp12
+    paddw       xmm4, xmm1              ; xmm4=tmp10
+    paddw       xmm0, xmm7              ; xmm0=tmp11
+
+    paddw       xmm6, xmm3
+    psllw       xmm6, PRE_MULTIPLY_SCALE_BITS
+    pmulhw      xmm6, [rel PW_F0707]    ; xmm6=z1
+
+    movdqa      xmm1, xmm4
+    movdqa      xmm7, xmm3
+    psubw       xmm4, xmm0              ; xmm4=data4
+    psubw       xmm3, xmm6              ; xmm3=data6
+    paddw       xmm1, xmm0              ; xmm1=data0
+    paddw       xmm7, xmm6              ; xmm7=data2
+
+    movdqa      xmm0, XMMWORD [wk(0)]   ; xmm0=tmp6
+    movdqa      xmm6, XMMWORD [wk(1)]   ; xmm6=tmp7
+    movdqa      XMMWORD [wk(0)], xmm4   ; wk(0)=data4
+    movdqa      XMMWORD [wk(1)], xmm3   ; wk(1)=data6
+
+    ; -- Odd part
+
+    paddw       xmm2, xmm5              ; xmm2=tmp10
+    paddw       xmm5, xmm0              ; xmm5=tmp11
+    paddw       xmm0, xmm6              ; xmm0=tmp12, xmm6=tmp7
+
+    psllw       xmm2, PRE_MULTIPLY_SCALE_BITS
+    psllw       xmm0, PRE_MULTIPLY_SCALE_BITS
+
+    psllw       xmm5, PRE_MULTIPLY_SCALE_BITS
+    pmulhw      xmm5, [rel PW_F0707]    ; xmm5=z3
+
+    movdqa      xmm4, xmm2              ; xmm4=tmp10
+    psubw       xmm2, xmm0
+    pmulhw      xmm2, [rel PW_F0382]    ; xmm2=z5
+    pmulhw      xmm4, [rel PW_F0541]    ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
+    pmulhw      xmm0, [rel PW_F1306]    ; xmm0=MULTIPLY(tmp12,FIX_1_306562)
+    paddw       xmm4, xmm2              ; xmm4=z2
+    paddw       xmm0, xmm2              ; xmm0=z4
+
+    movdqa      xmm3, xmm6
+    psubw       xmm6, xmm5              ; xmm6=z13
+    paddw       xmm3, xmm5              ; xmm3=z11
+
+    movdqa      xmm2, xmm6
+    movdqa      xmm5, xmm3
+    psubw       xmm6, xmm4              ; xmm6=data3
+    psubw       xmm3, xmm0              ; xmm3=data7
+    paddw       xmm2, xmm4              ; xmm2=data5
+    paddw       xmm5, xmm0              ; xmm5=data1
+
+    ; ---- Pass 2: process columns.
+
+    ; xmm1=(00 10 20 30 40 50 60 70), xmm7=(02 12 22 32 42 52 62 72)
+    ; xmm5=(01 11 21 31 41 51 61 71), xmm6=(03 13 23 33 43 53 63 73)
+
+    movdqa      xmm4, xmm1              ; transpose coefficients(phase 1)
+    punpcklwd   xmm1, xmm5              ; xmm1=(00 01 10 11 20 21 30 31)
+    punpckhwd   xmm4, xmm5              ; xmm4=(40 41 50 51 60 61 70 71)
+    movdqa      xmm0, xmm7              ; transpose coefficients(phase 1)
+    punpcklwd   xmm7, xmm6              ; xmm7=(02 03 12 13 22 23 32 33)
+    punpckhwd   xmm0, xmm6              ; xmm0=(42 43 52 53 62 63 72 73)
+
+    movdqa      xmm5, XMMWORD [wk(0)]   ; xmm5=col4
+    movdqa      xmm6, XMMWORD [wk(1)]   ; xmm6=col6
+
+    ; xmm5=(04 14 24 34 44 54 64 74), xmm6=(06 16 26 36 46 56 66 76)
+    ; xmm2=(05 15 25 35 45 55 65 75), xmm3=(07 17 27 37 47 57 67 77)
+
+    movdqa      XMMWORD [wk(0)], xmm7   ; wk(0)=(02 03 12 13 22 23 32 33)
+    movdqa      XMMWORD [wk(1)], xmm0   ; wk(1)=(42 43 52 53 62 63 72 73)
+
+    movdqa      xmm7, xmm5              ; transpose coefficients(phase 1)
+    punpcklwd   xmm5, xmm2              ; xmm5=(04 05 14 15 24 25 34 35)
+    punpckhwd   xmm7, xmm2              ; xmm7=(44 45 54 55 64 65 74 75)
+    movdqa      xmm0, xmm6              ; transpose coefficients(phase 1)
+    punpcklwd   xmm6, xmm3              ; xmm6=(06 07 16 17 26 27 36 37)
+    punpckhwd   xmm0, xmm3              ; xmm0=(46 47 56 57 66 67 76 77)
+
+    movdqa      xmm2, xmm5              ; transpose coefficients(phase 2)
+    punpckldq   xmm5, xmm6              ; xmm5=(04 05 06 07 14 15 16 17)
+    punpckhdq   xmm2, xmm6              ; xmm2=(24 25 26 27 34 35 36 37)
+    movdqa      xmm3, xmm7              ; transpose coefficients(phase 2)
+    punpckldq   xmm7, xmm0              ; xmm7=(44 45 46 47 54 55 56 57)
+    punpckhdq   xmm3, xmm0              ; xmm3=(64 65 66 67 74 75 76 77)
+
+    movdqa      xmm6, XMMWORD [wk(0)]   ; xmm6=(02 03 12 13 22 23 32 33)
+    movdqa      xmm0, XMMWORD [wk(1)]   ; xmm0=(42 43 52 53 62 63 72 73)
+    movdqa      XMMWORD [wk(0)], xmm2   ; wk(0)=(24 25 26 27 34 35 36 37)
+    movdqa      XMMWORD [wk(1)], xmm7   ; wk(1)=(44 45 46 47 54 55 56 57)
+
+    movdqa      xmm2, xmm1              ; transpose coefficients(phase 2)
+    punpckldq   xmm1, xmm6              ; xmm1=(00 01 02 03 10 11 12 13)
+    punpckhdq   xmm2, xmm6              ; xmm2=(20 21 22 23 30 31 32 33)
+    movdqa      xmm7, xmm4              ; transpose coefficients(phase 2)
+    punpckldq   xmm4, xmm0              ; xmm4=(40 41 42 43 50 51 52 53)
+    punpckhdq   xmm7, xmm0              ; xmm7=(60 61 62 63 70 71 72 73)
+
+    movdqa      xmm6, xmm1              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm1, xmm5              ; xmm1=(00 01 02 03 04 05 06 07)=data0
+    punpckhqdq  xmm6, xmm5              ; xmm6=(10 11 12 13 14 15 16 17)=data1
+    movdqa      xmm0, xmm7              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm7, xmm3              ; xmm7=(60 61 62 63 64 65 66 67)=data6
+    punpckhqdq  xmm0, xmm3              ; xmm0=(70 71 72 73 74 75 76 77)=data7
+
+    movdqa      xmm5, xmm6
+    movdqa      xmm3, xmm1
+    psubw       xmm6, xmm7              ; xmm6=data1-data6=tmp6
+    psubw       xmm1, xmm0              ; xmm1=data0-data7=tmp7
+    paddw       xmm5, xmm7              ; xmm5=data1+data6=tmp1
+    paddw       xmm3, xmm0              ; xmm3=data0+data7=tmp0
+
+    movdqa      xmm7, XMMWORD [wk(0)]   ; xmm7=(24 25 26 27 34 35 36 37)
+    movdqa      xmm0, XMMWORD [wk(1)]   ; xmm0=(44 45 46 47 54 55 56 57)
+    movdqa      XMMWORD [wk(0)], xmm6   ; wk(0)=tmp6
+    movdqa      XMMWORD [wk(1)], xmm1   ; wk(1)=tmp7
+
+    movdqa      xmm6, xmm2              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm2, xmm7              ; xmm2=(20 21 22 23 24 25 26 27)=data2
+    punpckhqdq  xmm6, xmm7              ; xmm6=(30 31 32 33 34 35 36 37)=data3
+    movdqa      xmm1, xmm4              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm4, xmm0              ; xmm4=(40 41 42 43 44 45 46 47)=data4
+    punpckhqdq  xmm1, xmm0              ; xmm1=(50 51 52 53 54 55 56 57)=data5
+
+    movdqa      xmm7, xmm6
+    movdqa      xmm0, xmm2
+    paddw       xmm6, xmm4              ; xmm6=data3+data4=tmp3
+    paddw       xmm2, xmm1              ; xmm2=data2+data5=tmp2
+    psubw       xmm7, xmm4              ; xmm7=data3-data4=tmp4
+    psubw       xmm0, xmm1              ; xmm0=data2-data5=tmp5
+
+    ; -- Even part
+
+    movdqa      xmm4, xmm3
+    movdqa      xmm1, xmm5
+    psubw       xmm3, xmm6              ; xmm3=tmp13
+    psubw       xmm5, xmm2              ; xmm5=tmp12
+    paddw       xmm4, xmm6              ; xmm4=tmp10
+    paddw       xmm1, xmm2              ; xmm1=tmp11
+
+    paddw       xmm5, xmm3
+    psllw       xmm5, PRE_MULTIPLY_SCALE_BITS
+    pmulhw      xmm5, [rel PW_F0707]    ; xmm5=z1
+
+    movdqa      xmm6, xmm4
+    movdqa      xmm2, xmm3
+    psubw       xmm4, xmm1              ; xmm4=data4
+    psubw       xmm3, xmm5              ; xmm3=data6
+    paddw       xmm6, xmm1              ; xmm6=data0
+    paddw       xmm2, xmm5              ; xmm2=data2
+
+    movdqa      XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)], xmm4
+    movdqa      XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)], xmm3
+    movdqa      XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)], xmm6
+    movdqa      XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)], xmm2
+
+    ; -- Odd part
+
+    movdqa      xmm1, XMMWORD [wk(0)]   ; xmm1=tmp6
+    movdqa      xmm5, XMMWORD [wk(1)]   ; xmm5=tmp7
+
+    paddw       xmm7, xmm0              ; xmm7=tmp10
+    paddw       xmm0, xmm1              ; xmm0=tmp11
+    paddw       xmm1, xmm5              ; xmm1=tmp12, xmm5=tmp7
+
+    psllw       xmm7, PRE_MULTIPLY_SCALE_BITS
+    psllw       xmm1, PRE_MULTIPLY_SCALE_BITS
+
+    psllw       xmm0, PRE_MULTIPLY_SCALE_BITS
+    pmulhw      xmm0, [rel PW_F0707]    ; xmm0=z3
+
+    movdqa      xmm4, xmm7              ; xmm4=tmp10
+    psubw       xmm7, xmm1
+    pmulhw      xmm7, [rel PW_F0382]    ; xmm7=z5
+    pmulhw      xmm4, [rel PW_F0541]    ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
+    pmulhw      xmm1, [rel PW_F1306]    ; xmm1=MULTIPLY(tmp12,FIX_1_306562)
+    paddw       xmm4, xmm7              ; xmm4=z2
+    paddw       xmm1, xmm7              ; xmm1=z4
+
+    movdqa      xmm3, xmm5
+    psubw       xmm5, xmm0              ; xmm5=z13
+    paddw       xmm3, xmm0              ; xmm3=z11
+
+    movdqa      xmm6, xmm5
+    movdqa      xmm2, xmm3
+    psubw       xmm5, xmm4              ; xmm5=data3
+    psubw       xmm3, xmm1              ; xmm3=data7
+    paddw       xmm6, xmm4              ; xmm6=data5
+    paddw       xmm2, xmm1              ; xmm2=data1
+
+    movdqa      XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)], xmm5
+    movdqa      XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)], xmm3
+    movdqa      XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)], xmm6
+    movdqa      XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)], xmm2
+
+    uncollect_args 1
+    mov         rsp, rbp                ; rsp <- aligned rbp
+    pop         rsp                     ; rsp <- original rbp
+    pop         rbp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/x86_64/jfdctint-avx2.asm b/media/libjpeg/simd/x86_64/jfdctint-avx2.asm
new file mode 100644
index 0000000000..e56258b48a
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jfdctint-avx2.asm
@@ -0,0 +1,320 @@
+;
+; jfdctint.asm - accurate integer FDCT (64-bit AVX2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, 2018, 2020, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a slower but more accurate integer implementation of the
+; forward DCT (Discrete Cosine Transform). The following code is based
+; directly on the IJG's original jfdctint.c; see the jfdctint.c for
+; more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS  13
+%define PASS1_BITS  2
+
+%define DESCALE_P1  (CONST_BITS - PASS1_BITS)
+%define DESCALE_P2  (CONST_BITS + PASS1_BITS)
+
+%if CONST_BITS == 13
+F_0_298 equ  2446  ; FIX(0.298631336)
+F_0_390 equ  3196  ; FIX(0.390180644)
+F_0_541 equ  4433  ; FIX(0.541196100)
+F_0_765 equ  6270  ; FIX(0.765366865)
+F_0_899 equ  7373  ; FIX(0.899976223)
+F_1_175 equ  9633  ; FIX(1.175875602)
+F_1_501 equ 12299  ; FIX(1.501321110)
+F_1_847 equ 15137  ; FIX(1.847759065)
+F_1_961 equ 16069  ; FIX(1.961570560)
+F_2_053 equ 16819  ; FIX(2.053119869)
+F_2_562 equ 20995  ; FIX(2.562915447)
+F_3_072 equ 25172  ; FIX(3.072711026)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x, n)  (((x) + (1 << ((n) - 1))) >> (n))
+F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS)  ; FIX(0.298631336)
+F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS)  ; FIX(0.390180644)
+F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS)  ; FIX(0.541196100)
+F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS)  ; FIX(0.765366865)
+F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS)  ; FIX(0.899976223)
+F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS)  ; FIX(1.175875602)
+F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS)  ; FIX(1.501321110)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS)  ; FIX(1.847759065)
+F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS)  ; FIX(1.961570560)
+F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS)  ; FIX(2.053119869)
+F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS)  ; FIX(2.562915447)
+F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS)  ; FIX(3.072711026)
+%endif
+
+; --------------------------------------------------------------------------
+; In-place 8x8x16-bit matrix transpose using AVX2 instructions
+; %1-%4: Input/output registers
+; %5-%8: Temp registers
+
+%macro dotranspose 8
+    ; %1=(00 01 02 03 04 05 06 07  40 41 42 43 44 45 46 47)
+    ; %2=(10 11 12 13 14 15 16 17  50 51 52 53 54 55 56 57)
+    ; %3=(20 21 22 23 24 25 26 27  60 61 62 63 64 65 66 67)
+    ; %4=(30 31 32 33 34 35 36 37  70 71 72 73 74 75 76 77)
+
+    vpunpcklwd  %5, %1, %2
+    vpunpckhwd  %6, %1, %2
+    vpunpcklwd  %7, %3, %4
+    vpunpckhwd  %8, %3, %4
+    ; transpose coefficients(phase 1)
+    ; %5=(00 10 01 11 02 12 03 13  40 50 41 51 42 52 43 53)
+    ; %6=(04 14 05 15 06 16 07 17  44 54 45 55 46 56 47 57)
+    ; %7=(20 30 21 31 22 32 23 33  60 70 61 71 62 72 63 73)
+    ; %8=(24 34 25 35 26 36 27 37  64 74 65 75 66 76 67 77)
+
+    vpunpckldq  %1, %5, %7
+    vpunpckhdq  %2, %5, %7
+    vpunpckldq  %3, %6, %8
+    vpunpckhdq  %4, %6, %8
+    ; transpose coefficients(phase 2)
+    ; %1=(00 10 20 30 01 11 21 31  40 50 60 70 41 51 61 71)
+    ; %2=(02 12 22 32 03 13 23 33  42 52 62 72 43 53 63 73)
+    ; %3=(04 14 24 34 05 15 25 35  44 54 64 74 45 55 65 75)
+    ; %4=(06 16 26 36 07 17 27 37  46 56 66 76 47 57 67 77)
+
+    vpermq      %1, %1, 0x8D
+    vpermq      %2, %2, 0x8D
+    vpermq      %3, %3, 0xD8
+    vpermq      %4, %4, 0xD8
+    ; transpose coefficients(phase 3)
+    ; %1=(01 11 21 31 41 51 61 71  00 10 20 30 40 50 60 70)
+    ; %2=(03 13 23 33 43 53 63 73  02 12 22 32 42 52 62 72)
+    ; %3=(04 14 24 34 44 54 64 74  05 15 25 35 45 55 65 75)
+    ; %4=(06 16 26 36 46 56 66 76  07 17 27 37 47 57 67 77)
+%endmacro
+
+; --------------------------------------------------------------------------
+; In-place 8x8x16-bit accurate integer forward DCT using AVX2 instructions
+; %1-%4: Input/output registers
+; %5-%8: Temp registers
+; %9:    Pass (1 or 2)
+
+%macro dodct 9
+    vpsubw      %5, %1, %4              ; %5=data1_0-data6_7=tmp6_7
+    vpaddw      %6, %1, %4              ; %6=data1_0+data6_7=tmp1_0
+    vpaddw      %7, %2, %3              ; %7=data3_2+data4_5=tmp3_2
+    vpsubw      %8, %2, %3              ; %8=data3_2-data4_5=tmp4_5
+
+    ; -- Even part
+
+    vperm2i128  %6, %6, %6, 0x01        ; %6=tmp0_1
+    vpaddw      %1, %6, %7              ; %1=tmp0_1+tmp3_2=tmp10_11
+    vpsubw      %6, %6, %7              ; %6=tmp0_1-tmp3_2=tmp13_12
+
+    vperm2i128  %7, %1, %1, 0x01        ; %7=tmp11_10
+    vpsignw     %1, %1, [rel PW_1_NEG1]  ; %1=tmp10_neg11
+    vpaddw      %7, %7, %1              ; %7=(tmp10+tmp11)_(tmp10-tmp11)
+%if %9 == 1
+    vpsllw      %1, %7, PASS1_BITS      ; %1=data0_4
+%else
+    vpaddw      %7, %7, [rel PW_DESCALE_P2X]
+    vpsraw      %1, %7, PASS1_BITS      ; %1=data0_4
+%endif
+
+    ; (Original)
+    ; z1 = (tmp12 + tmp13) * 0.541196100;
+    ; data2 = z1 + tmp13 * 0.765366865;
+    ; data6 = z1 + tmp12 * -1.847759065;
+    ;
+    ; (This implementation)
+    ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
+    ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
+
+    vperm2i128  %7, %6, %6, 0x01        ; %7=tmp12_13
+    vpunpcklwd  %2, %6, %7
+    vpunpckhwd  %6, %6, %7
+    vpmaddwd    %2, %2, [rel PW_F130_F054_MF130_F054]  ; %2=data2_6L
+    vpmaddwd    %6, %6, [rel PW_F130_F054_MF130_F054]  ; %6=data2_6H
+
+    vpaddd      %2, %2, [rel PD_DESCALE_P %+ %9]
+    vpaddd      %6, %6, [rel PD_DESCALE_P %+ %9]
+    vpsrad      %2, %2, DESCALE_P %+ %9
+    vpsrad      %6, %6, DESCALE_P %+ %9
+
+    vpackssdw   %3, %2, %6              ; %6=data2_6
+
+    ; -- Odd part
+
+    vpaddw      %7, %8, %5              ; %7=tmp4_5+tmp6_7=z3_4
+
+    ; (Original)
+    ; z5 = (z3 + z4) * 1.175875602;
+    ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+    ; z3 += z5;  z4 += z5;
+    ;
+    ; (This implementation)
+    ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+    ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+    vperm2i128  %2, %7, %7, 0x01        ; %2=z4_3
+    vpunpcklwd  %6, %7, %2
+    vpunpckhwd  %7, %7, %2
+    vpmaddwd    %6, %6, [rel PW_MF078_F117_F078_F117]  ; %6=z3_4L
+    vpmaddwd    %7, %7, [rel PW_MF078_F117_F078_F117]  ; %7=z3_4H
+
+    ; (Original)
+    ; z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6;
+    ; tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869;
+    ; tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110;
+    ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+    ; data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4;
+    ; data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4;
+    ;
+    ; (This implementation)
+    ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
+    ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
+    ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
+    ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
+    ; data7 = tmp4 + z3;  data5 = tmp5 + z4;
+    ; data3 = tmp6 + z3;  data1 = tmp7 + z4;
+
+    vperm2i128  %4, %5, %5, 0x01        ; %4=tmp7_6
+    vpunpcklwd  %2, %8, %4
+    vpunpckhwd  %4, %8, %4
+    vpmaddwd    %2, %2, [rel PW_MF060_MF089_MF050_MF256]  ; %2=tmp4_5L
+    vpmaddwd    %4, %4, [rel PW_MF060_MF089_MF050_MF256]  ; %4=tmp4_5H
+
+    vpaddd      %2, %2, %6              ; %2=data7_5L
+    vpaddd      %4, %4, %7              ; %4=data7_5H
+
+    vpaddd      %2, %2, [rel PD_DESCALE_P %+ %9]
+    vpaddd      %4, %4, [rel PD_DESCALE_P %+ %9]
+    vpsrad      %2, %2, DESCALE_P %+ %9
+    vpsrad      %4, %4, DESCALE_P %+ %9
+
+    vpackssdw   %4, %2, %4              ; %4=data7_5
+
+    vperm2i128  %2, %8, %8, 0x01        ; %2=tmp5_4
+    vpunpcklwd  %8, %5, %2
+    vpunpckhwd  %5, %5, %2
+    vpmaddwd    %8, %8, [rel PW_F050_MF256_F060_MF089]  ; %8=tmp6_7L
+    vpmaddwd    %5, %5, [rel PW_F050_MF256_F060_MF089]  ; %5=tmp6_7H
+
+    vpaddd      %8, %8, %6              ; %8=data3_1L
+    vpaddd      %5, %5, %7              ; %5=data3_1H
+
+    vpaddd      %8, %8, [rel PD_DESCALE_P %+ %9]
+    vpaddd      %5, %5, [rel PD_DESCALE_P %+ %9]
+    vpsrad      %8, %8, DESCALE_P %+ %9
+    vpsrad      %5, %5, DESCALE_P %+ %9
+
+    vpackssdw   %2, %8, %5              ; %2=data3_1
+%endmacro
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_fdct_islow_avx2)
+
+EXTN(jconst_fdct_islow_avx2):
+
+PW_F130_F054_MF130_F054    times 4  dw  (F_0_541 + F_0_765),  F_0_541
+                           times 4  dw  (F_0_541 - F_1_847),  F_0_541
+PW_MF078_F117_F078_F117    times 4  dw  (F_1_175 - F_1_961),  F_1_175
+                           times 4  dw  (F_1_175 - F_0_390),  F_1_175
+PW_MF060_MF089_MF050_MF256 times 4  dw  (F_0_298 - F_0_899), -F_0_899
+                           times 4  dw  (F_2_053 - F_2_562), -F_2_562
+PW_F050_MF256_F060_MF089   times 4  dw  (F_3_072 - F_2_562), -F_2_562
+                           times 4  dw  (F_1_501 - F_0_899), -F_0_899
+PD_DESCALE_P1              times 8  dd  1 << (DESCALE_P1 - 1)
+PD_DESCALE_P2              times 8  dd  1 << (DESCALE_P2 - 1)
+PW_DESCALE_P2X             times 16 dw  1 << (PASS1_BITS - 1)
+PW_1_NEG1                  times 8  dw  1
+                           times 8  dw -1
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        64
+;
+; Perform the forward DCT on one block of samples.
+;
+; GLOBAL(void)
+; jsimd_fdct_islow_avx2(DCTELEM *data)
+;
+
+; r10 = DCTELEM *data
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_fdct_islow_avx2)
+
+EXTN(jsimd_fdct_islow_avx2):
+    push        rbp
+    mov         rax, rsp
+    mov         rbp, rsp
+    collect_args 1
+
+    ; ---- Pass 1: process rows.
+
+    vmovdqu     ymm4, YMMWORD [YMMBLOCK(0,0,r10,SIZEOF_DCTELEM)]
+    vmovdqu     ymm5, YMMWORD [YMMBLOCK(2,0,r10,SIZEOF_DCTELEM)]
+    vmovdqu     ymm6, YMMWORD [YMMBLOCK(4,0,r10,SIZEOF_DCTELEM)]
+    vmovdqu     ymm7, YMMWORD [YMMBLOCK(6,0,r10,SIZEOF_DCTELEM)]
+    ; ymm4=(00 01 02 03 04 05 06 07  10 11 12 13 14 15 16 17)
+    ; ymm5=(20 21 22 23 24 25 26 27  30 31 32 33 34 35 36 37)
+    ; ymm6=(40 41 42 43 44 45 46 47  50 51 52 53 54 55 56 57)
+    ; ymm7=(60 61 62 63 64 65 66 67  70 71 72 73 74 75 76 77)
+
+    vperm2i128  ymm0, ymm4, ymm6, 0x20
+    vperm2i128  ymm1, ymm4, ymm6, 0x31
+    vperm2i128  ymm2, ymm5, ymm7, 0x20
+    vperm2i128  ymm3, ymm5, ymm7, 0x31
+    ; ymm0=(00 01 02 03 04 05 06 07  40 41 42 43 44 45 46 47)
+    ; ymm1=(10 11 12 13 14 15 16 17  50 51 52 53 54 55 56 57)
+    ; ymm2=(20 21 22 23 24 25 26 27  60 61 62 63 64 65 66 67)
+    ; ymm3=(30 31 32 33 34 35 36 37  70 71 72 73 74 75 76 77)
+
+    dotranspose ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7
+
+    dodct       ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, 1
+    ; ymm0=data0_4, ymm1=data3_1, ymm2=data2_6, ymm3=data7_5
+
+    ; ---- Pass 2: process columns.
+
+    vperm2i128  ymm4, ymm1, ymm3, 0x20  ; ymm4=data3_7
+    vperm2i128  ymm1, ymm1, ymm3, 0x31  ; ymm1=data1_5
+
+    dotranspose ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7
+
+    dodct       ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7, 2
+    ; ymm0=data0_4, ymm1=data3_1, ymm2=data2_6, ymm4=data7_5
+
+    vperm2i128 ymm3, ymm0, ymm1, 0x30   ; ymm3=data0_1
+    vperm2i128 ymm5, ymm2, ymm1, 0x20   ; ymm5=data2_3
+    vperm2i128 ymm6, ymm0, ymm4, 0x31   ; ymm6=data4_5
+    vperm2i128 ymm7, ymm2, ymm4, 0x21   ; ymm7=data6_7
+
+    vmovdqu     YMMWORD [YMMBLOCK(0,0,r10,SIZEOF_DCTELEM)], ymm3
+    vmovdqu     YMMWORD [YMMBLOCK(2,0,r10,SIZEOF_DCTELEM)], ymm5
+    vmovdqu     YMMWORD [YMMBLOCK(4,0,r10,SIZEOF_DCTELEM)], ymm6
+    vmovdqu     YMMWORD [YMMBLOCK(6,0,r10,SIZEOF_DCTELEM)], ymm7
+
+    vzeroupper
+    uncollect_args 1
+    pop         rbp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/x86_64/jfdctint-sse2.asm b/media/libjpeg/simd/x86_64/jfdctint-sse2.asm
new file mode 100644
index 0000000000..ec1f383ccb
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jfdctint-sse2.asm
@@ -0,0 +1,619 @@
+;
+; jfdctint.asm - accurate integer FDCT (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, 2020, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a slower but more accurate integer implementation of the
+; forward DCT (Discrete Cosine Transform). The following code is based
+; directly on the IJG's original jfdctint.c; see the jfdctint.c for
+; more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS  13
+%define PASS1_BITS  2
+
+%define DESCALE_P1  (CONST_BITS - PASS1_BITS)
+%define DESCALE_P2  (CONST_BITS + PASS1_BITS)
+
+%if CONST_BITS == 13
+F_0_298 equ  2446  ; FIX(0.298631336)
+F_0_390 equ  3196  ; FIX(0.390180644)
+F_0_541 equ  4433  ; FIX(0.541196100)
+F_0_765 equ  6270  ; FIX(0.765366865)
+F_0_899 equ  7373  ; FIX(0.899976223)
+F_1_175 equ  9633  ; FIX(1.175875602)
+F_1_501 equ 12299  ; FIX(1.501321110)
+F_1_847 equ 15137  ; FIX(1.847759065)
+F_1_961 equ 16069  ; FIX(1.961570560)
+F_2_053 equ 16819  ; FIX(2.053119869)
+F_2_562 equ 20995  ; FIX(2.562915447)
+F_3_072 equ 25172  ; FIX(3.072711026)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x, n)  (((x) + (1 << ((n) - 1))) >> (n))
+F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS)  ; FIX(0.298631336)
+F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS)  ; FIX(0.390180644)
+F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS)  ; FIX(0.541196100)
+F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS)  ; FIX(0.765366865)
+F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS)  ; FIX(0.899976223)
+F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS)  ; FIX(1.175875602)
+F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS)  ; FIX(1.501321110)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS)  ; FIX(1.847759065)
+F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS)  ; FIX(1.961570560)
+F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS)  ; FIX(2.053119869)
+F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS)  ; FIX(2.562915447)
+F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS)  ; FIX(3.072711026)
+%endif
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_fdct_islow_sse2)
+
+EXTN(jconst_fdct_islow_sse2):
+
+PW_F130_F054   times 4 dw  (F_0_541 + F_0_765),  F_0_541
+PW_F054_MF130  times 4 dw  F_0_541, (F_0_541 - F_1_847)
+PW_MF078_F117  times 4 dw  (F_1_175 - F_1_961),  F_1_175
+PW_F117_F078   times 4 dw  F_1_175, (F_1_175 - F_0_390)
+PW_MF060_MF089 times 4 dw  (F_0_298 - F_0_899), -F_0_899
+PW_MF089_F060  times 4 dw -F_0_899, (F_1_501 - F_0_899)
+PW_MF050_MF256 times 4 dw  (F_2_053 - F_2_562), -F_2_562
+PW_MF256_F050  times 4 dw -F_2_562, (F_3_072 - F_2_562)
+PD_DESCALE_P1  times 4 dd  1 << (DESCALE_P1 - 1)
+PD_DESCALE_P2  times 4 dd  1 << (DESCALE_P2 - 1)
+PW_DESCALE_P2X times 8 dw  1 << (PASS1_BITS - 1)
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        64
+;
+; Perform the forward DCT on one block of samples.
+;
+; GLOBAL(void)
+; jsimd_fdct_islow_sse2(DCTELEM *data)
+;
+
+; r10 = DCTELEM *data
+
+%define wk(i)   rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD  ; xmmword wk[WK_NUM]
+%define WK_NUM  6
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_fdct_islow_sse2)
+
+EXTN(jsimd_fdct_islow_sse2):
+    push        rbp
+    mov         rax, rsp                     ; rax = original rbp
+    sub         rsp, byte 4
+    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [rsp], rax
+    mov         rbp, rsp                     ; rbp = aligned rbp
+    lea         rsp, [wk(0)]
+    collect_args 1
+
+    ; ---- Pass 1: process rows.
+
+    mov         rdx, r10                ; (DCTELEM *)
+
+    movdqa      xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)]
+    movdqa      xmm2, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)]
+    movdqa      xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)]
+
+    ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
+    ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)
+
+    movdqa      xmm4, xmm0              ; transpose coefficients(phase 1)
+    punpcklwd   xmm0, xmm1              ; xmm0=(00 10 01 11 02 12 03 13)
+    punpckhwd   xmm4, xmm1              ; xmm4=(04 14 05 15 06 16 07 17)
+    movdqa      xmm5, xmm2              ; transpose coefficients(phase 1)
+    punpcklwd   xmm2, xmm3              ; xmm2=(20 30 21 31 22 32 23 33)
+    punpckhwd   xmm5, xmm3              ; xmm5=(24 34 25 35 26 36 27 37)
+
+    movdqa      xmm6, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)]
+    movdqa      xmm7, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)]
+    movdqa      xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)]
+
+    ; xmm6=( 4 12 20 28 36 44 52 60), xmm1=( 6 14 22 30 38 46 54 62)
+    ; xmm7=( 5 13 21 29 37 45 53 61), xmm3=( 7 15 23 31 39 47 55 63)
+
+    movdqa      XMMWORD [wk(0)], xmm2   ; wk(0)=(20 30 21 31 22 32 23 33)
+    movdqa      XMMWORD [wk(1)], xmm5   ; wk(1)=(24 34 25 35 26 36 27 37)
+
+    movdqa      xmm2, xmm6              ; transpose coefficients(phase 1)
+    punpcklwd   xmm6, xmm7              ; xmm6=(40 50 41 51 42 52 43 53)
+    punpckhwd   xmm2, xmm7              ; xmm2=(44 54 45 55 46 56 47 57)
+    movdqa      xmm5, xmm1              ; transpose coefficients(phase 1)
+    punpcklwd   xmm1, xmm3              ; xmm1=(60 70 61 71 62 72 63 73)
+    punpckhwd   xmm5, xmm3              ; xmm5=(64 74 65 75 66 76 67 77)
+
+    movdqa      xmm7, xmm6              ; transpose coefficients(phase 2)
+    punpckldq   xmm6, xmm1              ; xmm6=(40 50 60 70 41 51 61 71)
+    punpckhdq   xmm7, xmm1              ; xmm7=(42 52 62 72 43 53 63 73)
+    movdqa      xmm3, xmm2              ; transpose coefficients(phase 2)
+    punpckldq   xmm2, xmm5              ; xmm2=(44 54 64 74 45 55 65 75)
+    punpckhdq   xmm3, xmm5              ; xmm3=(46 56 66 76 47 57 67 77)
+
+    movdqa      xmm1, XMMWORD [wk(0)]   ; xmm1=(20 30 21 31 22 32 23 33)
+    movdqa      xmm5, XMMWORD [wk(1)]   ; xmm5=(24 34 25 35 26 36 27 37)
+    movdqa      XMMWORD [wk(2)], xmm7   ; wk(2)=(42 52 62 72 43 53 63 73)
+    movdqa      XMMWORD [wk(3)], xmm2   ; wk(3)=(44 54 64 74 45 55 65 75)
+
+    movdqa      xmm7, xmm0              ; transpose coefficients(phase 2)
+    punpckldq   xmm0, xmm1              ; xmm0=(00 10 20 30 01 11 21 31)
+    punpckhdq   xmm7, xmm1              ; xmm7=(02 12 22 32 03 13 23 33)
+    movdqa      xmm2, xmm4              ; transpose coefficients(phase 2)
+    punpckldq   xmm4, xmm5              ; xmm4=(04 14 24 34 05 15 25 35)
+    punpckhdq   xmm2, xmm5              ; xmm2=(06 16 26 36 07 17 27 37)
+
+    movdqa      xmm1, xmm0              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm0, xmm6              ; xmm0=(00 10 20 30 40 50 60 70)=data0
+    punpckhqdq  xmm1, xmm6              ; xmm1=(01 11 21 31 41 51 61 71)=data1
+    movdqa      xmm5, xmm2              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm2, xmm3              ; xmm2=(06 16 26 36 46 56 66 76)=data6
+    punpckhqdq  xmm5, xmm3              ; xmm5=(07 17 27 37 47 57 67 77)=data7
+
+    movdqa      xmm6, xmm1
+    movdqa      xmm3, xmm0
+    psubw       xmm1, xmm2              ; xmm1=data1-data6=tmp6
+    psubw       xmm0, xmm5              ; xmm0=data0-data7=tmp7
+    paddw       xmm6, xmm2              ; xmm6=data1+data6=tmp1
+    paddw       xmm3, xmm5              ; xmm3=data0+data7=tmp0
+
+    movdqa      xmm2, XMMWORD [wk(2)]   ; xmm2=(42 52 62 72 43 53 63 73)
+    movdqa      xmm5, XMMWORD [wk(3)]   ; xmm5=(44 54 64 74 45 55 65 75)
+    movdqa      XMMWORD [wk(0)], xmm1   ; wk(0)=tmp6
+    movdqa      XMMWORD [wk(1)], xmm0   ; wk(1)=tmp7
+
+    movdqa      xmm1, xmm7              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm7, xmm2              ; xmm7=(02 12 22 32 42 52 62 72)=data2
+    punpckhqdq  xmm1, xmm2              ; xmm1=(03 13 23 33 43 53 63 73)=data3
+    movdqa      xmm0, xmm4              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm4, xmm5              ; xmm4=(04 14 24 34 44 54 64 74)=data4
+    punpckhqdq  xmm0, xmm5              ; xmm0=(05 15 25 35 45 55 65 75)=data5
+
+    movdqa      xmm2, xmm1
+    movdqa      xmm5, xmm7
+    paddw       xmm1, xmm4              ; xmm1=data3+data4=tmp3
+    paddw       xmm7, xmm0              ; xmm7=data2+data5=tmp2
+    psubw       xmm2, xmm4              ; xmm2=data3-data4=tmp4
+    psubw       xmm5, xmm0              ; xmm5=data2-data5=tmp5
+
+    ; -- Even part
+
+    movdqa      xmm4, xmm3
+    movdqa      xmm0, xmm6
+    paddw       xmm3, xmm1              ; xmm3=tmp10
+    paddw       xmm6, xmm7              ; xmm6=tmp11
+    psubw       xmm4, xmm1              ; xmm4=tmp13
+    psubw       xmm0, xmm7              ; xmm0=tmp12
+
+    movdqa      xmm1, xmm3
+    paddw       xmm3, xmm6              ; xmm3=tmp10+tmp11
+    psubw       xmm1, xmm6              ; xmm1=tmp10-tmp11
+
+    psllw       xmm3, PASS1_BITS        ; xmm3=data0
+    psllw       xmm1, PASS1_BITS        ; xmm1=data4
+
+    movdqa      XMMWORD [wk(2)], xmm3   ; wk(2)=data0
+    movdqa      XMMWORD [wk(3)], xmm1   ; wk(3)=data4
+
+    ; (Original)
+    ; z1 = (tmp12 + tmp13) * 0.541196100;
+    ; data2 = z1 + tmp13 * 0.765366865;
+    ; data6 = z1 + tmp12 * -1.847759065;
+    ;
+    ; (This implementation)
+    ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
+    ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
+
+    movdqa      xmm7, xmm4              ; xmm4=tmp13
+    movdqa      xmm6, xmm4
+    punpcklwd   xmm7, xmm0              ; xmm0=tmp12
+    punpckhwd   xmm6, xmm0
+    movdqa      xmm4, xmm7
+    movdqa      xmm0, xmm6
+    pmaddwd     xmm7, [rel PW_F130_F054]   ; xmm7=data2L
+    pmaddwd     xmm6, [rel PW_F130_F054]   ; xmm6=data2H
+    pmaddwd     xmm4, [rel PW_F054_MF130]  ; xmm4=data6L
+    pmaddwd     xmm0, [rel PW_F054_MF130]  ; xmm0=data6H
+
+    paddd       xmm7, [rel PD_DESCALE_P1]
+    paddd       xmm6, [rel PD_DESCALE_P1]
+    psrad       xmm7, DESCALE_P1
+    psrad       xmm6, DESCALE_P1
+    paddd       xmm4, [rel PD_DESCALE_P1]
+    paddd       xmm0, [rel PD_DESCALE_P1]
+    psrad       xmm4, DESCALE_P1
+    psrad       xmm0, DESCALE_P1
+
+    packssdw    xmm7, xmm6              ; xmm7=data2
+    packssdw    xmm4, xmm0              ; xmm4=data6
+
+    movdqa      XMMWORD [wk(4)], xmm7   ; wk(4)=data2
+    movdqa      XMMWORD [wk(5)], xmm4   ; wk(5)=data6
+
+    ; -- Odd part
+
+    movdqa      xmm3, XMMWORD [wk(0)]   ; xmm3=tmp6
+    movdqa      xmm1, XMMWORD [wk(1)]   ; xmm1=tmp7
+
+    movdqa      xmm6, xmm2              ; xmm2=tmp4
+    movdqa      xmm0, xmm5              ; xmm5=tmp5
+    paddw       xmm6, xmm3              ; xmm6=z3
+    paddw       xmm0, xmm1              ; xmm0=z4
+
+    ; (Original)
+    ; z5 = (z3 + z4) * 1.175875602;
+    ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+    ; z3 += z5;  z4 += z5;
+    ;
+    ; (This implementation)
+    ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+    ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+    movdqa      xmm7, xmm6
+    movdqa      xmm4, xmm6
+    punpcklwd   xmm7, xmm0
+    punpckhwd   xmm4, xmm0
+    movdqa      xmm6, xmm7
+    movdqa      xmm0, xmm4
+    pmaddwd     xmm7, [rel PW_MF078_F117]  ; xmm7=z3L
+    pmaddwd     xmm4, [rel PW_MF078_F117]  ; xmm4=z3H
+    pmaddwd     xmm6, [rel PW_F117_F078]   ; xmm6=z4L
+    pmaddwd     xmm0, [rel PW_F117_F078]   ; xmm0=z4H
+
+    movdqa      XMMWORD [wk(0)], xmm7   ; wk(0)=z3L
+    movdqa      XMMWORD [wk(1)], xmm4   ; wk(1)=z3H
+
+    ; (Original)
+    ; z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6;
+    ; tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869;
+    ; tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110;
+    ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+    ; data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4;
+    ; data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4;
+    ;
+    ; (This implementation)
+    ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
+    ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
+    ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
+    ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
+    ; data7 = tmp4 + z3;  data5 = tmp5 + z4;
+    ; data3 = tmp6 + z3;  data1 = tmp7 + z4;
+
+    movdqa      xmm7, xmm2
+    movdqa      xmm4, xmm2
+    punpcklwd   xmm7, xmm1
+    punpckhwd   xmm4, xmm1
+    movdqa      xmm2, xmm7
+    movdqa      xmm1, xmm4
+    pmaddwd     xmm7, [rel PW_MF060_MF089]  ; xmm7=tmp4L
+    pmaddwd     xmm4, [rel PW_MF060_MF089]  ; xmm4=tmp4H
+    pmaddwd     xmm2, [rel PW_MF089_F060]   ; xmm2=tmp7L
+    pmaddwd     xmm1, [rel PW_MF089_F060]   ; xmm1=tmp7H
+
+    paddd       xmm7, XMMWORD [wk(0)]   ; xmm7=data7L
+    paddd       xmm4, XMMWORD [wk(1)]   ; xmm4=data7H
+    paddd       xmm2, xmm6              ; xmm2=data1L
+    paddd       xmm1, xmm0              ; xmm1=data1H
+
+    paddd       xmm7, [rel PD_DESCALE_P1]
+    paddd       xmm4, [rel PD_DESCALE_P1]
+    psrad       xmm7, DESCALE_P1
+    psrad       xmm4, DESCALE_P1
+    paddd       xmm2, [rel PD_DESCALE_P1]
+    paddd       xmm1, [rel PD_DESCALE_P1]
+    psrad       xmm2, DESCALE_P1
+    psrad       xmm1, DESCALE_P1
+
+    packssdw    xmm7, xmm4              ; xmm7=data7
+    packssdw    xmm2, xmm1              ; xmm2=data1
+
+    movdqa      xmm4, xmm5
+    movdqa      xmm1, xmm5
+    punpcklwd   xmm4, xmm3
+    punpckhwd   xmm1, xmm3
+    movdqa      xmm5, xmm4
+    movdqa      xmm3, xmm1
+    pmaddwd     xmm4, [rel PW_MF050_MF256]  ; xmm4=tmp5L
+    pmaddwd     xmm1, [rel PW_MF050_MF256]  ; xmm1=tmp5H
+    pmaddwd     xmm5, [rel PW_MF256_F050]   ; xmm5=tmp6L
+    pmaddwd     xmm3, [rel PW_MF256_F050]   ; xmm3=tmp6H
+
+    paddd       xmm4, xmm6              ; xmm4=data5L
+    paddd       xmm1, xmm0              ; xmm1=data5H
+    paddd       xmm5, XMMWORD [wk(0)]   ; xmm5=data3L
+    paddd       xmm3, XMMWORD [wk(1)]   ; xmm3=data3H
+
+    paddd       xmm4, [rel PD_DESCALE_P1]
+    paddd       xmm1, [rel PD_DESCALE_P1]
+    psrad       xmm4, DESCALE_P1
+    psrad       xmm1, DESCALE_P1
+    paddd       xmm5, [rel PD_DESCALE_P1]
+    paddd       xmm3, [rel PD_DESCALE_P1]
+    psrad       xmm5, DESCALE_P1
+    psrad       xmm3, DESCALE_P1
+
+    packssdw    xmm4, xmm1              ; xmm4=data5
+    packssdw    xmm5, xmm3              ; xmm5=data3
+
+    ; ---- Pass 2: process columns.
+
+    movdqa      xmm6, XMMWORD [wk(2)]   ; xmm6=col0
+    movdqa      xmm0, XMMWORD [wk(4)]   ; xmm0=col2
+
+    ; xmm6=(00 10 20 30 40 50 60 70), xmm0=(02 12 22 32 42 52 62 72)
+    ; xmm2=(01 11 21 31 41 51 61 71), xmm5=(03 13 23 33 43 53 63 73)
+
+    movdqa      xmm1, xmm6              ; transpose coefficients(phase 1)
+    punpcklwd   xmm6, xmm2              ; xmm6=(00 01 10 11 20 21 30 31)
+    punpckhwd   xmm1, xmm2              ; xmm1=(40 41 50 51 60 61 70 71)
+    movdqa      xmm3, xmm0              ; transpose coefficients(phase 1)
+    punpcklwd   xmm0, xmm5              ; xmm0=(02 03 12 13 22 23 32 33)
+    punpckhwd   xmm3, xmm5              ; xmm3=(42 43 52 53 62 63 72 73)
+
+    movdqa      xmm2, XMMWORD [wk(3)]   ; xmm2=col4
+    movdqa      xmm5, XMMWORD [wk(5)]   ; xmm5=col6
+
+    ; xmm2=(04 14 24 34 44 54 64 74), xmm5=(06 16 26 36 46 56 66 76)
+    ; xmm4=(05 15 25 35 45 55 65 75), xmm7=(07 17 27 37 47 57 67 77)
+
+    movdqa      XMMWORD [wk(0)], xmm0   ; wk(0)=(02 03 12 13 22 23 32 33)
+    movdqa      XMMWORD [wk(1)], xmm3   ; wk(1)=(42 43 52 53 62 63 72 73)
+
+    movdqa      xmm0, xmm2              ; transpose coefficients(phase 1)
+    punpcklwd   xmm2, xmm4              ; xmm2=(04 05 14 15 24 25 34 35)
+    punpckhwd   xmm0, xmm4              ; xmm0=(44 45 54 55 64 65 74 75)
+    movdqa      xmm3, xmm5              ; transpose coefficients(phase 1)
+    punpcklwd   xmm5, xmm7              ; xmm5=(06 07 16 17 26 27 36 37)
+    punpckhwd   xmm3, xmm7              ; xmm3=(46 47 56 57 66 67 76 77)
+
+    movdqa      xmm4, xmm2              ; transpose coefficients(phase 2)
+    punpckldq   xmm2, xmm5              ; xmm2=(04 05 06 07 14 15 16 17)
+    punpckhdq   xmm4, xmm5              ; xmm4=(24 25 26 27 34 35 36 37)
+    movdqa      xmm7, xmm0              ; transpose coefficients(phase 2)
+    punpckldq   xmm0, xmm3              ; xmm0=(44 45 46 47 54 55 56 57)
+    punpckhdq   xmm7, xmm3              ; xmm7=(64 65 66 67 74 75 76 77)
+
+    movdqa      xmm5, XMMWORD [wk(0)]   ; xmm5=(02 03 12 13 22 23 32 33)
+    movdqa      xmm3, XMMWORD [wk(1)]   ; xmm3=(42 43 52 53 62 63 72 73)
+    movdqa      XMMWORD [wk(2)], xmm4   ; wk(2)=(24 25 26 27 34 35 36 37)
+    movdqa      XMMWORD [wk(3)], xmm0   ; wk(3)=(44 45 46 47 54 55 56 57)
+
+    movdqa      xmm4, xmm6              ; transpose coefficients(phase 2)
+    punpckldq   xmm6, xmm5              ; xmm6=(00 01 02 03 10 11 12 13)
+    punpckhdq   xmm4, xmm5              ; xmm4=(20 21 22 23 30 31 32 33)
+    movdqa      xmm0, xmm1              ; transpose coefficients(phase 2)
+    punpckldq   xmm1, xmm3              ; xmm1=(40 41 42 43 50 51 52 53)
+    punpckhdq   xmm0, xmm3              ; xmm0=(60 61 62 63 70 71 72 73)
+
+    movdqa      xmm5, xmm6              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm6, xmm2              ; xmm6=(00 01 02 03 04 05 06 07)=data0
+    punpckhqdq  xmm5, xmm2              ; xmm5=(10 11 12 13 14 15 16 17)=data1
+    movdqa      xmm3, xmm0              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm0, xmm7              ; xmm0=(60 61 62 63 64 65 66 67)=data6
+    punpckhqdq  xmm3, xmm7              ; xmm3=(70 71 72 73 74 75 76 77)=data7
+
+    movdqa      xmm2, xmm5
+    movdqa      xmm7, xmm6
+    psubw       xmm5, xmm0              ; xmm5=data1-data6=tmp6
+    psubw       xmm6, xmm3              ; xmm6=data0-data7=tmp7
+    paddw       xmm2, xmm0              ; xmm2=data1+data6=tmp1
+    paddw       xmm7, xmm3              ; xmm7=data0+data7=tmp0
+
+    movdqa      xmm0, XMMWORD [wk(2)]   ; xmm0=(24 25 26 27 34 35 36 37)
+    movdqa      xmm3, XMMWORD [wk(3)]   ; xmm3=(44 45 46 47 54 55 56 57)
+    movdqa      XMMWORD [wk(0)], xmm5   ; wk(0)=tmp6
+    movdqa      XMMWORD [wk(1)], xmm6   ; wk(1)=tmp7
+
+    movdqa      xmm5, xmm4              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm4, xmm0              ; xmm4=(20 21 22 23 24 25 26 27)=data2
+    punpckhqdq  xmm5, xmm0              ; xmm5=(30 31 32 33 34 35 36 37)=data3
+    movdqa      xmm6, xmm1              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm1, xmm3              ; xmm1=(40 41 42 43 44 45 46 47)=data4
+    punpckhqdq  xmm6, xmm3              ; xmm6=(50 51 52 53 54 55 56 57)=data5
+
+    movdqa      xmm0, xmm5
+    movdqa      xmm3, xmm4
+    paddw       xmm5, xmm1              ; xmm5=data3+data4=tmp3
+    paddw       xmm4, xmm6              ; xmm4=data2+data5=tmp2
+    psubw       xmm0, xmm1              ; xmm0=data3-data4=tmp4
+    psubw       xmm3, xmm6              ; xmm3=data2-data5=tmp5
+
+    ; -- Even part
+
+    movdqa      xmm1, xmm7
+    movdqa      xmm6, xmm2
+    paddw       xmm7, xmm5              ; xmm7=tmp10
+    paddw       xmm2, xmm4              ; xmm2=tmp11
+    psubw       xmm1, xmm5              ; xmm1=tmp13
+    psubw       xmm6, xmm4              ; xmm6=tmp12
+
+    movdqa      xmm5, xmm7
+    paddw       xmm7, xmm2              ; xmm7=tmp10+tmp11
+    psubw       xmm5, xmm2              ; xmm5=tmp10-tmp11
+
+    paddw       xmm7, [rel PW_DESCALE_P2X]
+    paddw       xmm5, [rel PW_DESCALE_P2X]
+    psraw       xmm7, PASS1_BITS        ; xmm7=data0
+    psraw       xmm5, PASS1_BITS        ; xmm5=data4
+
+    movdqa      XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)], xmm7
+    movdqa      XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)], xmm5
+
+    ; (Original)
+    ; z1 = (tmp12 + tmp13) * 0.541196100;
+    ; data2 = z1 + tmp13 * 0.765366865;
+    ; data6 = z1 + tmp12 * -1.847759065;
+    ;
+    ; (This implementation)
+    ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
+    ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
+
+    movdqa      xmm4, xmm1              ; xmm1=tmp13
+    movdqa      xmm2, xmm1
+    punpcklwd   xmm4, xmm6              ; xmm6=tmp12
+    punpckhwd   xmm2, xmm6
+    movdqa      xmm1, xmm4
+    movdqa      xmm6, xmm2
+    pmaddwd     xmm4, [rel PW_F130_F054]   ; xmm4=data2L
+    pmaddwd     xmm2, [rel PW_F130_F054]   ; xmm2=data2H
+    pmaddwd     xmm1, [rel PW_F054_MF130]  ; xmm1=data6L
+    pmaddwd     xmm6, [rel PW_F054_MF130]  ; xmm6=data6H
+
+    paddd       xmm4, [rel PD_DESCALE_P2]
+    paddd       xmm2, [rel PD_DESCALE_P2]
+    psrad       xmm4, DESCALE_P2
+    psrad       xmm2, DESCALE_P2
+    paddd       xmm1, [rel PD_DESCALE_P2]
+    paddd       xmm6, [rel PD_DESCALE_P2]
+    psrad       xmm1, DESCALE_P2
+    psrad       xmm6, DESCALE_P2
+
+    packssdw    xmm4, xmm2              ; xmm4=data2
+    packssdw    xmm1, xmm6              ; xmm1=data6
+
+    movdqa      XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)], xmm4
+    movdqa      XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)], xmm1
+
+    ; -- Odd part
+
+    movdqa      xmm7, XMMWORD [wk(0)]   ; xmm7=tmp6
+    movdqa      xmm5, XMMWORD [wk(1)]   ; xmm5=tmp7
+
+    movdqa      xmm2, xmm0              ; xmm0=tmp4
+    movdqa      xmm6, xmm3              ; xmm3=tmp5
+    paddw       xmm2, xmm7              ; xmm2=z3
+    paddw       xmm6, xmm5              ; xmm6=z4
+
+    ; (Original)
+    ; z5 = (z3 + z4) * 1.175875602;
+    ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+    ; z3 += z5;  z4 += z5;
+    ;
+    ; (This implementation)
+    ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+    ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+    movdqa      xmm4, xmm2
+    movdqa      xmm1, xmm2
+    punpcklwd   xmm4, xmm6
+    punpckhwd   xmm1, xmm6
+    movdqa      xmm2, xmm4
+    movdqa      xmm6, xmm1
+    pmaddwd     xmm4, [rel PW_MF078_F117]  ; xmm4=z3L
+    pmaddwd     xmm1, [rel PW_MF078_F117]  ; xmm1=z3H
+    pmaddwd     xmm2, [rel PW_F117_F078]   ; xmm2=z4L
+    pmaddwd     xmm6, [rel PW_F117_F078]   ; xmm6=z4H
+
+    movdqa      XMMWORD [wk(0)], xmm4   ; wk(0)=z3L
+    movdqa      XMMWORD [wk(1)], xmm1   ; wk(1)=z3H
+
+    ; (Original)
+    ; z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6;
+    ; tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869;
+    ; tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110;
+    ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+    ; data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4;
+    ; data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4;
+    ;
+    ; (This implementation)
+    ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
+    ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
+    ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
+    ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
+    ; data7 = tmp4 + z3;  data5 = tmp5 + z4;
+    ; data3 = tmp6 + z3;  data1 = tmp7 + z4;
+
+    movdqa      xmm4, xmm0
+    movdqa      xmm1, xmm0
+    punpcklwd   xmm4, xmm5
+    punpckhwd   xmm1, xmm5
+    movdqa      xmm0, xmm4
+    movdqa      xmm5, xmm1
+    pmaddwd     xmm4, [rel PW_MF060_MF089]  ; xmm4=tmp4L
+    pmaddwd     xmm1, [rel PW_MF060_MF089]  ; xmm1=tmp4H
+    pmaddwd     xmm0, [rel PW_MF089_F060]   ; xmm0=tmp7L
+    pmaddwd     xmm5, [rel PW_MF089_F060]   ; xmm5=tmp7H
+
+    paddd       xmm4,  XMMWORD [wk(0)]  ; xmm4=data7L
+    paddd       xmm1,  XMMWORD [wk(1)]  ; xmm1=data7H
+    paddd       xmm0, xmm2              ; xmm0=data1L
+    paddd       xmm5, xmm6              ; xmm5=data1H
+
+    paddd       xmm4, [rel PD_DESCALE_P2]
+    paddd       xmm1, [rel PD_DESCALE_P2]
+    psrad       xmm4, DESCALE_P2
+    psrad       xmm1, DESCALE_P2
+    paddd       xmm0, [rel PD_DESCALE_P2]
+    paddd       xmm5, [rel PD_DESCALE_P2]
+    psrad       xmm0, DESCALE_P2
+    psrad       xmm5, DESCALE_P2
+
+    packssdw    xmm4, xmm1              ; xmm4=data7
+    packssdw    xmm0, xmm5              ; xmm0=data1
+
+    movdqa      XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)], xmm4
+    movdqa      XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)], xmm0
+
+    movdqa      xmm1, xmm3
+    movdqa      xmm5, xmm3
+    punpcklwd   xmm1, xmm7
+    punpckhwd   xmm5, xmm7
+    movdqa      xmm3, xmm1
+    movdqa      xmm7, xmm5
+    pmaddwd     xmm1, [rel PW_MF050_MF256]  ; xmm1=tmp5L
+    pmaddwd     xmm5, [rel PW_MF050_MF256]  ; xmm5=tmp5H
+    pmaddwd     xmm3, [rel PW_MF256_F050]   ; xmm3=tmp6L
+    pmaddwd     xmm7, [rel PW_MF256_F050]   ; xmm7=tmp6H
+
+    paddd       xmm1, xmm2              ; xmm1=data5L
+    paddd       xmm5, xmm6              ; xmm5=data5H
+    paddd       xmm3, XMMWORD [wk(0)]   ; xmm3=data3L
+    paddd       xmm7, XMMWORD [wk(1)]   ; xmm7=data3H
+
+    paddd       xmm1, [rel PD_DESCALE_P2]
+    paddd       xmm5, [rel PD_DESCALE_P2]
+    psrad       xmm1, DESCALE_P2
+    psrad       xmm5, DESCALE_P2
+    paddd       xmm3, [rel PD_DESCALE_P2]
+    paddd       xmm7, [rel PD_DESCALE_P2]
+    psrad       xmm3, DESCALE_P2
+    psrad       xmm7, DESCALE_P2
+
+    packssdw    xmm1, xmm5              ; xmm1=data5
+    packssdw    xmm3, xmm7              ; xmm3=data3
+
+    movdqa      XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)], xmm1
+    movdqa      XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)], xmm3
+
+    uncollect_args 1
+    mov         rsp, rbp                ; rsp <- aligned rbp
+    pop         rsp                     ; rsp <- original rbp
+    pop         rbp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/x86_64/jidctflt-sse2.asm b/media/libjpeg/simd/x86_64/jidctflt-sse2.asm
new file mode 100644
index 0000000000..60bf961896
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jidctflt-sse2.asm
@@ -0,0 +1,482 @@
+;
+; jidctflt.asm - floating-point IDCT (64-bit SSE & SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a floating-point implementation of the inverse DCT
+; (Discrete Cosine Transform). The following code is based directly on
+; the IJG's original jidctflt.c; see the jidctflt.c for more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%macro unpcklps2 2  ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
+    shufps      %1, %2, 0x44
+%endmacro
+
+%macro unpckhps2 2  ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
+    shufps      %1, %2, 0xEE
+%endmacro
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_idct_float_sse2)
+
+EXTN(jconst_idct_float_sse2):
+
+PD_1_414        times 4  dd  1.414213562373095048801689
+PD_1_847        times 4  dd  1.847759065022573512256366
+PD_1_082        times 4  dd  1.082392200292393968799446
+PD_M2_613       times 4  dd -2.613125929752753055713286
+PD_RNDINT_MAGIC times 4  dd  100663296.0  ; (float)(0x00C00000 << 3)
+PB_CENTERJSAMP  times 16 db  CENTERJSAMPLE
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        64
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_float_sse2(void *dct_table, JCOEFPTR coef_block,
+;                       JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+; r10 = void *dct_table
+; r11 = JCOEFPTR coef_block
+; r12 = JSAMPARRAY output_buf
+; r13d = JDIMENSION output_col
+
+%define original_rbp  rbp + 0
+%define wk(i)         rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+                                        ; xmmword wk[WK_NUM]
+%define WK_NUM        2
+%define workspace     wk(0) - DCTSIZE2 * SIZEOF_FAST_FLOAT
+                                        ; FAST_FLOAT workspace[DCTSIZE2]
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_idct_float_sse2)
+
+EXTN(jsimd_idct_float_sse2):
+    push        rbp
+    mov         rax, rsp                     ; rax = original rbp
+    sub         rsp, byte 4
+    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [rsp], rax
+    mov         rbp, rsp                     ; rbp = aligned rbp
+    lea         rsp, [workspace]
+    collect_args 4
+    push        rbx
+
+    ; ---- Pass 1: process columns from input, store into work array.
+
+    mov         rdx, r10                ; quantptr
+    mov         rsi, r11                ; inptr
+    lea         rdi, [workspace]        ; FAST_FLOAT *wsptr
+    mov         rcx, DCTSIZE/4          ; ctr
+.columnloop:
+%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
+    mov         eax, dword [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+    or          eax, dword [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+    jnz         near .columnDCT
+
+    movq        xmm1, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+    movq        xmm2, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+    movq        xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+    movq        xmm4, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
+    movq        xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+    movq        xmm6, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
+    movq        xmm7, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+    por         xmm1, xmm2
+    por         xmm3, xmm4
+    por         xmm5, xmm6
+    por         xmm1, xmm3
+    por         xmm5, xmm7
+    por         xmm1, xmm5
+    packsswb    xmm1, xmm1
+    movd        eax, xmm1
+    test        rax, rax
+    jnz         short .columnDCT
+
+    ; -- AC terms all zero
+
+    movq        xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+
+    punpcklwd   xmm0, xmm0                  ; xmm0=(00 00 01 01 02 02 03 03)
+    psrad       xmm0, (DWORD_BIT-WORD_BIT)  ; xmm0=in0=(00 01 02 03)
+    cvtdq2ps    xmm0, xmm0                  ; xmm0=in0=(00 01 02 03)
+
+    mulps       xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
+
+    movaps      xmm1, xmm0
+    movaps      xmm2, xmm0
+    movaps      xmm3, xmm0
+
+    shufps      xmm0, xmm0, 0x00        ; xmm0=(00 00 00 00)
+    shufps      xmm1, xmm1, 0x55        ; xmm1=(01 01 01 01)
+    shufps      xmm2, xmm2, 0xAA        ; xmm2=(02 02 02 02)
+    shufps      xmm3, xmm3, 0xFF        ; xmm3=(03 03 03 03)
+
+    movaps      XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm0
+    movaps      XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0
+    movaps      XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm1
+    movaps      XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm1
+    movaps      XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm2
+    movaps      XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm2
+    movaps      XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
+    movaps      XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3
+    jmp         near .nextcolumn
+%endif
+.columnDCT:
+
+    ; -- Even part
+
+    movq        xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+    movq        xmm1, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+    movq        xmm2, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
+    movq        xmm3, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
+
+    punpcklwd   xmm0, xmm0                  ; xmm0=(00 00 01 01 02 02 03 03)
+    punpcklwd   xmm1, xmm1                  ; xmm1=(20 20 21 21 22 22 23 23)
+    psrad       xmm0, (DWORD_BIT-WORD_BIT)  ; xmm0=in0=(00 01 02 03)
+    psrad       xmm1, (DWORD_BIT-WORD_BIT)  ; xmm1=in2=(20 21 22 23)
+    cvtdq2ps    xmm0, xmm0                  ; xmm0=in0=(00 01 02 03)
+    cvtdq2ps    xmm1, xmm1                  ; xmm1=in2=(20 21 22 23)
+
+    punpcklwd   xmm2, xmm2                  ; xmm2=(40 40 41 41 42 42 43 43)
+    punpcklwd   xmm3, xmm3                  ; xmm3=(60 60 61 61 62 62 63 63)
+    psrad       xmm2, (DWORD_BIT-WORD_BIT)  ; xmm2=in4=(40 41 42 43)
+    psrad       xmm3, (DWORD_BIT-WORD_BIT)  ; xmm3=in6=(60 61 62 63)
+    cvtdq2ps    xmm2, xmm2                  ; xmm2=in4=(40 41 42 43)
+    cvtdq2ps    xmm3, xmm3                  ; xmm3=in6=(60 61 62 63)
+
+    mulps       xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
+    mulps       xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
+    mulps       xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
+    mulps       xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
+
+    movaps      xmm4, xmm0
+    movaps      xmm5, xmm1
+    subps       xmm0, xmm2              ; xmm0=tmp11
+    subps       xmm1, xmm3
+    addps       xmm4, xmm2              ; xmm4=tmp10
+    addps       xmm5, xmm3              ; xmm5=tmp13
+
+    mulps       xmm1, [rel PD_1_414]
+    subps       xmm1, xmm5              ; xmm1=tmp12
+
+    movaps      xmm6, xmm4
+    movaps      xmm7, xmm0
+    subps       xmm4, xmm5              ; xmm4=tmp3
+    subps       xmm0, xmm1              ; xmm0=tmp2
+    addps       xmm6, xmm5              ; xmm6=tmp0
+    addps       xmm7, xmm1              ; xmm7=tmp1
+
+    movaps      XMMWORD [wk(1)], xmm4   ; tmp3
+    movaps      XMMWORD [wk(0)], xmm0   ; tmp2
+
+    ; -- Odd part
+
+    movq        xmm2, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+    movq        xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+    movq        xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+    movq        xmm1, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+
+    punpcklwd   xmm2, xmm2                  ; xmm2=(10 10 11 11 12 12 13 13)
+    punpcklwd   xmm3, xmm3                  ; xmm3=(30 30 31 31 32 32 33 33)
+    psrad       xmm2, (DWORD_BIT-WORD_BIT)  ; xmm2=in1=(10 11 12 13)
+    psrad       xmm3, (DWORD_BIT-WORD_BIT)  ; xmm3=in3=(30 31 32 33)
+    cvtdq2ps    xmm2, xmm2                  ; xmm2=in1=(10 11 12 13)
+    cvtdq2ps    xmm3, xmm3                  ; xmm3=in3=(30 31 32 33)
+
+    punpcklwd   xmm5, xmm5                  ; xmm5=(50 50 51 51 52 52 53 53)
+    punpcklwd   xmm1, xmm1                  ; xmm1=(70 70 71 71 72 72 73 73)
+    psrad       xmm5, (DWORD_BIT-WORD_BIT)  ; xmm5=in5=(50 51 52 53)
+    psrad       xmm1, (DWORD_BIT-WORD_BIT)  ; xmm1=in7=(70 71 72 73)
+    cvtdq2ps    xmm5, xmm5                  ; xmm5=in5=(50 51 52 53)
+    cvtdq2ps    xmm1, xmm1                  ; xmm1=in7=(70 71 72 73)
+
+    mulps       xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
+    mulps       xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
+    mulps       xmm5, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
+    mulps       xmm1, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
+
+    movaps      xmm4, xmm2
+    movaps      xmm0, xmm5
+    addps       xmm2, xmm1              ; xmm2=z11
+    addps       xmm5, xmm3              ; xmm5=z13
+    subps       xmm4, xmm1              ; xmm4=z12
+    subps       xmm0, xmm3              ; xmm0=z10
+
+    movaps      xmm1, xmm2
+    subps       xmm2, xmm5
+    addps       xmm1, xmm5              ; xmm1=tmp7
+
+    mulps       xmm2, [rel PD_1_414]    ; xmm2=tmp11
+
+    movaps      xmm3, xmm0
+    addps       xmm0, xmm4
+    mulps       xmm0, [rel PD_1_847]    ; xmm0=z5
+    mulps       xmm3, [rel PD_M2_613]   ; xmm3=(z10 * -2.613125930)
+    mulps       xmm4, [rel PD_1_082]    ; xmm4=(z12 * 1.082392200)
+    addps       xmm3, xmm0              ; xmm3=tmp12
+    subps       xmm4, xmm0              ; xmm4=tmp10
+
+    ; -- Final output stage
+
+    subps       xmm3, xmm1              ; xmm3=tmp6
+    movaps      xmm5, xmm6
+    movaps      xmm0, xmm7
+    addps       xmm6, xmm1              ; xmm6=data0=(00 01 02 03)
+    addps       xmm7, xmm3              ; xmm7=data1=(10 11 12 13)
+    subps       xmm5, xmm1              ; xmm5=data7=(70 71 72 73)
+    subps       xmm0, xmm3              ; xmm0=data6=(60 61 62 63)
+    subps       xmm2, xmm3              ; xmm2=tmp5
+
+    movaps      xmm1, xmm6              ; transpose coefficients(phase 1)
+    unpcklps    xmm6, xmm7              ; xmm6=(00 10 01 11)
+    unpckhps    xmm1, xmm7              ; xmm1=(02 12 03 13)
+    movaps      xmm3, xmm0              ; transpose coefficients(phase 1)
+    unpcklps    xmm0, xmm5              ; xmm0=(60 70 61 71)
+    unpckhps    xmm3, xmm5              ; xmm3=(62 72 63 73)
+
+    movaps      xmm7, XMMWORD [wk(0)]   ; xmm7=tmp2
+    movaps      xmm5, XMMWORD [wk(1)]   ; xmm5=tmp3
+
+    movaps      XMMWORD [wk(0)], xmm0   ; wk(0)=(60 70 61 71)
+    movaps      XMMWORD [wk(1)], xmm3   ; wk(1)=(62 72 63 73)
+
+    addps       xmm4, xmm2              ; xmm4=tmp4
+    movaps      xmm0, xmm7
+    movaps      xmm3, xmm5
+    addps       xmm7, xmm2              ; xmm7=data2=(20 21 22 23)
+    addps       xmm5, xmm4              ; xmm5=data4=(40 41 42 43)
+    subps       xmm0, xmm2              ; xmm0=data5=(50 51 52 53)
+    subps       xmm3, xmm4              ; xmm3=data3=(30 31 32 33)
+
+    movaps      xmm2, xmm7              ; transpose coefficients(phase 1)
+    unpcklps    xmm7, xmm3              ; xmm7=(20 30 21 31)
+    unpckhps    xmm2, xmm3              ; xmm2=(22 32 23 33)
+    movaps      xmm4, xmm5              ; transpose coefficients(phase 1)
+    unpcklps    xmm5, xmm0              ; xmm5=(40 50 41 51)
+    unpckhps    xmm4, xmm0              ; xmm4=(42 52 43 53)
+
+    movaps      xmm3, xmm6              ; transpose coefficients(phase 2)
+    unpcklps2   xmm6, xmm7              ; xmm6=(00 10 20 30)
+    unpckhps2   xmm3, xmm7              ; xmm3=(01 11 21 31)
+    movaps      xmm0, xmm1              ; transpose coefficients(phase 2)
+    unpcklps2   xmm1, xmm2              ; xmm1=(02 12 22 32)
+    unpckhps2   xmm0, xmm2              ; xmm0=(03 13 23 33)
+
+    movaps      xmm7, XMMWORD [wk(0)]   ; xmm7=(60 70 61 71)
+    movaps      xmm2, XMMWORD [wk(1)]   ; xmm2=(62 72 63 73)
+
+    movaps      XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm6
+    movaps      XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
+    movaps      XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm1
+    movaps      XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm0
+
+    movaps      xmm6, xmm5              ; transpose coefficients(phase 2)
+    unpcklps2   xmm5, xmm7              ; xmm5=(40 50 60 70)
+    unpckhps2   xmm6, xmm7              ; xmm6=(41 51 61 71)
+    movaps      xmm3, xmm4              ; transpose coefficients(phase 2)
+    unpcklps2   xmm4, xmm2              ; xmm4=(42 52 62 72)
+    unpckhps2   xmm3, xmm2              ; xmm3=(43 53 63 73)
+
+    movaps      XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm5
+    movaps      XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm6
+    movaps      XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm4
+    movaps      XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3
+
+.nextcolumn:
+    add         rsi, byte 4*SIZEOF_JCOEF               ; coef_block
+    add         rdx, byte 4*SIZEOF_FLOAT_MULT_TYPE     ; quantptr
+    add         rdi,      4*DCTSIZE*SIZEOF_FAST_FLOAT  ; wsptr
+    dec         rcx                                    ; ctr
+    jnz         near .columnloop
+
+    ; -- Prefetch the next coefficient block
+
+    prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
+    prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
+    prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
+    prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
+
+    ; ---- Pass 2: process rows from work array, store into output array.
+
+    mov         rax, [original_rbp]
+    lea         rsi, [workspace]        ; FAST_FLOAT *wsptr
+    mov         rdi, r12                ; (JSAMPROW *)
+    mov         eax, r13d
+    mov         rcx, DCTSIZE/4          ; ctr
+.rowloop:
+
+    ; -- Even part
+
+    movaps      xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_FAST_FLOAT)]
+    movaps      xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_FAST_FLOAT)]
+    movaps      xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_FAST_FLOAT)]
+    movaps      xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_FAST_FLOAT)]
+
+    movaps      xmm4, xmm0
+    movaps      xmm5, xmm1
+    subps       xmm0, xmm2              ; xmm0=tmp11
+    subps       xmm1, xmm3
+    addps       xmm4, xmm2              ; xmm4=tmp10
+    addps       xmm5, xmm3              ; xmm5=tmp13
+
+    mulps       xmm1, [rel PD_1_414]
+    subps       xmm1, xmm5              ; xmm1=tmp12
+
+    movaps      xmm6, xmm4
+    movaps      xmm7, xmm0
+    subps       xmm4, xmm5              ; xmm4=tmp3
+    subps       xmm0, xmm1              ; xmm0=tmp2
+    addps       xmm6, xmm5              ; xmm6=tmp0
+    addps       xmm7, xmm1              ; xmm7=tmp1
+
+    movaps      XMMWORD [wk(1)], xmm4   ; tmp3
+    movaps      XMMWORD [wk(0)], xmm0   ; tmp2
+
+    ; -- Odd part
+
+    movaps      xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_FAST_FLOAT)]
+    movaps      xmm3, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_FAST_FLOAT)]
+    movaps      xmm5, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_FAST_FLOAT)]
+    movaps      xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_FAST_FLOAT)]
+
+    movaps      xmm4, xmm2
+    movaps      xmm0, xmm5
+    addps       xmm2, xmm1              ; xmm2=z11
+    addps       xmm5, xmm3              ; xmm5=z13
+    subps       xmm4, xmm1              ; xmm4=z12
+    subps       xmm0, xmm3              ; xmm0=z10
+
+    movaps      xmm1, xmm2
+    subps       xmm2, xmm5
+    addps       xmm1, xmm5              ; xmm1=tmp7
+
+    mulps       xmm2, [rel PD_1_414]    ; xmm2=tmp11
+
+    movaps      xmm3, xmm0
+    addps       xmm0, xmm4
+    mulps       xmm0, [rel PD_1_847]    ; xmm0=z5
+    mulps       xmm3, [rel PD_M2_613]   ; xmm3=(z10 * -2.613125930)
+    mulps       xmm4, [rel PD_1_082]    ; xmm4=(z12 * 1.082392200)
+    addps       xmm3, xmm0              ; xmm3=tmp12
+    subps       xmm4, xmm0              ; xmm4=tmp10
+
+    ; -- Final output stage
+
+    subps       xmm3, xmm1              ; xmm3=tmp6
+    movaps      xmm5, xmm6
+    movaps      xmm0, xmm7
+    addps       xmm6, xmm1              ; xmm6=data0=(00 10 20 30)
+    addps       xmm7, xmm3              ; xmm7=data1=(01 11 21 31)
+    subps       xmm5, xmm1              ; xmm5=data7=(07 17 27 37)
+    subps       xmm0, xmm3              ; xmm0=data6=(06 16 26 36)
+    subps       xmm2, xmm3              ; xmm2=tmp5
+
+    movaps      xmm1, [rel PD_RNDINT_MAGIC]  ; xmm1=[rel PD_RNDINT_MAGIC]
+    pcmpeqd     xmm3, xmm3
+    psrld       xmm3, WORD_BIT          ; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..}
+
+    addps       xmm6, xmm1              ; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **)
+    addps       xmm7, xmm1              ; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **)
+    addps       xmm0, xmm1              ; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **)
+    addps       xmm5, xmm1              ; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **)
+
+    pand        xmm6, xmm3              ; xmm6=(00 -- 10 -- 20 -- 30 --)
+    pslld       xmm7, WORD_BIT          ; xmm7=(-- 01 -- 11 -- 21 -- 31)
+    pand        xmm0, xmm3              ; xmm0=(06 -- 16 -- 26 -- 36 --)
+    pslld       xmm5, WORD_BIT          ; xmm5=(-- 07 -- 17 -- 27 -- 37)
+    por         xmm6, xmm7              ; xmm6=(00 01 10 11 20 21 30 31)
+    por         xmm0, xmm5              ; xmm0=(06 07 16 17 26 27 36 37)
+
+    movaps      xmm1,  XMMWORD [wk(0)]  ; xmm1=tmp2
+    movaps      xmm3,  XMMWORD [wk(1)]  ; xmm3=tmp3
+
+    addps       xmm4, xmm2              ; xmm4=tmp4
+    movaps      xmm7, xmm1
+    movaps      xmm5, xmm3
+    addps       xmm1, xmm2              ; xmm1=data2=(02 12 22 32)
+    addps       xmm3, xmm4              ; xmm3=data4=(04 14 24 34)
+    subps       xmm7, xmm2              ; xmm7=data5=(05 15 25 35)
+    subps       xmm5, xmm4              ; xmm5=data3=(03 13 23 33)
+
+    movaps      xmm2, [rel PD_RNDINT_MAGIC]  ; xmm2=[rel PD_RNDINT_MAGIC]
+    pcmpeqd     xmm4, xmm4
+    psrld       xmm4, WORD_BIT          ; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..}
+
+    addps       xmm3, xmm2              ; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **)
+    addps       xmm7, xmm2              ; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **)
+    addps       xmm1, xmm2              ; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **)
+    addps       xmm5, xmm2              ; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **)
+
+    pand        xmm3, xmm4              ; xmm3=(04 -- 14 -- 24 -- 34 --)
+    pslld       xmm7, WORD_BIT          ; xmm7=(-- 05 -- 15 -- 25 -- 35)
+    pand        xmm1, xmm4              ; xmm1=(02 -- 12 -- 22 -- 32 --)
+    pslld       xmm5, WORD_BIT          ; xmm5=(-- 03 -- 13 -- 23 -- 33)
+    por         xmm3, xmm7              ; xmm3=(04 05 14 15 24 25 34 35)
+    por         xmm1, xmm5              ; xmm1=(02 03 12 13 22 23 32 33)
+
+    movdqa      xmm2, [rel PB_CENTERJSAMP]  ; xmm2=[rel PB_CENTERJSAMP]
+
+    packsswb    xmm6, xmm3        ; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35)
+    packsswb    xmm1, xmm0        ; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37)
+    paddb       xmm6, xmm2
+    paddb       xmm1, xmm2
+
+    movdqa      xmm4, xmm6        ; transpose coefficients(phase 2)
+    punpcklwd   xmm6, xmm1        ; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
+    punpckhwd   xmm4, xmm1        ; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
+
+    movdqa      xmm7, xmm6        ; transpose coefficients(phase 3)
+    punpckldq   xmm6, xmm4        ; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
+    punpckhdq   xmm7, xmm4        ; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
+
+    pshufd      xmm5, xmm6, 0x4E  ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
+    pshufd      xmm3, xmm7, 0x4E  ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
+
+    mov         rdxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
+    mov         rbxp, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
+    movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
+    movq        XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm7
+    mov         rdxp, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
+    mov         rbxp, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
+    movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm5
+    movq        XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm3
+
+    add         rsi, byte 4*SIZEOF_FAST_FLOAT  ; wsptr
+    add         rdi, byte 4*SIZEOF_JSAMPROW
+    dec         rcx                            ; ctr
+    jnz         near .rowloop
+
+    pop         rbx
+    uncollect_args 4
+    mov         rsp, rbp                ; rsp <- aligned rbp
+    pop         rsp                     ; rsp <- original rbp
+    pop         rbp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/x86_64/jidctfst-sse2.asm b/media/libjpeg/simd/x86_64/jidctfst-sse2.asm
new file mode 100644
index 0000000000..cb97fdfbb2
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jidctfst-sse2.asm
@@ -0,0 +1,491 @@
+;
+; jidctfst.asm - fast integer IDCT (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a fast, not so accurate integer implementation of
+; the inverse DCT (Discrete Cosine Transform). The following code is
+; based directly on the IJG's original jidctfst.c; see the jidctfst.c
+; for more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS  8  ; 14 is also OK.
+%define PASS1_BITS  2
+
+%if IFAST_SCALE_BITS != PASS1_BITS
+%error "'IFAST_SCALE_BITS' must be equal to 'PASS1_BITS'."
+%endif
+
+%if CONST_BITS == 8
+F_1_082 equ 277              ; FIX(1.082392200)
+F_1_414 equ 362              ; FIX(1.414213562)
+F_1_847 equ 473              ; FIX(1.847759065)
+F_2_613 equ 669              ; FIX(2.613125930)
+F_1_613 equ (F_2_613 - 256)  ; FIX(2.613125930) - FIX(1)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x, n)  (((x) + (1 << ((n) - 1))) >> (n))
+F_1_082 equ DESCALE(1162209775, 30 - CONST_BITS)  ; FIX(1.082392200)
+F_1_414 equ DESCALE(1518500249, 30 - CONST_BITS)  ; FIX(1.414213562)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS)  ; FIX(1.847759065)
+F_2_613 equ DESCALE(2805822602, 30 - CONST_BITS)  ; FIX(2.613125930)
+F_1_613 equ (F_2_613 - (1 << CONST_BITS))         ; FIX(2.613125930) - FIX(1)
+%endif
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
+; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
+
+%define PRE_MULTIPLY_SCALE_BITS  2
+%define CONST_SHIFT              (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
+
+    alignz      32
+    GLOBAL_DATA(jconst_idct_ifast_sse2)
+
+EXTN(jconst_idct_ifast_sse2):
+
+PW_F1414       times 8  dw  F_1_414 << CONST_SHIFT
+PW_F1847       times 8  dw  F_1_847 << CONST_SHIFT
+PW_MF1613      times 8  dw -F_1_613 << CONST_SHIFT
+PW_F1082       times 8  dw  F_1_082 << CONST_SHIFT
+PB_CENTERJSAMP times 16 db  CENTERJSAMPLE
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        64
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_ifast_sse2(void *dct_table, JCOEFPTR coef_block,
+;                      JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+; r10 = jpeg_component_info *compptr
+; r11 = JCOEFPTR coef_block
+; r12 = JSAMPARRAY output_buf
+; r13d = JDIMENSION output_col
+
+%define original_rbp  rbp + 0
+%define wk(i)         rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+                                        ; xmmword wk[WK_NUM]
+%define WK_NUM        2
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_idct_ifast_sse2)
+
+EXTN(jsimd_idct_ifast_sse2):
+    push        rbp
+    mov         rax, rsp                     ; rax = original rbp
+    sub         rsp, byte 4
+    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [rsp], rax
+    mov         rbp, rsp                     ; rbp = aligned rbp
+    lea         rsp, [wk(0)]
+    collect_args 4
+
+    ; ---- Pass 1: process columns from input.
+
+    mov         rdx, r10                ; quantptr
+    mov         rsi, r11                ; inptr
+
+%ifndef NO_ZERO_COLUMN_TEST_IFAST_SSE2
+    mov         eax, dword [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+    or          eax, dword [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+    jnz         near .columnDCT
+
+    movdqa      xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+    por         xmm0, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+    por         xmm1, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
+    por         xmm0, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+    por         xmm1, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
+    por         xmm0, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+    por         xmm1, xmm0
+    packsswb    xmm1, xmm1
+    packsswb    xmm1, xmm1
+    movd        eax, xmm1
+    test        rax, rax
+    jnz         short .columnDCT
+
+    ; -- AC terms all zero
+
+    movdqa      xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+    pmullw      xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    movdqa      xmm7, xmm0              ; xmm0=in0=(00 01 02 03 04 05 06 07)
+    punpcklwd   xmm0, xmm0              ; xmm0=(00 00 01 01 02 02 03 03)
+    punpckhwd   xmm7, xmm7              ; xmm7=(04 04 05 05 06 06 07 07)
+
+    pshufd      xmm6, xmm0, 0x00        ; xmm6=col0=(00 00 00 00 00 00 00 00)
+    pshufd      xmm2, xmm0, 0x55        ; xmm2=col1=(01 01 01 01 01 01 01 01)
+    pshufd      xmm5, xmm0, 0xAA        ; xmm5=col2=(02 02 02 02 02 02 02 02)
+    pshufd      xmm0, xmm0, 0xFF        ; xmm0=col3=(03 03 03 03 03 03 03 03)
+    pshufd      xmm1, xmm7, 0x00        ; xmm1=col4=(04 04 04 04 04 04 04 04)
+    pshufd      xmm4, xmm7, 0x55        ; xmm4=col5=(05 05 05 05 05 05 05 05)
+    pshufd      xmm3, xmm7, 0xAA        ; xmm3=col6=(06 06 06 06 06 06 06 06)
+    pshufd      xmm7, xmm7, 0xFF        ; xmm7=col7=(07 07 07 07 07 07 07 07)
+
+    movdqa      XMMWORD [wk(0)], xmm2   ; wk(0)=col1
+    movdqa      XMMWORD [wk(1)], xmm0   ; wk(1)=col3
+    jmp         near .column_end
+%endif
+.columnDCT:
+
+    ; -- Even part
+
+    movdqa      xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+    pmullw      xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
+    pmullw      xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
+    movdqa      xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
+    movdqa      xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
+    pmullw      xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
+    pmullw      xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
+
+    movdqa      xmm4, xmm0
+    movdqa      xmm5, xmm1
+    psubw       xmm0, xmm2              ; xmm0=tmp11
+    psubw       xmm1, xmm3
+    paddw       xmm4, xmm2              ; xmm4=tmp10
+    paddw       xmm5, xmm3              ; xmm5=tmp13
+
+    psllw       xmm1, PRE_MULTIPLY_SCALE_BITS
+    pmulhw      xmm1, [rel PW_F1414]
+    psubw       xmm1, xmm5              ; xmm1=tmp12
+
+    movdqa      xmm6, xmm4
+    movdqa      xmm7, xmm0
+    psubw       xmm4, xmm5              ; xmm4=tmp3
+    psubw       xmm0, xmm1              ; xmm0=tmp2
+    paddw       xmm6, xmm5              ; xmm6=tmp0
+    paddw       xmm7, xmm1              ; xmm7=tmp1
+
+    movdqa      XMMWORD [wk(1)], xmm4   ; wk(1)=tmp3
+    movdqa      XMMWORD [wk(0)], xmm0   ; wk(0)=tmp2
+
+    ; -- Odd part
+
+    movdqa      xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+    movdqa      xmm3, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+    pmullw      xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
+    pmullw      xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
+    movdqa      xmm5, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+    pmullw      xmm5, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
+    pmullw      xmm1, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
+
+    movdqa      xmm4, xmm2
+    movdqa      xmm0, xmm5
+    psubw       xmm2, xmm1              ; xmm2=z12
+    psubw       xmm5, xmm3              ; xmm5=z10
+    paddw       xmm4, xmm1              ; xmm4=z11
+    paddw       xmm0, xmm3              ; xmm0=z13
+
+    movdqa      xmm1, xmm5              ; xmm1=z10(unscaled)
+    psllw       xmm2, PRE_MULTIPLY_SCALE_BITS
+    psllw       xmm5, PRE_MULTIPLY_SCALE_BITS
+
+    movdqa      xmm3, xmm4
+    psubw       xmm4, xmm0
+    paddw       xmm3, xmm0              ; xmm3=tmp7
+
+    psllw       xmm4, PRE_MULTIPLY_SCALE_BITS
+    pmulhw      xmm4, [rel PW_F1414]    ; xmm4=tmp11
+
+    ; To avoid overflow...
+    ;
+    ; (Original)
+    ; tmp12 = -2.613125930 * z10 + z5;
+    ;
+    ; (This implementation)
+    ; tmp12 = (-1.613125930 - 1) * z10 + z5;
+    ;       = -1.613125930 * z10 - z10 + z5;
+
+    movdqa      xmm0, xmm5
+    paddw       xmm5, xmm2
+    pmulhw      xmm5, [rel PW_F1847]    ; xmm5=z5
+    pmulhw      xmm0, [rel PW_MF1613]
+    pmulhw      xmm2, [rel PW_F1082]
+    psubw       xmm0, xmm1
+    psubw       xmm2, xmm5              ; xmm2=tmp10
+    paddw       xmm0, xmm5              ; xmm0=tmp12
+
+    ; -- Final output stage
+
+    psubw       xmm0, xmm3              ; xmm0=tmp6
+    movdqa      xmm1, xmm6
+    movdqa      xmm5, xmm7
+    paddw       xmm6, xmm3              ; xmm6=data0=(00 01 02 03 04 05 06 07)
+    paddw       xmm7, xmm0              ; xmm7=data1=(10 11 12 13 14 15 16 17)
+    psubw       xmm1, xmm3              ; xmm1=data7=(70 71 72 73 74 75 76 77)
+    psubw       xmm5, xmm0              ; xmm5=data6=(60 61 62 63 64 65 66 67)
+    psubw       xmm4, xmm0              ; xmm4=tmp5
+
+    movdqa      xmm3, xmm6              ; transpose coefficients(phase 1)
+    punpcklwd   xmm6, xmm7              ; xmm6=(00 10 01 11 02 12 03 13)
+    punpckhwd   xmm3, xmm7              ; xmm3=(04 14 05 15 06 16 07 17)
+    movdqa      xmm0, xmm5              ; transpose coefficients(phase 1)
+    punpcklwd   xmm5, xmm1              ; xmm5=(60 70 61 71 62 72 63 73)
+    punpckhwd   xmm0, xmm1              ; xmm0=(64 74 65 75 66 76 67 77)
+
+    movdqa      xmm7, XMMWORD [wk(0)]   ; xmm7=tmp2
+    movdqa      xmm1, XMMWORD [wk(1)]   ; xmm1=tmp3
+
+    movdqa      XMMWORD [wk(0)], xmm5   ; wk(0)=(60 70 61 71 62 72 63 73)
+    movdqa      XMMWORD [wk(1)], xmm0   ; wk(1)=(64 74 65 75 66 76 67 77)
+
+    paddw       xmm2, xmm4              ; xmm2=tmp4
+    movdqa      xmm5, xmm7
+    movdqa      xmm0, xmm1
+    paddw       xmm7, xmm4              ; xmm7=data2=(20 21 22 23 24 25 26 27)
+    paddw       xmm1, xmm2              ; xmm1=data4=(40 41 42 43 44 45 46 47)
+    psubw       xmm5, xmm4              ; xmm5=data5=(50 51 52 53 54 55 56 57)
+    psubw       xmm0, xmm2              ; xmm0=data3=(30 31 32 33 34 35 36 37)
+
+    movdqa      xmm4, xmm7              ; transpose coefficients(phase 1)
+    punpcklwd   xmm7, xmm0              ; xmm7=(20 30 21 31 22 32 23 33)
+    punpckhwd   xmm4, xmm0              ; xmm4=(24 34 25 35 26 36 27 37)
+    movdqa      xmm2, xmm1              ; transpose coefficients(phase 1)
+    punpcklwd   xmm1, xmm5              ; xmm1=(40 50 41 51 42 52 43 53)
+    punpckhwd   xmm2, xmm5              ; xmm2=(44 54 45 55 46 56 47 57)
+
+    movdqa      xmm0, xmm3              ; transpose coefficients(phase 2)
+    punpckldq   xmm3, xmm4              ; xmm3=(04 14 24 34 05 15 25 35)
+    punpckhdq   xmm0, xmm4              ; xmm0=(06 16 26 36 07 17 27 37)
+    movdqa      xmm5, xmm6              ; transpose coefficients(phase 2)
+    punpckldq   xmm6, xmm7              ; xmm6=(00 10 20 30 01 11 21 31)
+    punpckhdq   xmm5, xmm7              ; xmm5=(02 12 22 32 03 13 23 33)
+
+    movdqa      xmm4, XMMWORD [wk(0)]   ; xmm4=(60 70 61 71 62 72 63 73)
+    movdqa      xmm7, XMMWORD [wk(1)]   ; xmm7=(64 74 65 75 66 76 67 77)
+
+    movdqa      XMMWORD [wk(0)], xmm3   ; wk(0)=(04 14 24 34 05 15 25 35)
+    movdqa      XMMWORD [wk(1)], xmm0   ; wk(1)=(06 16 26 36 07 17 27 37)
+
+    movdqa      xmm3, xmm1              ; transpose coefficients(phase 2)
+    punpckldq   xmm1, xmm4              ; xmm1=(40 50 60 70 41 51 61 71)
+    punpckhdq   xmm3, xmm4              ; xmm3=(42 52 62 72 43 53 63 73)
+    movdqa      xmm0, xmm2              ; transpose coefficients(phase 2)
+    punpckldq   xmm2, xmm7              ; xmm2=(44 54 64 74 45 55 65 75)
+    punpckhdq   xmm0, xmm7              ; xmm0=(46 56 66 76 47 57 67 77)
+
+    movdqa      xmm4, xmm6              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm6, xmm1              ; xmm6=col0=(00 10 20 30 40 50 60 70)
+    punpckhqdq  xmm4, xmm1              ; xmm4=col1=(01 11 21 31 41 51 61 71)
+    movdqa      xmm7, xmm5              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm5, xmm3              ; xmm5=col2=(02 12 22 32 42 52 62 72)
+    punpckhqdq  xmm7, xmm3              ; xmm7=col3=(03 13 23 33 43 53 63 73)
+
+    movdqa      xmm1, XMMWORD [wk(0)]   ; xmm1=(04 14 24 34 05 15 25 35)
+    movdqa      xmm3, XMMWORD [wk(1)]   ; xmm3=(06 16 26 36 07 17 27 37)
+
+    movdqa      XMMWORD [wk(0)], xmm4   ; wk(0)=col1
+    movdqa      XMMWORD [wk(1)], xmm7   ; wk(1)=col3
+
+    movdqa      xmm4, xmm1              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm1, xmm2              ; xmm1=col4=(04 14 24 34 44 54 64 74)
+    punpckhqdq  xmm4, xmm2              ; xmm4=col5=(05 15 25 35 45 55 65 75)
+    movdqa      xmm7, xmm3              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm3, xmm0              ; xmm3=col6=(06 16 26 36 46 56 66 76)
+    punpckhqdq  xmm7, xmm0              ; xmm7=col7=(07 17 27 37 47 57 67 77)
+.column_end:
+
+    ; -- Prefetch the next coefficient block
+
+    prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+    prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+    prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+    prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+    ; ---- Pass 2: process rows from work array, store into output array.
+
+    mov         rax, [original_rbp]
+    mov         rdi, r12                ; (JSAMPROW *)
+    mov         eax, r13d
+
+    ; -- Even part
+
+    ; xmm6=col0, xmm5=col2, xmm1=col4, xmm3=col6
+
+    movdqa      xmm2, xmm6
+    movdqa      xmm0, xmm5
+    psubw       xmm6, xmm1              ; xmm6=tmp11
+    psubw       xmm5, xmm3
+    paddw       xmm2, xmm1              ; xmm2=tmp10
+    paddw       xmm0, xmm3              ; xmm0=tmp13
+
+    psllw       xmm5, PRE_MULTIPLY_SCALE_BITS
+    pmulhw      xmm5, [rel PW_F1414]
+    psubw       xmm5, xmm0              ; xmm5=tmp12
+
+    movdqa      xmm1, xmm2
+    movdqa      xmm3, xmm6
+    psubw       xmm2, xmm0              ; xmm2=tmp3
+    psubw       xmm6, xmm5              ; xmm6=tmp2
+    paddw       xmm1, xmm0              ; xmm1=tmp0
+    paddw       xmm3, xmm5              ; xmm3=tmp1
+
+    movdqa      xmm0, XMMWORD [wk(0)]   ; xmm0=col1
+    movdqa      xmm5, XMMWORD [wk(1)]   ; xmm5=col3
+
+    movdqa      XMMWORD [wk(0)], xmm2   ; wk(0)=tmp3
+    movdqa      XMMWORD [wk(1)], xmm6   ; wk(1)=tmp2
+
+    ; -- Odd part
+
+    ; xmm0=col1, xmm5=col3, xmm4=col5, xmm7=col7
+
+    movdqa      xmm2, xmm0
+    movdqa      xmm6, xmm4
+    psubw       xmm0, xmm7              ; xmm0=z12
+    psubw       xmm4, xmm5              ; xmm4=z10
+    paddw       xmm2, xmm7              ; xmm2=z11
+    paddw       xmm6, xmm5              ; xmm6=z13
+
+    movdqa      xmm7, xmm4              ; xmm7=z10(unscaled)
+    psllw       xmm0, PRE_MULTIPLY_SCALE_BITS
+    psllw       xmm4, PRE_MULTIPLY_SCALE_BITS
+
+    movdqa      xmm5, xmm2
+    psubw       xmm2, xmm6
+    paddw       xmm5, xmm6              ; xmm5=tmp7
+
+    psllw       xmm2, PRE_MULTIPLY_SCALE_BITS
+    pmulhw      xmm2, [rel PW_F1414]    ; xmm2=tmp11
+
+    ; To avoid overflow...
+    ;
+    ; (Original)
+    ; tmp12 = -2.613125930 * z10 + z5;
+    ;
+    ; (This implementation)
+    ; tmp12 = (-1.613125930 - 1) * z10 + z5;
+    ;       = -1.613125930 * z10 - z10 + z5;
+
+    movdqa      xmm6, xmm4
+    paddw       xmm4, xmm0
+    pmulhw      xmm4, [rel PW_F1847]    ; xmm4=z5
+    pmulhw      xmm6, [rel PW_MF1613]
+    pmulhw      xmm0, [rel PW_F1082]
+    psubw       xmm6, xmm7
+    psubw       xmm0, xmm4              ; xmm0=tmp10
+    paddw       xmm6, xmm4              ; xmm6=tmp12
+
+    ; -- Final output stage
+
+    psubw       xmm6, xmm5              ; xmm6=tmp6
+    movdqa      xmm7, xmm1
+    movdqa      xmm4, xmm3
+    paddw       xmm1, xmm5              ; xmm1=data0=(00 10 20 30 40 50 60 70)
+    paddw       xmm3, xmm6              ; xmm3=data1=(01 11 21 31 41 51 61 71)
+    psraw       xmm1, (PASS1_BITS+3)    ; descale
+    psraw       xmm3, (PASS1_BITS+3)    ; descale
+    psubw       xmm7, xmm5              ; xmm7=data7=(07 17 27 37 47 57 67 77)
+    psubw       xmm4, xmm6              ; xmm4=data6=(06 16 26 36 46 56 66 76)
+    psraw       xmm7, (PASS1_BITS+3)    ; descale
+    psraw       xmm4, (PASS1_BITS+3)    ; descale
+    psubw       xmm2, xmm6              ; xmm2=tmp5
+
+    packsswb    xmm1, xmm4        ; xmm1=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
+    packsswb    xmm3, xmm7        ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
+
+    movdqa      xmm5, XMMWORD [wk(1)]   ; xmm5=tmp2
+    movdqa      xmm6, XMMWORD [wk(0)]   ; xmm6=tmp3
+
+    paddw       xmm0, xmm2              ; xmm0=tmp4
+    movdqa      xmm4, xmm5
+    movdqa      xmm7, xmm6
+    paddw       xmm5, xmm2              ; xmm5=data2=(02 12 22 32 42 52 62 72)
+    paddw       xmm6, xmm0              ; xmm6=data4=(04 14 24 34 44 54 64 74)
+    psraw       xmm5, (PASS1_BITS+3)    ; descale
+    psraw       xmm6, (PASS1_BITS+3)    ; descale
+    psubw       xmm4, xmm2              ; xmm4=data5=(05 15 25 35 45 55 65 75)
+    psubw       xmm7, xmm0              ; xmm7=data3=(03 13 23 33 43 53 63 73)
+    psraw       xmm4, (PASS1_BITS+3)    ; descale
+    psraw       xmm7, (PASS1_BITS+3)    ; descale
+
+    movdqa      xmm2, [rel PB_CENTERJSAMP]  ; xmm2=[rel PB_CENTERJSAMP]
+
+    packsswb    xmm5, xmm6        ; xmm5=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
+    packsswb    xmm7, xmm4        ; xmm7=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)
+
+    paddb       xmm1, xmm2
+    paddb       xmm3, xmm2
+    paddb       xmm5, xmm2
+    paddb       xmm7, xmm2
+
+    movdqa      xmm0, xmm1        ; transpose coefficients(phase 1)
+    punpcklbw   xmm1, xmm3        ; xmm1=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
+    punpckhbw   xmm0, xmm3        ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
+    movdqa      xmm6, xmm5        ; transpose coefficients(phase 1)
+    punpcklbw   xmm5, xmm7        ; xmm5=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
+    punpckhbw   xmm6, xmm7        ; xmm6=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)
+
+    movdqa      xmm4, xmm1        ; transpose coefficients(phase 2)
+    punpcklwd   xmm1, xmm5        ; xmm1=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
+    punpckhwd   xmm4, xmm5        ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
+    movdqa      xmm2, xmm6        ; transpose coefficients(phase 2)
+    punpcklwd   xmm6, xmm0        ; xmm6=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
+    punpckhwd   xmm2, xmm0        ; xmm2=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)
+
+    movdqa      xmm3, xmm1        ; transpose coefficients(phase 3)
+    punpckldq   xmm1, xmm6        ; xmm1=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
+    punpckhdq   xmm3, xmm6        ; xmm3=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
+    movdqa      xmm7, xmm4        ; transpose coefficients(phase 3)
+    punpckldq   xmm4, xmm2        ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
+    punpckhdq   xmm7, xmm2        ; xmm7=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
+
+    pshufd      xmm5, xmm1, 0x4E  ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
+    pshufd      xmm0, xmm3, 0x4E  ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
+    pshufd      xmm6, xmm4, 0x4E  ; xmm6=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
+    pshufd      xmm2, xmm7, 0x4E  ; xmm2=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
+
+    mov         rdxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
+    mov         rsip, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
+    movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm1
+    movq        XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3
+    mov         rdxp, JSAMPROW [rdi+4*SIZEOF_JSAMPROW]
+    mov         rsip, JSAMPROW [rdi+6*SIZEOF_JSAMPROW]
+    movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4
+    movq        XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm7
+
+    mov         rdxp, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
+    mov         rsip, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
+    movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm5
+    movq        XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm0
+    mov         rdxp, JSAMPROW [rdi+5*SIZEOF_JSAMPROW]
+    mov         rsip, JSAMPROW [rdi+7*SIZEOF_JSAMPROW]
+    movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
+    movq        XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm2
+
+    uncollect_args 4
+    mov         rsp, rbp                ; rsp <- aligned rbp
+    pop         rsp                     ; rsp <- original rbp
+    pop         rbp
+    ret
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/x86_64/jidctint-avx2.asm b/media/libjpeg/simd/x86_64/jidctint-avx2.asm
new file mode 100644
index 0000000000..ca7e317f6e
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jidctint-avx2.asm
@@ -0,0 +1,418 @@
+;
+; jidctint.asm - accurate integer IDCT (64-bit AVX2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, 2018, 2020, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a slower but more accurate integer implementation of the
+; inverse DCT (Discrete Cosine Transform). The following code is based
+; directly on the IJG's original jidctint.c; see the jidctint.c for
+; more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS  13
+%define PASS1_BITS  2
+
+%define DESCALE_P1  (CONST_BITS - PASS1_BITS)
+%define DESCALE_P2  (CONST_BITS + PASS1_BITS + 3)
+
+%if CONST_BITS == 13
+F_0_298 equ  2446  ; FIX(0.298631336)
+F_0_390 equ  3196  ; FIX(0.390180644)
+F_0_541 equ  4433  ; FIX(0.541196100)
+F_0_765 equ  6270  ; FIX(0.765366865)
+F_0_899 equ  7373  ; FIX(0.899976223)
+F_1_175 equ  9633  ; FIX(1.175875602)
+F_1_501 equ 12299  ; FIX(1.501321110)
+F_1_847 equ 15137  ; FIX(1.847759065)
+F_1_961 equ 16069  ; FIX(1.961570560)
+F_2_053 equ 16819  ; FIX(2.053119869)
+F_2_562 equ 20995  ; FIX(2.562915447)
+F_3_072 equ 25172  ; FIX(3.072711026)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x, n)  (((x) + (1 << ((n) - 1))) >> (n))
+F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS)  ; FIX(0.298631336)
+F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS)  ; FIX(0.390180644)
+F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS)  ; FIX(0.541196100)
+F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS)  ; FIX(0.765366865)
+F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS)  ; FIX(0.899976223)
+F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS)  ; FIX(1.175875602)
+F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS)  ; FIX(1.501321110)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS)  ; FIX(1.847759065)
+F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS)  ; FIX(1.961570560)
+F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS)  ; FIX(2.053119869)
+F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS)  ; FIX(2.562915447)
+F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS)  ; FIX(3.072711026)
+%endif
+
+; --------------------------------------------------------------------------
+; In-place 8x8x16-bit inverse matrix transpose using AVX2 instructions
+; %1-%4: Input/output registers
+; %5-%8: Temp registers
+
+%macro dotranspose 8
+    ; %5=(00 10 20 30 40 50 60 70  01 11 21 31 41 51 61 71)
+    ; %6=(03 13 23 33 43 53 63 73  02 12 22 32 42 52 62 72)
+    ; %7=(04 14 24 34 44 54 64 74  05 15 25 35 45 55 65 75)
+    ; %8=(07 17 27 37 47 57 67 77  06 16 26 36 46 56 66 76)
+
+    vpermq      %5, %1, 0xD8
+    vpermq      %6, %2, 0x72
+    vpermq      %7, %3, 0xD8
+    vpermq      %8, %4, 0x72
+    ; transpose coefficients(phase 1)
+    ; %5=(00 10 20 30 01 11 21 31  40 50 60 70 41 51 61 71)
+    ; %6=(02 12 22 32 03 13 23 33  42 52 62 72 43 53 63 73)
+    ; %7=(04 14 24 34 05 15 25 35  44 54 64 74 45 55 65 75)
+    ; %8=(06 16 26 36 07 17 27 37  46 56 66 76 47 57 67 77)
+
+    vpunpcklwd  %1, %5, %6
+    vpunpckhwd  %2, %5, %6
+    vpunpcklwd  %3, %7, %8
+    vpunpckhwd  %4, %7, %8
+    ; transpose coefficients(phase 2)
+    ; %1=(00 02 10 12 20 22 30 32  40 42 50 52 60 62 70 72)
+    ; %2=(01 03 11 13 21 23 31 33  41 43 51 53 61 63 71 73)
+    ; %3=(04 06 14 16 24 26 34 36  44 46 54 56 64 66 74 76)
+    ; %4=(05 07 15 17 25 27 35 37  45 47 55 57 65 67 75 77)
+
+    vpunpcklwd  %5, %1, %2
+    vpunpcklwd  %6, %3, %4
+    vpunpckhwd  %7, %1, %2
+    vpunpckhwd  %8, %3, %4
+    ; transpose coefficients(phase 3)
+    ; %5=(00 01 02 03 10 11 12 13  40 41 42 43 50 51 52 53)
+    ; %6=(04 05 06 07 14 15 16 17  44 45 46 47 54 55 56 57)
+    ; %7=(20 21 22 23 30 31 32 33  60 61 62 63 70 71 72 73)
+    ; %8=(24 25 26 27 34 35 36 37  64 65 66 67 74 75 76 77)
+
+    vpunpcklqdq %1, %5, %6
+    vpunpckhqdq %2, %5, %6
+    vpunpcklqdq %3, %7, %8
+    vpunpckhqdq %4, %7, %8
+    ; transpose coefficients(phase 4)
+    ; %1=(00 01 02 03 04 05 06 07  40 41 42 43 44 45 46 47)
+    ; %2=(10 11 12 13 14 15 16 17  50 51 52 53 54 55 56 57)
+    ; %3=(20 21 22 23 24 25 26 27  60 61 62 63 64 65 66 67)
+    ; %4=(30 31 32 33 34 35 36 37  70 71 72 73 74 75 76 77)
+%endmacro
+
+; --------------------------------------------------------------------------
+; In-place 8x8x16-bit accurate integer inverse DCT using AVX2 instructions
+; %1-%4:  Input/output registers
+; %5-%12: Temp registers
+; %9:     Pass (1 or 2)
+
+%macro dodct 13
+    ; -- Even part
+
+    ; (Original)
+    ; z1 = (z2 + z3) * 0.541196100;
+    ; tmp2 = z1 + z3 * -1.847759065;
+    ; tmp3 = z1 + z2 * 0.765366865;
+    ;
+    ; (This implementation)
+    ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
+    ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
+
+    vperm2i128  %6, %3, %3, 0x01        ; %6=in6_2
+    vpunpcklwd  %5, %3, %6              ; %5=in26_62L
+    vpunpckhwd  %6, %3, %6              ; %6=in26_62H
+    vpmaddwd    %5, %5, [rel PW_F130_F054_MF130_F054]  ; %5=tmp3_2L
+    vpmaddwd    %6, %6, [rel PW_F130_F054_MF130_F054]  ; %6=tmp3_2H
+
+    vperm2i128  %7, %1, %1, 0x01        ; %7=in4_0
+    vpsignw     %1, %1, [rel PW_1_NEG1]
+    vpaddw      %7, %7, %1              ; %7=(in0+in4)_(in0-in4)
+
+    vpxor       %1, %1, %1
+    vpunpcklwd  %8, %1, %7              ; %8=tmp0_1L
+    vpunpckhwd  %1, %1, %7              ; %1=tmp0_1H
+    vpsrad      %8, %8, (16-CONST_BITS)  ; vpsrad %8,16 & vpslld %8,CONST_BITS
+    vpsrad      %1, %1, (16-CONST_BITS)  ; vpsrad %1,16 & vpslld %1,CONST_BITS
+
+    vpsubd      %11, %8, %5             ; %11=tmp0_1L-tmp3_2L=tmp13_12L
+    vpaddd      %9, %8, %5              ; %9=tmp0_1L+tmp3_2L=tmp10_11L
+    vpsubd      %12, %1, %6             ; %12=tmp0_1H-tmp3_2H=tmp13_12H
+    vpaddd      %10, %1, %6             ; %10=tmp0_1H+tmp3_2H=tmp10_11H
+
+    ; -- Odd part
+
+    vpaddw      %1, %4, %2              ; %1=in7_5+in3_1=z3_4
+
+    ; (Original)
+    ; z5 = (z3 + z4) * 1.175875602;
+    ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+    ; z3 += z5;  z4 += z5;
+    ;
+    ; (This implementation)
+    ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+    ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+    vperm2i128  %8, %1, %1, 0x01        ; %8=z4_3
+    vpunpcklwd  %7, %1, %8              ; %7=z34_43L
+    vpunpckhwd  %8, %1, %8              ; %8=z34_43H
+    vpmaddwd    %7, %7, [rel PW_MF078_F117_F078_F117]  ; %7=z3_4L
+    vpmaddwd    %8, %8, [rel PW_MF078_F117_F078_F117]  ; %8=z3_4H
+
+    ; (Original)
+    ; z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;
+    ; tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;
+    ; tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;
+    ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+    ; tmp0 += z1 + z3;  tmp1 += z2 + z4;
+    ; tmp2 += z2 + z3;  tmp3 += z1 + z4;
+    ;
+    ; (This implementation)
+    ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+    ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+    ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+    ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+    ; tmp0 += z3;  tmp1 += z4;
+    ; tmp2 += z3;  tmp3 += z4;
+
+    vperm2i128  %2, %2, %2, 0x01        ; %2=in1_3
+    vpunpcklwd  %3, %4, %2              ; %3=in71_53L
+    vpunpckhwd  %4, %4, %2              ; %4=in71_53H
+
+    vpmaddwd    %5, %3, [rel PW_MF060_MF089_MF050_MF256]  ; %5=tmp0_1L
+    vpmaddwd    %6, %4, [rel PW_MF060_MF089_MF050_MF256]  ; %6=tmp0_1H
+    vpaddd      %5, %5, %7              ; %5=tmp0_1L+z3_4L=tmp0_1L
+    vpaddd      %6, %6, %8              ; %6=tmp0_1H+z3_4H=tmp0_1H
+
+    vpmaddwd    %3, %3, [rel PW_MF089_F060_MF256_F050]  ; %3=tmp3_2L
+    vpmaddwd    %4, %4, [rel PW_MF089_F060_MF256_F050]  ; %4=tmp3_2H
+    vperm2i128  %7, %7, %7, 0x01        ; %7=z4_3L
+    vperm2i128  %8, %8, %8, 0x01        ; %8=z4_3H
+    vpaddd      %7, %3, %7              ; %7=tmp3_2L+z4_3L=tmp3_2L
+    vpaddd      %8, %4, %8              ; %8=tmp3_2H+z4_3H=tmp3_2H
+
+    ; -- Final output stage
+
+    vpaddd      %1, %9, %7              ; %1=tmp10_11L+tmp3_2L=data0_1L
+    vpaddd      %2, %10, %8             ; %2=tmp10_11H+tmp3_2H=data0_1H
+    vpaddd      %1, %1, [rel PD_DESCALE_P %+ %13]
+    vpaddd      %2, %2, [rel PD_DESCALE_P %+ %13]
+    vpsrad      %1, %1, DESCALE_P %+ %13
+    vpsrad      %2, %2, DESCALE_P %+ %13
+    vpackssdw   %1, %1, %2              ; %1=data0_1
+
+    vpsubd      %3, %9, %7              ; %3=tmp10_11L-tmp3_2L=data7_6L
+    vpsubd      %4, %10, %8             ; %4=tmp10_11H-tmp3_2H=data7_6H
+    vpaddd      %3, %3, [rel PD_DESCALE_P %+ %13]
+    vpaddd      %4, %4, [rel PD_DESCALE_P %+ %13]
+    vpsrad      %3, %3, DESCALE_P %+ %13
+    vpsrad      %4, %4, DESCALE_P %+ %13
+    vpackssdw   %4, %3, %4              ; %4=data7_6
+
+    vpaddd      %7, %11, %5             ; %7=tmp13_12L+tmp0_1L=data3_2L
+    vpaddd      %8, %12, %6             ; %8=tmp13_12H+tmp0_1H=data3_2H
+    vpaddd      %7, %7, [rel PD_DESCALE_P %+ %13]
+    vpaddd      %8, %8, [rel PD_DESCALE_P %+ %13]
+    vpsrad      %7, %7, DESCALE_P %+ %13
+    vpsrad      %8, %8, DESCALE_P %+ %13
+    vpackssdw   %2, %7, %8              ; %2=data3_2
+
+    vpsubd      %7, %11, %5             ; %7=tmp13_12L-tmp0_1L=data4_5L
+    vpsubd      %8, %12, %6             ; %8=tmp13_12H-tmp0_1H=data4_5H
+    vpaddd      %7, %7, [rel PD_DESCALE_P %+ %13]
+    vpaddd      %8, %8, [rel PD_DESCALE_P %+ %13]
+    vpsrad      %7, %7, DESCALE_P %+ %13
+    vpsrad      %8, %8, DESCALE_P %+ %13
+    vpackssdw   %3, %7, %8              ; %3=data4_5
+%endmacro
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_idct_islow_avx2)
+
+EXTN(jconst_idct_islow_avx2):
+
+PW_F130_F054_MF130_F054    times 4  dw  (F_0_541 + F_0_765),  F_0_541
+                           times 4  dw  (F_0_541 - F_1_847),  F_0_541
+PW_MF078_F117_F078_F117    times 4  dw  (F_1_175 - F_1_961),  F_1_175
+                           times 4  dw  (F_1_175 - F_0_390),  F_1_175
+PW_MF060_MF089_MF050_MF256 times 4  dw  (F_0_298 - F_0_899), -F_0_899
+                           times 4  dw  (F_2_053 - F_2_562), -F_2_562
+PW_MF089_F060_MF256_F050   times 4  dw -F_0_899, (F_1_501 - F_0_899)
+                           times 4  dw -F_2_562, (F_3_072 - F_2_562)
+PD_DESCALE_P1              times 8  dd  1 << (DESCALE_P1 - 1)
+PD_DESCALE_P2              times 8  dd  1 << (DESCALE_P2 - 1)
+PB_CENTERJSAMP             times 32 db  CENTERJSAMPLE
+PW_1_NEG1                  times 8  dw  1
+                           times 8  dw -1
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        64
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_islow_avx2(void *dct_table, JCOEFPTR coef_block,
+;                       JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+; r10 = jpeg_component_info *compptr
+; r11 = JCOEFPTR coef_block
+; r12 = JSAMPARRAY output_buf
+; r13d = JDIMENSION output_col
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_idct_islow_avx2)
+
+EXTN(jsimd_idct_islow_avx2):
+    push        rbp
+    mov         rax, rsp                     ; rax = original rbp
+    mov         rbp, rsp                     ; rbp = aligned rbp
+    push_xmm    4
+    collect_args 4
+
+    ; ---- Pass 1: process columns.
+
+%ifndef NO_ZERO_COLUMN_TEST_ISLOW_AVX2
+    mov         eax, dword [DWBLOCK(1,0,r11,SIZEOF_JCOEF)]
+    or          eax, dword [DWBLOCK(2,0,r11,SIZEOF_JCOEF)]
+    jnz         near .columnDCT
+
+    movdqa      xmm0, XMMWORD [XMMBLOCK(1,0,r11,SIZEOF_JCOEF)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(2,0,r11,SIZEOF_JCOEF)]
+    vpor        xmm0, xmm0, XMMWORD [XMMBLOCK(3,0,r11,SIZEOF_JCOEF)]
+    vpor        xmm1, xmm1, XMMWORD [XMMBLOCK(4,0,r11,SIZEOF_JCOEF)]
+    vpor        xmm0, xmm0, XMMWORD [XMMBLOCK(5,0,r11,SIZEOF_JCOEF)]
+    vpor        xmm1, xmm1, XMMWORD [XMMBLOCK(6,0,r11,SIZEOF_JCOEF)]
+    vpor        xmm0, xmm0, XMMWORD [XMMBLOCK(7,0,r11,SIZEOF_JCOEF)]
+    vpor        xmm1, xmm1, xmm0
+    vpacksswb   xmm1, xmm1, xmm1
+    vpacksswb   xmm1, xmm1, xmm1
+    movd        eax, xmm1
+    test        rax, rax
+    jnz         short .columnDCT
+
+    ; -- AC terms all zero
+
+    movdqa      xmm5, XMMWORD [XMMBLOCK(0,0,r11,SIZEOF_JCOEF)]
+    vpmullw     xmm5, xmm5, XMMWORD [XMMBLOCK(0,0,r10,SIZEOF_ISLOW_MULT_TYPE)]
+
+    vpsllw      xmm5, xmm5, PASS1_BITS
+
+    vpunpcklwd  xmm4, xmm5, xmm5        ; xmm4=(00 00 01 01 02 02 03 03)
+    vpunpckhwd  xmm5, xmm5, xmm5        ; xmm5=(04 04 05 05 06 06 07 07)
+    vinserti128 ymm4, ymm4, xmm5, 1
+
+    vpshufd     ymm0, ymm4, 0x00        ; ymm0=col0_4=(00 00 00 00 00 00 00 00  04 04 04 04 04 04 04 04)
+    vpshufd     ymm1, ymm4, 0x55        ; ymm1=col1_5=(01 01 01 01 01 01 01 01  05 05 05 05 05 05 05 05)
+    vpshufd     ymm2, ymm4, 0xAA        ; ymm2=col2_6=(02 02 02 02 02 02 02 02  06 06 06 06 06 06 06 06)
+    vpshufd     ymm3, ymm4, 0xFF        ; ymm3=col3_7=(03 03 03 03 03 03 03 03  07 07 07 07 07 07 07 07)
+
+    jmp         near .column_end
+%endif
+.columnDCT:
+
+    vmovdqu     ymm4, YMMWORD [YMMBLOCK(0,0,r11,SIZEOF_JCOEF)]  ; ymm4=in0_1
+    vmovdqu     ymm5, YMMWORD [YMMBLOCK(2,0,r11,SIZEOF_JCOEF)]  ; ymm5=in2_3
+    vmovdqu     ymm6, YMMWORD [YMMBLOCK(4,0,r11,SIZEOF_JCOEF)]  ; ymm6=in4_5
+    vmovdqu     ymm7, YMMWORD [YMMBLOCK(6,0,r11,SIZEOF_JCOEF)]  ; ymm7=in6_7
+    vpmullw     ymm4, ymm4, YMMWORD [YMMBLOCK(0,0,r10,SIZEOF_ISLOW_MULT_TYPE)]
+    vpmullw     ymm5, ymm5, YMMWORD [YMMBLOCK(2,0,r10,SIZEOF_ISLOW_MULT_TYPE)]
+    vpmullw     ymm6, ymm6, YMMWORD [YMMBLOCK(4,0,r10,SIZEOF_ISLOW_MULT_TYPE)]
+    vpmullw     ymm7, ymm7, YMMWORD [YMMBLOCK(6,0,r10,SIZEOF_ISLOW_MULT_TYPE)]
+
+    vperm2i128  ymm0, ymm4, ymm6, 0x20  ; ymm0=in0_4
+    vperm2i128  ymm1, ymm5, ymm4, 0x31  ; ymm1=in3_1
+    vperm2i128  ymm2, ymm5, ymm7, 0x20  ; ymm2=in2_6
+    vperm2i128  ymm3, ymm7, ymm6, 0x31  ; ymm3=in7_5
+
+    dodct ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, ymm8, ymm9, ymm10, ymm11, 1
+    ; ymm0=data0_1, ymm1=data3_2, ymm2=data4_5, ymm3=data7_6
+
+    dotranspose ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7
+    ; ymm0=data0_4, ymm1=data1_5, ymm2=data2_6, ymm3=data3_7
+
+.column_end:
+
+    ; -- Prefetch the next coefficient block
+
+    prefetchnta [r11 + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+    prefetchnta [r11 + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+    prefetchnta [r11 + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+    prefetchnta [r11 + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+    ; ---- Pass 2: process rows.
+
+    vperm2i128  ymm4, ymm3, ymm1, 0x31  ; ymm3=in7_5
+    vperm2i128  ymm1, ymm3, ymm1, 0x20  ; ymm1=in3_1
+
+    dodct ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7, ymm8, ymm9, ymm10, ymm11, 2
+    ; ymm0=data0_1, ymm1=data3_2, ymm2=data4_5, ymm4=data7_6
+
+    dotranspose ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7
+    ; ymm0=data0_4, ymm1=data1_5, ymm2=data2_6, ymm4=data3_7
+
+    vpacksswb   ymm0, ymm0, ymm1        ; ymm0=data01_45
+    vpacksswb   ymm1, ymm2, ymm4        ; ymm1=data23_67
+    vpaddb      ymm0, ymm0, [rel PB_CENTERJSAMP]
+    vpaddb      ymm1, ymm1, [rel PB_CENTERJSAMP]
+
+    vextracti128 xmm6, ymm1, 1          ; xmm3=data67
+    vextracti128 xmm4, ymm0, 1          ; xmm2=data45
+    vextracti128 xmm2, ymm1, 0          ; xmm1=data23
+    vextracti128 xmm0, ymm0, 0          ; xmm0=data01
+
+    vpshufd     xmm1, xmm0, 0x4E  ; xmm1=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
+    vpshufd     xmm3, xmm2, 0x4E  ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
+    vpshufd     xmm5, xmm4, 0x4E  ; xmm5=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
+    vpshufd     xmm7, xmm6, 0x4E  ; xmm7=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
+
+    vzeroupper
+
+    mov         eax, r13d
+
+    mov         rdxp, JSAMPROW [r12+0*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         rsip, JSAMPROW [r12+1*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm0
+    movq        XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm1
+
+    mov         rdxp, JSAMPROW [r12+2*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         rsip, JSAMPROW [r12+3*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm2
+    movq        XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3
+
+    mov         rdxp, JSAMPROW [r12+4*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         rsip, JSAMPROW [r12+5*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4
+    movq        XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm5
+
+    mov         rdxp, JSAMPROW [r12+6*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         rsip, JSAMPROW [r12+7*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
+    movq        XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm7
+
+    uncollect_args 4
+    pop_xmm     4
+    pop         rbp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/x86_64/jidctint-sse2.asm b/media/libjpeg/simd/x86_64/jidctint-sse2.asm
new file mode 100644
index 0000000000..7aa869bc0b
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jidctint-sse2.asm
@@ -0,0 +1,847 @@
+;
+; jidctint.asm - accurate integer IDCT (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, 2020, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a slower but more accurate integer implementation of the
+; inverse DCT (Discrete Cosine Transform). The following code is based
+; directly on the IJG's original jidctint.c; see the jidctint.c for
+; more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS  13
+%define PASS1_BITS  2
+
+%define DESCALE_P1  (CONST_BITS - PASS1_BITS)
+%define DESCALE_P2  (CONST_BITS + PASS1_BITS + 3)
+
+%if CONST_BITS == 13
+F_0_298 equ  2446  ; FIX(0.298631336)
+F_0_390 equ  3196  ; FIX(0.390180644)
+F_0_541 equ  4433  ; FIX(0.541196100)
+F_0_765 equ  6270  ; FIX(0.765366865)
+F_0_899 equ  7373  ; FIX(0.899976223)
+F_1_175 equ  9633  ; FIX(1.175875602)
+F_1_501 equ 12299  ; FIX(1.501321110)
+F_1_847 equ 15137  ; FIX(1.847759065)
+F_1_961 equ 16069  ; FIX(1.961570560)
+F_2_053 equ 16819  ; FIX(2.053119869)
+F_2_562 equ 20995  ; FIX(2.562915447)
+F_3_072 equ 25172  ; FIX(3.072711026)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x, n)  (((x) + (1 << ((n) - 1))) >> (n))
+F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS)  ; FIX(0.298631336)
+F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS)  ; FIX(0.390180644)
+F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS)  ; FIX(0.541196100)
+F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS)  ; FIX(0.765366865)
+F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS)  ; FIX(0.899976223)
+F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS)  ; FIX(1.175875602)
+F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS)  ; FIX(1.501321110)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS)  ; FIX(1.847759065)
+F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS)  ; FIX(1.961570560)
+F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS)  ; FIX(2.053119869)
+F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS)  ; FIX(2.562915447)
+F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS)  ; FIX(3.072711026)
+%endif
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_idct_islow_sse2)
+
+EXTN(jconst_idct_islow_sse2):
+
+PW_F130_F054   times 4  dw  (F_0_541 + F_0_765),  F_0_541
+PW_F054_MF130  times 4  dw  F_0_541, (F_0_541 - F_1_847)
+PW_MF078_F117  times 4  dw  (F_1_175 - F_1_961),  F_1_175
+PW_F117_F078   times 4  dw  F_1_175, (F_1_175 - F_0_390)
+PW_MF060_MF089 times 4  dw  (F_0_298 - F_0_899), -F_0_899
+PW_MF089_F060  times 4  dw -F_0_899, (F_1_501 - F_0_899)
+PW_MF050_MF256 times 4  dw  (F_2_053 - F_2_562), -F_2_562
+PW_MF256_F050  times 4  dw -F_2_562, (F_3_072 - F_2_562)
+PD_DESCALE_P1  times 4  dd  1 << (DESCALE_P1 - 1)
+PD_DESCALE_P2  times 4  dd  1 << (DESCALE_P2 - 1)
+PB_CENTERJSAMP times 16 db  CENTERJSAMPLE
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        64
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_islow_sse2(void *dct_table, JCOEFPTR coef_block,
+;                       JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+; r10 = jpeg_component_info *compptr
+; r11 = JCOEFPTR coef_block
+; r12 = JSAMPARRAY output_buf
+; r13d = JDIMENSION output_col
+
+%define original_rbp  rbp + 0
+%define wk(i)         rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+                                        ; xmmword wk[WK_NUM]
+%define WK_NUM        12
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_idct_islow_sse2)
+
+EXTN(jsimd_idct_islow_sse2):
+    push        rbp
+    mov         rax, rsp                     ; rax = original rbp
+    sub         rsp, byte 4
+    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [rsp], rax
+    mov         rbp, rsp                     ; rbp = aligned rbp
+    lea         rsp, [wk(0)]
+    collect_args 4
+
+    ; ---- Pass 1: process columns from input.
+
+    mov         rdx, r10                ; quantptr
+    mov         rsi, r11                ; inptr
+
+%ifndef NO_ZERO_COLUMN_TEST_ISLOW_SSE2
+    mov         eax, dword [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+    or          eax, dword [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+    jnz         near .columnDCT
+
+    movdqa      xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+    por         xmm0, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+    por         xmm1, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
+    por         xmm0, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+    por         xmm1, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
+    por         xmm0, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+    por         xmm1, xmm0
+    packsswb    xmm1, xmm1
+    packsswb    xmm1, xmm1
+    movd        eax, xmm1
+    test        rax, rax
+    jnz         short .columnDCT
+
+    ; -- AC terms all zero
+
+    movdqa      xmm5, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+    pmullw      xmm5, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    psllw       xmm5, PASS1_BITS
+
+    movdqa      xmm4, xmm5              ; xmm5=in0=(00 01 02 03 04 05 06 07)
+    punpcklwd   xmm5, xmm5              ; xmm5=(00 00 01 01 02 02 03 03)
+    punpckhwd   xmm4, xmm4              ; xmm4=(04 04 05 05 06 06 07 07)
+
+    pshufd      xmm7, xmm5, 0x00        ; xmm7=col0=(00 00 00 00 00 00 00 00)
+    pshufd      xmm6, xmm5, 0x55        ; xmm6=col1=(01 01 01 01 01 01 01 01)
+    pshufd      xmm1, xmm5, 0xAA        ; xmm1=col2=(02 02 02 02 02 02 02 02)
+    pshufd      xmm5, xmm5, 0xFF        ; xmm5=col3=(03 03 03 03 03 03 03 03)
+    pshufd      xmm0, xmm4, 0x00        ; xmm0=col4=(04 04 04 04 04 04 04 04)
+    pshufd      xmm3, xmm4, 0x55        ; xmm3=col5=(05 05 05 05 05 05 05 05)
+    pshufd      xmm2, xmm4, 0xAA        ; xmm2=col6=(06 06 06 06 06 06 06 06)
+    pshufd      xmm4, xmm4, 0xFF        ; xmm4=col7=(07 07 07 07 07 07 07 07)
+
+    movdqa      XMMWORD [wk(8)], xmm6   ; wk(8)=col1
+    movdqa      XMMWORD [wk(9)], xmm5   ; wk(9)=col3
+    movdqa      XMMWORD [wk(10)], xmm3  ; wk(10)=col5
+    movdqa      XMMWORD [wk(11)], xmm4  ; wk(11)=col7
+    jmp         near .column_end
+%endif
+.columnDCT:
+
+    ; -- Even part
+
+    movdqa      xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+    pmullw      xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+    movdqa      xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
+    movdqa      xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
+    pmullw      xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    ; (Original)
+    ; z1 = (z2 + z3) * 0.541196100;
+    ; tmp2 = z1 + z3 * -1.847759065;
+    ; tmp3 = z1 + z2 * 0.765366865;
+    ;
+    ; (This implementation)
+    ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
+    ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
+
+    movdqa      xmm4, xmm1              ; xmm1=in2=z2
+    movdqa      xmm5, xmm1
+    punpcklwd   xmm4, xmm3              ; xmm3=in6=z3
+    punpckhwd   xmm5, xmm3
+    movdqa      xmm1, xmm4
+    movdqa      xmm3, xmm5
+    pmaddwd     xmm4, [rel PW_F130_F054]   ; xmm4=tmp3L
+    pmaddwd     xmm5, [rel PW_F130_F054]   ; xmm5=tmp3H
+    pmaddwd     xmm1, [rel PW_F054_MF130]  ; xmm1=tmp2L
+    pmaddwd     xmm3, [rel PW_F054_MF130]  ; xmm3=tmp2H
+
+    movdqa      xmm6, xmm0
+    paddw       xmm0, xmm2              ; xmm0=in0+in4
+    psubw       xmm6, xmm2              ; xmm6=in0-in4
+
+    pxor        xmm7, xmm7
+    pxor        xmm2, xmm2
+    punpcklwd   xmm7, xmm0              ; xmm7=tmp0L
+    punpckhwd   xmm2, xmm0              ; xmm2=tmp0H
+    psrad       xmm7, (16-CONST_BITS)   ; psrad xmm7,16 & pslld xmm7,CONST_BITS
+    psrad       xmm2, (16-CONST_BITS)   ; psrad xmm2,16 & pslld xmm2,CONST_BITS
+
+    movdqa      xmm0, xmm7
+    paddd       xmm7, xmm4              ; xmm7=tmp10L
+    psubd       xmm0, xmm4              ; xmm0=tmp13L
+    movdqa      xmm4, xmm2
+    paddd       xmm2, xmm5              ; xmm2=tmp10H
+    psubd       xmm4, xmm5              ; xmm4=tmp13H
+
+    movdqa      XMMWORD [wk(0)], xmm7   ; wk(0)=tmp10L
+    movdqa      XMMWORD [wk(1)], xmm2   ; wk(1)=tmp10H
+    movdqa      XMMWORD [wk(2)], xmm0   ; wk(2)=tmp13L
+    movdqa      XMMWORD [wk(3)], xmm4   ; wk(3)=tmp13H
+
+    pxor        xmm5, xmm5
+    pxor        xmm7, xmm7
+    punpcklwd   xmm5, xmm6              ; xmm5=tmp1L
+    punpckhwd   xmm7, xmm6              ; xmm7=tmp1H
+    psrad       xmm5, (16-CONST_BITS)   ; psrad xmm5,16 & pslld xmm5,CONST_BITS
+    psrad       xmm7, (16-CONST_BITS)   ; psrad xmm7,16 & pslld xmm7,CONST_BITS
+
+    movdqa      xmm2, xmm5
+    paddd       xmm5, xmm1              ; xmm5=tmp11L
+    psubd       xmm2, xmm1              ; xmm2=tmp12L
+    movdqa      xmm0, xmm7
+    paddd       xmm7, xmm3              ; xmm7=tmp11H
+    psubd       xmm0, xmm3              ; xmm0=tmp12H
+
+    movdqa      XMMWORD [wk(4)], xmm5   ; wk(4)=tmp11L
+    movdqa      XMMWORD [wk(5)], xmm7   ; wk(5)=tmp11H
+    movdqa      XMMWORD [wk(6)], xmm2   ; wk(6)=tmp12L
+    movdqa      XMMWORD [wk(7)], xmm0   ; wk(7)=tmp12H
+
+    ; -- Odd part
+
+    movdqa      xmm4, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+    movdqa      xmm6, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+    pmullw      xmm4, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      xmm6, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+    movdqa      xmm3, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+    pmullw      xmm1, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    movdqa      xmm5, xmm6
+    movdqa      xmm7, xmm4
+    paddw       xmm5, xmm3              ; xmm5=z3
+    paddw       xmm7, xmm1              ; xmm7=z4
+
+    ; (Original)
+    ; z5 = (z3 + z4) * 1.175875602;
+    ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+    ; z3 += z5;  z4 += z5;
+    ;
+    ; (This implementation)
+    ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+    ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+    movdqa      xmm2, xmm5
+    movdqa      xmm0, xmm5
+    punpcklwd   xmm2, xmm7
+    punpckhwd   xmm0, xmm7
+    movdqa      xmm5, xmm2
+    movdqa      xmm7, xmm0
+    pmaddwd     xmm2, [rel PW_MF078_F117]  ; xmm2=z3L
+    pmaddwd     xmm0, [rel PW_MF078_F117]  ; xmm0=z3H
+    pmaddwd     xmm5, [rel PW_F117_F078]   ; xmm5=z4L
+    pmaddwd     xmm7, [rel PW_F117_F078]   ; xmm7=z4H
+
+    movdqa      XMMWORD [wk(10)], xmm2  ; wk(10)=z3L
+    movdqa      XMMWORD [wk(11)], xmm0  ; wk(11)=z3H
+
+    ; (Original)
+    ; z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;
+    ; tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;
+    ; tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;
+    ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+    ; tmp0 += z1 + z3;  tmp1 += z2 + z4;
+    ; tmp2 += z2 + z3;  tmp3 += z1 + z4;
+    ;
+    ; (This implementation)
+    ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+    ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+    ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+    ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+    ; tmp0 += z3;  tmp1 += z4;
+    ; tmp2 += z3;  tmp3 += z4;
+
+    movdqa      xmm2, xmm3
+    movdqa      xmm0, xmm3
+    punpcklwd   xmm2, xmm4
+    punpckhwd   xmm0, xmm4
+    movdqa      xmm3, xmm2
+    movdqa      xmm4, xmm0
+    pmaddwd     xmm2, [rel PW_MF060_MF089]  ; xmm2=tmp0L
+    pmaddwd     xmm0, [rel PW_MF060_MF089]  ; xmm0=tmp0H
+    pmaddwd     xmm3, [rel PW_MF089_F060]   ; xmm3=tmp3L
+    pmaddwd     xmm4, [rel PW_MF089_F060]   ; xmm4=tmp3H
+
+    paddd       xmm2, XMMWORD [wk(10)]  ; xmm2=tmp0L
+    paddd       xmm0, XMMWORD [wk(11)]  ; xmm0=tmp0H
+    paddd       xmm3, xmm5              ; xmm3=tmp3L
+    paddd       xmm4, xmm7              ; xmm4=tmp3H
+
+    movdqa      XMMWORD [wk(8)], xmm2   ; wk(8)=tmp0L
+    movdqa      XMMWORD [wk(9)], xmm0   ; wk(9)=tmp0H
+
+    movdqa      xmm2, xmm1
+    movdqa      xmm0, xmm1
+    punpcklwd   xmm2, xmm6
+    punpckhwd   xmm0, xmm6
+    movdqa      xmm1, xmm2
+    movdqa      xmm6, xmm0
+    pmaddwd     xmm2, [rel PW_MF050_MF256]  ; xmm2=tmp1L
+    pmaddwd     xmm0, [rel PW_MF050_MF256]  ; xmm0=tmp1H
+    pmaddwd     xmm1, [rel PW_MF256_F050]   ; xmm1=tmp2L
+    pmaddwd     xmm6, [rel PW_MF256_F050]   ; xmm6=tmp2H
+
+    paddd       xmm2, xmm5              ; xmm2=tmp1L
+    paddd       xmm0, xmm7              ; xmm0=tmp1H
+    paddd       xmm1, XMMWORD [wk(10)]  ; xmm1=tmp2L
+    paddd       xmm6, XMMWORD [wk(11)]  ; xmm6=tmp2H
+
+    movdqa      XMMWORD [wk(10)], xmm2  ; wk(10)=tmp1L
+    movdqa      XMMWORD [wk(11)], xmm0  ; wk(11)=tmp1H
+
+    ; -- Final output stage
+
+    movdqa      xmm5, XMMWORD [wk(0)]   ; xmm5=tmp10L
+    movdqa      xmm7, XMMWORD [wk(1)]   ; xmm7=tmp10H
+
+    movdqa      xmm2, xmm5
+    movdqa      xmm0, xmm7
+    paddd       xmm5, xmm3              ; xmm5=data0L
+    paddd       xmm7, xmm4              ; xmm7=data0H
+    psubd       xmm2, xmm3              ; xmm2=data7L
+    psubd       xmm0, xmm4              ; xmm0=data7H
+
+    movdqa      xmm3, [rel PD_DESCALE_P1]  ; xmm3=[rel PD_DESCALE_P1]
+
+    paddd       xmm5, xmm3
+    paddd       xmm7, xmm3
+    psrad       xmm5, DESCALE_P1
+    psrad       xmm7, DESCALE_P1
+    paddd       xmm2, xmm3
+    paddd       xmm0, xmm3
+    psrad       xmm2, DESCALE_P1
+    psrad       xmm0, DESCALE_P1
+
+    packssdw    xmm5, xmm7              ; xmm5=data0=(00 01 02 03 04 05 06 07)
+    packssdw    xmm2, xmm0              ; xmm2=data7=(70 71 72 73 74 75 76 77)
+
+    movdqa      xmm4, XMMWORD [wk(4)]   ; xmm4=tmp11L
+    movdqa      xmm3, XMMWORD [wk(5)]   ; xmm3=tmp11H
+
+    movdqa      xmm7, xmm4
+    movdqa      xmm0, xmm3
+    paddd       xmm4, xmm1              ; xmm4=data1L
+    paddd       xmm3, xmm6              ; xmm3=data1H
+    psubd       xmm7, xmm1              ; xmm7=data6L
+    psubd       xmm0, xmm6              ; xmm0=data6H
+
+    movdqa      xmm1, [rel PD_DESCALE_P1]  ; xmm1=[rel PD_DESCALE_P1]
+
+    paddd       xmm4, xmm1
+    paddd       xmm3, xmm1
+    psrad       xmm4, DESCALE_P1
+    psrad       xmm3, DESCALE_P1
+    paddd       xmm7, xmm1
+    paddd       xmm0, xmm1
+    psrad       xmm7, DESCALE_P1
+    psrad       xmm0, DESCALE_P1
+
+    packssdw    xmm4, xmm3              ; xmm4=data1=(10 11 12 13 14 15 16 17)
+    packssdw    xmm7, xmm0              ; xmm7=data6=(60 61 62 63 64 65 66 67)
+
+    movdqa      xmm6, xmm5              ; transpose coefficients(phase 1)
+    punpcklwd   xmm5, xmm4              ; xmm5=(00 10 01 11 02 12 03 13)
+    punpckhwd   xmm6, xmm4              ; xmm6=(04 14 05 15 06 16 07 17)
+    movdqa      xmm1, xmm7              ; transpose coefficients(phase 1)
+    punpcklwd   xmm7, xmm2              ; xmm7=(60 70 61 71 62 72 63 73)
+    punpckhwd   xmm1, xmm2              ; xmm1=(64 74 65 75 66 76 67 77)
+
+    movdqa      xmm3, XMMWORD [wk(6)]   ; xmm3=tmp12L
+    movdqa      xmm0, XMMWORD [wk(7)]   ; xmm0=tmp12H
+    movdqa      xmm4, XMMWORD [wk(10)]  ; xmm4=tmp1L
+    movdqa      xmm2, XMMWORD [wk(11)]  ; xmm2=tmp1H
+
+    movdqa      XMMWORD [wk(0)], xmm5   ; wk(0)=(00 10 01 11 02 12 03 13)
+    movdqa      XMMWORD [wk(1)], xmm6   ; wk(1)=(04 14 05 15 06 16 07 17)
+    movdqa      XMMWORD [wk(4)], xmm7   ; wk(4)=(60 70 61 71 62 72 63 73)
+    movdqa      XMMWORD [wk(5)], xmm1   ; wk(5)=(64 74 65 75 66 76 67 77)
+
+    movdqa      xmm5, xmm3
+    movdqa      xmm6, xmm0
+    paddd       xmm3, xmm4              ; xmm3=data2L
+    paddd       xmm0, xmm2              ; xmm0=data2H
+    psubd       xmm5, xmm4              ; xmm5=data5L
+    psubd       xmm6, xmm2              ; xmm6=data5H
+
+    movdqa      xmm7, [rel PD_DESCALE_P1]  ; xmm7=[rel PD_DESCALE_P1]
+
+    paddd       xmm3, xmm7
+    paddd       xmm0, xmm7
+    psrad       xmm3, DESCALE_P1
+    psrad       xmm0, DESCALE_P1
+    paddd       xmm5, xmm7
+    paddd       xmm6, xmm7
+    psrad       xmm5, DESCALE_P1
+    psrad       xmm6, DESCALE_P1
+
+    packssdw    xmm3, xmm0              ; xmm3=data2=(20 21 22 23 24 25 26 27)
+    packssdw    xmm5, xmm6              ; xmm5=data5=(50 51 52 53 54 55 56 57)
+
+    movdqa      xmm1, XMMWORD [wk(2)]   ; xmm1=tmp13L
+    movdqa      xmm4, XMMWORD [wk(3)]   ; xmm4=tmp13H
+    movdqa      xmm2, XMMWORD [wk(8)]   ; xmm2=tmp0L
+    movdqa      xmm7, XMMWORD [wk(9)]   ; xmm7=tmp0H
+
+    movdqa      xmm0, xmm1
+    movdqa      xmm6, xmm4
+    paddd       xmm1, xmm2              ; xmm1=data3L
+    paddd       xmm4, xmm7              ; xmm4=data3H
+    psubd       xmm0, xmm2              ; xmm0=data4L
+    psubd       xmm6, xmm7              ; xmm6=data4H
+
+    movdqa      xmm2, [rel PD_DESCALE_P1]  ; xmm2=[rel PD_DESCALE_P1]
+
+    paddd       xmm1, xmm2
+    paddd       xmm4, xmm2
+    psrad       xmm1, DESCALE_P1
+    psrad       xmm4, DESCALE_P1
+    paddd       xmm0, xmm2
+    paddd       xmm6, xmm2
+    psrad       xmm0, DESCALE_P1
+    psrad       xmm6, DESCALE_P1
+
+    packssdw    xmm1, xmm4              ; xmm1=data3=(30 31 32 33 34 35 36 37)
+    packssdw    xmm0, xmm6              ; xmm0=data4=(40 41 42 43 44 45 46 47)
+
+    movdqa      xmm7, XMMWORD [wk(0)]   ; xmm7=(00 10 01 11 02 12 03 13)
+    movdqa      xmm2, XMMWORD [wk(1)]   ; xmm2=(04 14 05 15 06 16 07 17)
+
+    movdqa      xmm4, xmm3              ; transpose coefficients(phase 1)
+    punpcklwd   xmm3, xmm1              ; xmm3=(20 30 21 31 22 32 23 33)
+    punpckhwd   xmm4, xmm1              ; xmm4=(24 34 25 35 26 36 27 37)
+    movdqa      xmm6, xmm0              ; transpose coefficients(phase 1)
+    punpcklwd   xmm0, xmm5              ; xmm0=(40 50 41 51 42 52 43 53)
+    punpckhwd   xmm6, xmm5              ; xmm6=(44 54 45 55 46 56 47 57)
+
+    movdqa      xmm1, xmm7              ; transpose coefficients(phase 2)
+    punpckldq   xmm7, xmm3              ; xmm7=(00 10 20 30 01 11 21 31)
+    punpckhdq   xmm1, xmm3              ; xmm1=(02 12 22 32 03 13 23 33)
+    movdqa      xmm5, xmm2              ; transpose coefficients(phase 2)
+    punpckldq   xmm2, xmm4              ; xmm2=(04 14 24 34 05 15 25 35)
+    punpckhdq   xmm5, xmm4              ; xmm5=(06 16 26 36 07 17 27 37)
+
+    movdqa      xmm3, XMMWORD [wk(4)]   ; xmm3=(60 70 61 71 62 72 63 73)
+    movdqa      xmm4, XMMWORD [wk(5)]   ; xmm4=(64 74 65 75 66 76 67 77)
+
+    movdqa      XMMWORD [wk(6)], xmm2   ; wk(6)=(04 14 24 34 05 15 25 35)
+    movdqa      XMMWORD [wk(7)], xmm5   ; wk(7)=(06 16 26 36 07 17 27 37)
+
+    movdqa      xmm2, xmm0              ; transpose coefficients(phase 2)
+    punpckldq   xmm0, xmm3              ; xmm0=(40 50 60 70 41 51 61 71)
+    punpckhdq   xmm2, xmm3              ; xmm2=(42 52 62 72 43 53 63 73)
+    movdqa      xmm5, xmm6              ; transpose coefficients(phase 2)
+    punpckldq   xmm6, xmm4              ; xmm6=(44 54 64 74 45 55 65 75)
+    punpckhdq   xmm5, xmm4              ; xmm5=(46 56 66 76 47 57 67 77)
+
+    movdqa      xmm3, xmm7              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm7, xmm0              ; xmm7=col0=(00 10 20 30 40 50 60 70)
+    punpckhqdq  xmm3, xmm0              ; xmm3=col1=(01 11 21 31 41 51 61 71)
+    movdqa      xmm4, xmm1              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm1, xmm2              ; xmm1=col2=(02 12 22 32 42 52 62 72)
+    punpckhqdq  xmm4, xmm2              ; xmm4=col3=(03 13 23 33 43 53 63 73)
+
+    movdqa      xmm0, XMMWORD [wk(6)]   ; xmm0=(04 14 24 34 05 15 25 35)
+    movdqa      xmm2, XMMWORD [wk(7)]   ; xmm2=(06 16 26 36 07 17 27 37)
+
+    movdqa      XMMWORD [wk(8)], xmm3   ; wk(8)=col1
+    movdqa      XMMWORD [wk(9)], xmm4   ; wk(9)=col3
+
+    movdqa      xmm3, xmm0              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm0, xmm6              ; xmm0=col4=(04 14 24 34 44 54 64 74)
+    punpckhqdq  xmm3, xmm6              ; xmm3=col5=(05 15 25 35 45 55 65 75)
+    movdqa      xmm4, xmm2              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm2, xmm5              ; xmm2=col6=(06 16 26 36 46 56 66 76)
+    punpckhqdq  xmm4, xmm5              ; xmm4=col7=(07 17 27 37 47 57 67 77)
+
+    movdqa      XMMWORD [wk(10)], xmm3  ; wk(10)=col5
+    movdqa      XMMWORD [wk(11)], xmm4  ; wk(11)=col7
+.column_end:
+
+    ; -- Prefetch the next coefficient block
+
+    prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+    prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+    prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+    prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+    ; ---- Pass 2: process rows from work array, store into output array.
+
+    mov         rax, [original_rbp]
+    mov         rdi, r12                ; (JSAMPROW *)
+    mov         eax, r13d
+
+    ; -- Even part
+
+    ; xmm7=col0, xmm1=col2, xmm0=col4, xmm2=col6
+
+    ; (Original)
+    ; z1 = (z2 + z3) * 0.541196100;
+    ; tmp2 = z1 + z3 * -1.847759065;
+    ; tmp3 = z1 + z2 * 0.765366865;
+    ;
+    ; (This implementation)
+    ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
+    ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
+
+    movdqa      xmm6, xmm1              ; xmm1=in2=z2
+    movdqa      xmm5, xmm1
+    punpcklwd   xmm6, xmm2              ; xmm2=in6=z3
+    punpckhwd   xmm5, xmm2
+    movdqa      xmm1, xmm6
+    movdqa      xmm2, xmm5
+    pmaddwd     xmm6, [rel PW_F130_F054]   ; xmm6=tmp3L
+    pmaddwd     xmm5, [rel PW_F130_F054]   ; xmm5=tmp3H
+    pmaddwd     xmm1, [rel PW_F054_MF130]  ; xmm1=tmp2L
+    pmaddwd     xmm2, [rel PW_F054_MF130]  ; xmm2=tmp2H
+
+    movdqa      xmm3, xmm7
+    paddw       xmm7, xmm0              ; xmm7=in0+in4
+    psubw       xmm3, xmm0              ; xmm3=in0-in4
+
+    pxor        xmm4, xmm4
+    pxor        xmm0, xmm0
+    punpcklwd   xmm4, xmm7              ; xmm4=tmp0L
+    punpckhwd   xmm0, xmm7              ; xmm0=tmp0H
+    psrad       xmm4, (16-CONST_BITS)   ; psrad xmm4,16 & pslld xmm4,CONST_BITS
+    psrad       xmm0, (16-CONST_BITS)   ; psrad xmm0,16 & pslld xmm0,CONST_BITS
+
+    movdqa      xmm7, xmm4
+    paddd       xmm4, xmm6              ; xmm4=tmp10L
+    psubd       xmm7, xmm6              ; xmm7=tmp13L
+    movdqa      xmm6, xmm0
+    paddd       xmm0, xmm5              ; xmm0=tmp10H
+    psubd       xmm6, xmm5              ; xmm6=tmp13H
+
+    movdqa      XMMWORD [wk(0)], xmm4   ; wk(0)=tmp10L
+    movdqa      XMMWORD [wk(1)], xmm0   ; wk(1)=tmp10H
+    movdqa      XMMWORD [wk(2)], xmm7   ; wk(2)=tmp13L
+    movdqa      XMMWORD [wk(3)], xmm6   ; wk(3)=tmp13H
+
+    pxor        xmm5, xmm5
+    pxor        xmm4, xmm4
+    punpcklwd   xmm5, xmm3              ; xmm5=tmp1L
+    punpckhwd   xmm4, xmm3              ; xmm4=tmp1H
+    psrad       xmm5, (16-CONST_BITS)   ; psrad xmm5,16 & pslld xmm5,CONST_BITS
+    psrad       xmm4, (16-CONST_BITS)   ; psrad xmm4,16 & pslld xmm4,CONST_BITS
+
+    movdqa      xmm0, xmm5
+    paddd       xmm5, xmm1              ; xmm5=tmp11L
+    psubd       xmm0, xmm1              ; xmm0=tmp12L
+    movdqa      xmm7, xmm4
+    paddd       xmm4, xmm2              ; xmm4=tmp11H
+    psubd       xmm7, xmm2              ; xmm7=tmp12H
+
+    movdqa      XMMWORD [wk(4)], xmm5   ; wk(4)=tmp11L
+    movdqa      XMMWORD [wk(5)], xmm4   ; wk(5)=tmp11H
+    movdqa      XMMWORD [wk(6)], xmm0   ; wk(6)=tmp12L
+    movdqa      XMMWORD [wk(7)], xmm7   ; wk(7)=tmp12H
+
+    ; -- Odd part
+
+    movdqa      xmm6, XMMWORD [wk(9)]   ; xmm6=col3
+    movdqa      xmm3, XMMWORD [wk(8)]   ; xmm3=col1
+    movdqa      xmm1, XMMWORD [wk(11)]  ; xmm1=col7
+    movdqa      xmm2, XMMWORD [wk(10)]  ; xmm2=col5
+
+    movdqa      xmm5, xmm6
+    movdqa      xmm4, xmm3
+    paddw       xmm5, xmm1              ; xmm5=z3
+    paddw       xmm4, xmm2              ; xmm4=z4
+
+    ; (Original)
+    ; z5 = (z3 + z4) * 1.175875602;
+    ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+    ; z3 += z5;  z4 += z5;
+    ;
+    ; (This implementation)
+    ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+    ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+    movdqa      xmm0, xmm5
+    movdqa      xmm7, xmm5
+    punpcklwd   xmm0, xmm4
+    punpckhwd   xmm7, xmm4
+    movdqa      xmm5, xmm0
+    movdqa      xmm4, xmm7
+    pmaddwd     xmm0, [rel PW_MF078_F117]  ; xmm0=z3L
+    pmaddwd     xmm7, [rel PW_MF078_F117]  ; xmm7=z3H
+    pmaddwd     xmm5, [rel PW_F117_F078]   ; xmm5=z4L
+    pmaddwd     xmm4, [rel PW_F117_F078]   ; xmm4=z4H
+
+    movdqa      XMMWORD [wk(10)], xmm0  ; wk(10)=z3L
+    movdqa      XMMWORD [wk(11)], xmm7  ; wk(11)=z3H
+
+    ; (Original)
+    ; z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;
+    ; tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;
+    ; tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;
+    ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+    ; tmp0 += z1 + z3;  tmp1 += z2 + z4;
+    ; tmp2 += z2 + z3;  tmp3 += z1 + z4;
+    ;
+    ; (This implementation)
+    ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+    ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+    ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+    ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+    ; tmp0 += z3;  tmp1 += z4;
+    ; tmp2 += z3;  tmp3 += z4;
+
+    movdqa      xmm0, xmm1
+    movdqa      xmm7, xmm1
+    punpcklwd   xmm0, xmm3
+    punpckhwd   xmm7, xmm3
+    movdqa      xmm1, xmm0
+    movdqa      xmm3, xmm7
+    pmaddwd     xmm0, [rel PW_MF060_MF089]  ; xmm0=tmp0L
+    pmaddwd     xmm7, [rel PW_MF060_MF089]  ; xmm7=tmp0H
+    pmaddwd     xmm1, [rel PW_MF089_F060]   ; xmm1=tmp3L
+    pmaddwd     xmm3, [rel PW_MF089_F060]   ; xmm3=tmp3H
+
+    paddd       xmm0, XMMWORD [wk(10)]  ; xmm0=tmp0L
+    paddd       xmm7, XMMWORD [wk(11)]  ; xmm7=tmp0H
+    paddd       xmm1, xmm5              ; xmm1=tmp3L
+    paddd       xmm3, xmm4              ; xmm3=tmp3H
+
+    movdqa      XMMWORD [wk(8)], xmm0   ; wk(8)=tmp0L
+    movdqa      XMMWORD [wk(9)], xmm7   ; wk(9)=tmp0H
+
+    movdqa      xmm0, xmm2
+    movdqa      xmm7, xmm2
+    punpcklwd   xmm0, xmm6
+    punpckhwd   xmm7, xmm6
+    movdqa      xmm2, xmm0
+    movdqa      xmm6, xmm7
+    pmaddwd     xmm0, [rel PW_MF050_MF256]  ; xmm0=tmp1L
+    pmaddwd     xmm7, [rel PW_MF050_MF256]  ; xmm7=tmp1H
+    pmaddwd     xmm2, [rel PW_MF256_F050]   ; xmm2=tmp2L
+    pmaddwd     xmm6, [rel PW_MF256_F050]   ; xmm6=tmp2H
+
+    paddd       xmm0, xmm5              ; xmm0=tmp1L
+    paddd       xmm7, xmm4              ; xmm7=tmp1H
+    paddd       xmm2, XMMWORD [wk(10)]  ; xmm2=tmp2L
+    paddd       xmm6, XMMWORD [wk(11)]  ; xmm6=tmp2H
+
+    movdqa      XMMWORD [wk(10)], xmm0  ; wk(10)=tmp1L
+    movdqa      XMMWORD [wk(11)], xmm7  ; wk(11)=tmp1H
+
+    ; -- Final output stage
+
+    movdqa      xmm5, XMMWORD [wk(0)]   ; xmm5=tmp10L
+    movdqa      xmm4, XMMWORD [wk(1)]   ; xmm4=tmp10H
+
+    movdqa      xmm0, xmm5
+    movdqa      xmm7, xmm4
+    paddd       xmm5, xmm1              ; xmm5=data0L
+    paddd       xmm4, xmm3              ; xmm4=data0H
+    psubd       xmm0, xmm1              ; xmm0=data7L
+    psubd       xmm7, xmm3              ; xmm7=data7H
+
+    movdqa      xmm1, [rel PD_DESCALE_P2]  ; xmm1=[rel PD_DESCALE_P2]
+
+    paddd       xmm5, xmm1
+    paddd       xmm4, xmm1
+    psrad       xmm5, DESCALE_P2
+    psrad       xmm4, DESCALE_P2
+    paddd       xmm0, xmm1
+    paddd       xmm7, xmm1
+    psrad       xmm0, DESCALE_P2
+    psrad       xmm7, DESCALE_P2
+
+    packssdw    xmm5, xmm4              ; xmm5=data0=(00 10 20 30 40 50 60 70)
+    packssdw    xmm0, xmm7              ; xmm0=data7=(07 17 27 37 47 57 67 77)
+
+    movdqa      xmm3, XMMWORD [wk(4)]   ; xmm3=tmp11L
+    movdqa      xmm1, XMMWORD [wk(5)]   ; xmm1=tmp11H
+
+    movdqa      xmm4, xmm3
+    movdqa      xmm7, xmm1
+    paddd       xmm3, xmm2              ; xmm3=data1L
+    paddd       xmm1, xmm6              ; xmm1=data1H
+    psubd       xmm4, xmm2              ; xmm4=data6L
+    psubd       xmm7, xmm6              ; xmm7=data6H
+
+    movdqa      xmm2, [rel PD_DESCALE_P2]  ; xmm2=[rel PD_DESCALE_P2]
+
+    paddd       xmm3, xmm2
+    paddd       xmm1, xmm2
+    psrad       xmm3, DESCALE_P2
+    psrad       xmm1, DESCALE_P2
+    paddd       xmm4, xmm2
+    paddd       xmm7, xmm2
+    psrad       xmm4, DESCALE_P2
+    psrad       xmm7, DESCALE_P2
+
+    packssdw    xmm3, xmm1              ; xmm3=data1=(01 11 21 31 41 51 61 71)
+    packssdw    xmm4, xmm7              ; xmm4=data6=(06 16 26 36 46 56 66 76)
+
+    packsswb    xmm5, xmm4              ; xmm5=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
+    packsswb    xmm3, xmm0              ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
+
+    movdqa      xmm6, XMMWORD [wk(6)]   ; xmm6=tmp12L
+    movdqa      xmm2, XMMWORD [wk(7)]   ; xmm2=tmp12H
+    movdqa      xmm1, XMMWORD [wk(10)]  ; xmm1=tmp1L
+    movdqa      xmm7, XMMWORD [wk(11)]  ; xmm7=tmp1H
+
+    movdqa      XMMWORD [wk(0)], xmm5   ; wk(0)=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
+    movdqa      XMMWORD [wk(1)], xmm3   ; wk(1)=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
+
+    movdqa      xmm4, xmm6
+    movdqa      xmm0, xmm2
+    paddd       xmm6, xmm1              ; xmm6=data2L
+    paddd       xmm2, xmm7              ; xmm2=data2H
+    psubd       xmm4, xmm1              ; xmm4=data5L
+    psubd       xmm0, xmm7              ; xmm0=data5H
+
+    movdqa      xmm5, [rel PD_DESCALE_P2]  ; xmm5=[rel PD_DESCALE_P2]
+
+    paddd       xmm6, xmm5
+    paddd       xmm2, xmm5
+    psrad       xmm6, DESCALE_P2
+    psrad       xmm2, DESCALE_P2
+    paddd       xmm4, xmm5
+    paddd       xmm0, xmm5
+    psrad       xmm4, DESCALE_P2
+    psrad       xmm0, DESCALE_P2
+
+    packssdw    xmm6, xmm2              ; xmm6=data2=(02 12 22 32 42 52 62 72)
+    packssdw    xmm4, xmm0              ; xmm4=data5=(05 15 25 35 45 55 65 75)
+
+    movdqa      xmm3, XMMWORD [wk(2)]   ; xmm3=tmp13L
+    movdqa      xmm1, XMMWORD [wk(3)]   ; xmm1=tmp13H
+    movdqa      xmm7, XMMWORD [wk(8)]   ; xmm7=tmp0L
+    movdqa      xmm5, XMMWORD [wk(9)]   ; xmm5=tmp0H
+
+    movdqa      xmm2, xmm3
+    movdqa      xmm0, xmm1
+    paddd       xmm3, xmm7              ; xmm3=data3L
+    paddd       xmm1, xmm5              ; xmm1=data3H
+    psubd       xmm2, xmm7              ; xmm2=data4L
+    psubd       xmm0, xmm5              ; xmm0=data4H
+
+    movdqa      xmm7, [rel PD_DESCALE_P2]  ; xmm7=[rel PD_DESCALE_P2]
+
+    paddd       xmm3, xmm7
+    paddd       xmm1, xmm7
+    psrad       xmm3, DESCALE_P2
+    psrad       xmm1, DESCALE_P2
+    paddd       xmm2, xmm7
+    paddd       xmm0, xmm7
+    psrad       xmm2, DESCALE_P2
+    psrad       xmm0, DESCALE_P2
+
+    movdqa      xmm5, [rel PB_CENTERJSAMP]  ; xmm5=[rel PB_CENTERJSAMP]
+
+    packssdw    xmm3, xmm1             ; xmm3=data3=(03 13 23 33 43 53 63 73)
+    packssdw    xmm2, xmm0             ; xmm2=data4=(04 14 24 34 44 54 64 74)
+
+    movdqa      xmm7, XMMWORD [wk(0)]  ; xmm7=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
+    movdqa      xmm1, XMMWORD [wk(1)]  ; xmm1=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
+
+    packsswb    xmm6, xmm2             ; xmm6=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
+    packsswb    xmm3, xmm4             ; xmm3=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)
+
+    paddb       xmm7, xmm5
+    paddb       xmm1, xmm5
+    paddb       xmm6, xmm5
+    paddb       xmm3, xmm5
+
+    movdqa      xmm0, xmm7        ; transpose coefficients(phase 1)
+    punpcklbw   xmm7, xmm1        ; xmm7=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
+    punpckhbw   xmm0, xmm1        ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
+    movdqa      xmm2, xmm6        ; transpose coefficients(phase 1)
+    punpcklbw   xmm6, xmm3        ; xmm6=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
+    punpckhbw   xmm2, xmm3        ; xmm2=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)
+
+    movdqa      xmm4, xmm7        ; transpose coefficients(phase 2)
+    punpcklwd   xmm7, xmm6        ; xmm7=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
+    punpckhwd   xmm4, xmm6        ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
+    movdqa      xmm5, xmm2        ; transpose coefficients(phase 2)
+    punpcklwd   xmm2, xmm0        ; xmm2=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
+    punpckhwd   xmm5, xmm0        ; xmm5=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)
+
+    movdqa      xmm1, xmm7        ; transpose coefficients(phase 3)
+    punpckldq   xmm7, xmm2        ; xmm7=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
+    punpckhdq   xmm1, xmm2        ; xmm1=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
+    movdqa      xmm3, xmm4        ; transpose coefficients(phase 3)
+    punpckldq   xmm4, xmm5        ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
+    punpckhdq   xmm3, xmm5        ; xmm3=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
+
+    pshufd      xmm6, xmm7, 0x4E  ; xmm6=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
+    pshufd      xmm0, xmm1, 0x4E  ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
+    pshufd      xmm2, xmm4, 0x4E  ; xmm2=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
+    pshufd      xmm5, xmm3, 0x4E  ; xmm5=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
+
+    mov         rdxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
+    mov         rsip, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
+    movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm7
+    movq        XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm1
+    mov         rdxp, JSAMPROW [rdi+4*SIZEOF_JSAMPROW]
+    mov         rsip, JSAMPROW [rdi+6*SIZEOF_JSAMPROW]
+    movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4
+    movq        XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3
+
+    mov         rdxp, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
+    mov         rsip, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
+    movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
+    movq        XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm0
+    mov         rdxp, JSAMPROW [rdi+5*SIZEOF_JSAMPROW]
+    mov         rsip, JSAMPROW [rdi+7*SIZEOF_JSAMPROW]
+    movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm2
+    movq        XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm5
+
+    uncollect_args 4
+    mov         rsp, rbp                ; rsp <- aligned rbp
+    pop         rsp                     ; rsp <- original rbp
+    pop         rbp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/x86_64/jidctred-sse2.asm b/media/libjpeg/simd/x86_64/jidctred-sse2.asm
new file mode 100644
index 0000000000..4ece9d891c
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jidctred-sse2.asm
@@ -0,0 +1,574 @@
+;
+; jidctred.asm - reduced-size IDCT (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains inverse-DCT routines that produce reduced-size
+; output: either 4x4 or 2x2 pixels from an 8x8 DCT block.
+; The following code is based directly on the IJG's original jidctred.c;
+; see the jidctred.c for more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS    13
+%define PASS1_BITS    2
+
+%define DESCALE_P1_4  (CONST_BITS - PASS1_BITS + 1)
+%define DESCALE_P2_4  (CONST_BITS + PASS1_BITS + 3 + 1)
+%define DESCALE_P1_2  (CONST_BITS - PASS1_BITS + 2)
+%define DESCALE_P2_2  (CONST_BITS + PASS1_BITS + 3 + 2)
+
+%if CONST_BITS == 13
+F_0_211 equ  1730  ; FIX(0.211164243)
+F_0_509 equ  4176  ; FIX(0.509795579)
+F_0_601 equ  4926  ; FIX(0.601344887)
+F_0_720 equ  5906  ; FIX(0.720959822)
+F_0_765 equ  6270  ; FIX(0.765366865)
+F_0_850 equ  6967  ; FIX(0.850430095)
+F_0_899 equ  7373  ; FIX(0.899976223)
+F_1_061 equ  8697  ; FIX(1.061594337)
+F_1_272 equ 10426  ; FIX(1.272758580)
+F_1_451 equ 11893  ; FIX(1.451774981)
+F_1_847 equ 15137  ; FIX(1.847759065)
+F_2_172 equ 17799  ; FIX(2.172734803)
+F_2_562 equ 20995  ; FIX(2.562915447)
+F_3_624 equ 29692  ; FIX(3.624509785)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x, n)  (((x) + (1 << ((n) - 1))) >> (n))
+F_0_211 equ DESCALE( 226735879, 30 - CONST_BITS)  ; FIX(0.211164243)
+F_0_509 equ DESCALE( 547388834, 30 - CONST_BITS)  ; FIX(0.509795579)
+F_0_601 equ DESCALE( 645689155, 30 - CONST_BITS)  ; FIX(0.601344887)
+F_0_720 equ DESCALE( 774124714, 30 - CONST_BITS)  ; FIX(0.720959822)
+F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS)  ; FIX(0.765366865)
+F_0_850 equ DESCALE( 913142361, 30 - CONST_BITS)  ; FIX(0.850430095)
+F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS)  ; FIX(0.899976223)
+F_1_061 equ DESCALE(1139878239, 30 - CONST_BITS)  ; FIX(1.061594337)
+F_1_272 equ DESCALE(1366614119, 30 - CONST_BITS)  ; FIX(1.272758580)
+F_1_451 equ DESCALE(1558831516, 30 - CONST_BITS)  ; FIX(1.451774981)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS)  ; FIX(1.847759065)
+F_2_172 equ DESCALE(2332956230, 30 - CONST_BITS)  ; FIX(2.172734803)
+F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS)  ; FIX(2.562915447)
+F_3_624 equ DESCALE(3891787747, 30 - CONST_BITS)  ; FIX(3.624509785)
+%endif
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_idct_red_sse2)
+
+EXTN(jconst_idct_red_sse2):
+
+PW_F184_MF076   times 4  dw  F_1_847, -F_0_765
+PW_F256_F089    times 4  dw  F_2_562,  F_0_899
+PW_F106_MF217   times 4  dw  F_1_061, -F_2_172
+PW_MF060_MF050  times 4  dw -F_0_601, -F_0_509
+PW_F145_MF021   times 4  dw  F_1_451, -F_0_211
+PW_F362_MF127   times 4  dw  F_3_624, -F_1_272
+PW_F085_MF072   times 4  dw  F_0_850, -F_0_720
+PD_DESCALE_P1_4 times 4  dd  1 << (DESCALE_P1_4 - 1)
+PD_DESCALE_P2_4 times 4  dd  1 << (DESCALE_P2_4 - 1)
+PD_DESCALE_P1_2 times 4  dd  1 << (DESCALE_P1_2 - 1)
+PD_DESCALE_P2_2 times 4  dd  1 << (DESCALE_P2_2 - 1)
+PB_CENTERJSAMP  times 16 db  CENTERJSAMPLE
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        64
+;
+; Perform dequantization and inverse DCT on one block of coefficients,
+; producing a reduced-size 4x4 output block.
+;
+; GLOBAL(void)
+; jsimd_idct_4x4_sse2(void *dct_table, JCOEFPTR coef_block,
+;                     JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+; r10 = void *dct_table
+; r11 = JCOEFPTR coef_block
+; r12 = JSAMPARRAY output_buf
+; r13d = JDIMENSION output_col
+
+%define original_rbp  rbp + 0
+%define wk(i)         rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+                                        ; xmmword wk[WK_NUM]
+%define WK_NUM        2
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_idct_4x4_sse2)
+
+EXTN(jsimd_idct_4x4_sse2):
+    push        rbp
+    mov         rax, rsp                     ; rax = original rbp
+    sub         rsp, byte 4
+    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [rsp], rax
+    mov         rbp, rsp                     ; rbp = aligned rbp
+    lea         rsp, [wk(0)]
+    collect_args 4
+
+    ; ---- Pass 1: process columns from input.
+
+    mov         rdx, r10                ; quantptr
+    mov         rsi, r11                ; inptr
+
+%ifndef NO_ZERO_COLUMN_TEST_4X4_SSE2
+    mov         eax, dword [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+    or          eax, dword [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+    jnz         short .columnDCT
+
+    movdqa      xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+    por         xmm0, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+    por         xmm1, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+    por         xmm0, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
+    por         xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+    por         xmm0, xmm1
+    packsswb    xmm0, xmm0
+    packsswb    xmm0, xmm0
+    movd        eax, xmm0
+    test        rax, rax
+    jnz         short .columnDCT
+
+    ; -- AC terms all zero
+
+    movdqa      xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+    pmullw      xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    psllw       xmm0, PASS1_BITS
+
+    movdqa      xmm3, xmm0        ; xmm0=in0=(00 01 02 03 04 05 06 07)
+    punpcklwd   xmm0, xmm0        ; xmm0=(00 00 01 01 02 02 03 03)
+    punpckhwd   xmm3, xmm3        ; xmm3=(04 04 05 05 06 06 07 07)
+
+    pshufd      xmm1, xmm0, 0x50  ; xmm1=[col0 col1]=(00 00 00 00 01 01 01 01)
+    pshufd      xmm0, xmm0, 0xFA  ; xmm0=[col2 col3]=(02 02 02 02 03 03 03 03)
+    pshufd      xmm6, xmm3, 0x50  ; xmm6=[col4 col5]=(04 04 04 04 05 05 05 05)
+    pshufd      xmm3, xmm3, 0xFA  ; xmm3=[col6 col7]=(06 06 06 06 07 07 07 07)
+
+    jmp         near .column_end
+%endif
+.columnDCT:
+
+    ; -- Odd part
+
+    movdqa      xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+    pmullw      xmm0, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+    movdqa      xmm2, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+    movdqa      xmm3, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+    pmullw      xmm2, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    movdqa      xmm4, xmm0
+    movdqa      xmm5, xmm0
+    punpcklwd   xmm4, xmm1
+    punpckhwd   xmm5, xmm1
+    movdqa      xmm0, xmm4
+    movdqa      xmm1, xmm5
+    pmaddwd     xmm4, [rel PW_F256_F089]   ; xmm4=(tmp2L)
+    pmaddwd     xmm5, [rel PW_F256_F089]   ; xmm5=(tmp2H)
+    pmaddwd     xmm0, [rel PW_F106_MF217]  ; xmm0=(tmp0L)
+    pmaddwd     xmm1, [rel PW_F106_MF217]  ; xmm1=(tmp0H)
+
+    movdqa      xmm6, xmm2
+    movdqa      xmm7, xmm2
+    punpcklwd   xmm6, xmm3
+    punpckhwd   xmm7, xmm3
+    movdqa      xmm2, xmm6
+    movdqa      xmm3, xmm7
+    pmaddwd     xmm6, [rel PW_MF060_MF050]  ; xmm6=(tmp2L)
+    pmaddwd     xmm7, [rel PW_MF060_MF050]  ; xmm7=(tmp2H)
+    pmaddwd     xmm2, [rel PW_F145_MF021]   ; xmm2=(tmp0L)
+    pmaddwd     xmm3, [rel PW_F145_MF021]   ; xmm3=(tmp0H)
+
+    paddd       xmm6, xmm4              ; xmm6=tmp2L
+    paddd       xmm7, xmm5              ; xmm7=tmp2H
+    paddd       xmm2, xmm0              ; xmm2=tmp0L
+    paddd       xmm3, xmm1              ; xmm3=tmp0H
+
+    movdqa      XMMWORD [wk(0)], xmm2   ; wk(0)=tmp0L
+    movdqa      XMMWORD [wk(1)], xmm3   ; wk(1)=tmp0H
+
+    ; -- Even part
+
+    movdqa      xmm4, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+    movdqa      xmm5, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+    movdqa      xmm0, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
+    pmullw      xmm4, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      xmm5, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      xmm0, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    pxor        xmm1, xmm1
+    pxor        xmm2, xmm2
+    punpcklwd   xmm1, xmm4               ; xmm1=tmp0L
+    punpckhwd   xmm2, xmm4               ; xmm2=tmp0H
+    psrad       xmm1, (16-CONST_BITS-1)  ; psrad xmm1,16 & pslld xmm1,CONST_BITS+1
+    psrad       xmm2, (16-CONST_BITS-1)  ; psrad xmm2,16 & pslld xmm2,CONST_BITS+1
+
+    movdqa      xmm3, xmm5              ; xmm5=in2=z2
+    punpcklwd   xmm5, xmm0              ; xmm0=in6=z3
+    punpckhwd   xmm3, xmm0
+    pmaddwd     xmm5, [rel PW_F184_MF076]  ; xmm5=tmp2L
+    pmaddwd     xmm3, [rel PW_F184_MF076]  ; xmm3=tmp2H
+
+    movdqa      xmm4, xmm1
+    movdqa      xmm0, xmm2
+    paddd       xmm1, xmm5              ; xmm1=tmp10L
+    paddd       xmm2, xmm3              ; xmm2=tmp10H
+    psubd       xmm4, xmm5              ; xmm4=tmp12L
+    psubd       xmm0, xmm3              ; xmm0=tmp12H
+
+    ; -- Final output stage
+
+    movdqa      xmm5, xmm1
+    movdqa      xmm3, xmm2
+    paddd       xmm1, xmm6              ; xmm1=data0L
+    paddd       xmm2, xmm7              ; xmm2=data0H
+    psubd       xmm5, xmm6              ; xmm5=data3L
+    psubd       xmm3, xmm7              ; xmm3=data3H
+
+    movdqa      xmm6, [rel PD_DESCALE_P1_4]  ; xmm6=[rel PD_DESCALE_P1_4]
+
+    paddd       xmm1, xmm6
+    paddd       xmm2, xmm6
+    psrad       xmm1, DESCALE_P1_4
+    psrad       xmm2, DESCALE_P1_4
+    paddd       xmm5, xmm6
+    paddd       xmm3, xmm6
+    psrad       xmm5, DESCALE_P1_4
+    psrad       xmm3, DESCALE_P1_4
+
+    packssdw    xmm1, xmm2              ; xmm1=data0=(00 01 02 03 04 05 06 07)
+    packssdw    xmm5, xmm3              ; xmm5=data3=(30 31 32 33 34 35 36 37)
+
+    movdqa      xmm7, XMMWORD [wk(0)]   ; xmm7=tmp0L
+    movdqa      xmm6, XMMWORD [wk(1)]   ; xmm6=tmp0H
+
+    movdqa      xmm2, xmm4
+    movdqa      xmm3, xmm0
+    paddd       xmm4, xmm7              ; xmm4=data1L
+    paddd       xmm0, xmm6              ; xmm0=data1H
+    psubd       xmm2, xmm7              ; xmm2=data2L
+    psubd       xmm3, xmm6              ; xmm3=data2H
+
+    movdqa      xmm7, [rel PD_DESCALE_P1_4]  ; xmm7=[rel PD_DESCALE_P1_4]
+
+    paddd       xmm4, xmm7
+    paddd       xmm0, xmm7
+    psrad       xmm4, DESCALE_P1_4
+    psrad       xmm0, DESCALE_P1_4
+    paddd       xmm2, xmm7
+    paddd       xmm3, xmm7
+    psrad       xmm2, DESCALE_P1_4
+    psrad       xmm3, DESCALE_P1_4
+
+    packssdw    xmm4, xmm0        ; xmm4=data1=(10 11 12 13 14 15 16 17)
+    packssdw    xmm2, xmm3        ; xmm2=data2=(20 21 22 23 24 25 26 27)
+
+    movdqa      xmm6, xmm1        ; transpose coefficients(phase 1)
+    punpcklwd   xmm1, xmm4        ; xmm1=(00 10 01 11 02 12 03 13)
+    punpckhwd   xmm6, xmm4        ; xmm6=(04 14 05 15 06 16 07 17)
+    movdqa      xmm7, xmm2        ; transpose coefficients(phase 1)
+    punpcklwd   xmm2, xmm5        ; xmm2=(20 30 21 31 22 32 23 33)
+    punpckhwd   xmm7, xmm5        ; xmm7=(24 34 25 35 26 36 27 37)
+
+    movdqa      xmm0, xmm1        ; transpose coefficients(phase 2)
+    punpckldq   xmm1, xmm2        ; xmm1=[col0 col1]=(00 10 20 30 01 11 21 31)
+    punpckhdq   xmm0, xmm2        ; xmm0=[col2 col3]=(02 12 22 32 03 13 23 33)
+    movdqa      xmm3, xmm6        ; transpose coefficients(phase 2)
+    punpckldq   xmm6, xmm7        ; xmm6=[col4 col5]=(04 14 24 34 05 15 25 35)
+    punpckhdq   xmm3, xmm7        ; xmm3=[col6 col7]=(06 16 26 36 07 17 27 37)
+.column_end:
+
+    ; -- Prefetch the next coefficient block
+
+    prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+    prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+    prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+    prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+    ; ---- Pass 2: process rows, store into output array.
+
+    mov         rax, [original_rbp]
+    mov         rdi, r12                ; (JSAMPROW *)
+    mov         eax, r13d
+
+    ; -- Even part
+
+    pxor        xmm4, xmm4
+    punpcklwd   xmm4, xmm1               ; xmm4=tmp0
+    psrad       xmm4, (16-CONST_BITS-1)  ; psrad xmm4,16 & pslld xmm4,CONST_BITS+1
+
+    ; -- Odd part
+
+    punpckhwd   xmm1, xmm0
+    punpckhwd   xmm6, xmm3
+    movdqa      xmm5, xmm1
+    movdqa      xmm2, xmm6
+    pmaddwd     xmm1, [rel PW_F256_F089]    ; xmm1=(tmp2)
+    pmaddwd     xmm6, [rel PW_MF060_MF050]  ; xmm6=(tmp2)
+    pmaddwd     xmm5, [rel PW_F106_MF217]   ; xmm5=(tmp0)
+    pmaddwd     xmm2, [rel PW_F145_MF021]   ; xmm2=(tmp0)
+
+    paddd       xmm6, xmm1              ; xmm6=tmp2
+    paddd       xmm2, xmm5              ; xmm2=tmp0
+
+    ; -- Even part
+
+    punpcklwd   xmm0, xmm3
+    pmaddwd     xmm0, [rel PW_F184_MF076]  ; xmm0=tmp2
+
+    movdqa      xmm7, xmm4
+    paddd       xmm4, xmm0              ; xmm4=tmp10
+    psubd       xmm7, xmm0              ; xmm7=tmp12
+
+    ; -- Final output stage
+
+    movdqa      xmm1, [rel PD_DESCALE_P2_4]  ; xmm1=[rel PD_DESCALE_P2_4]
+
+    movdqa      xmm5, xmm4
+    movdqa      xmm3, xmm7
+    paddd       xmm4, xmm6              ; xmm4=data0=(00 10 20 30)
+    paddd       xmm7, xmm2              ; xmm7=data1=(01 11 21 31)
+    psubd       xmm5, xmm6              ; xmm5=data3=(03 13 23 33)
+    psubd       xmm3, xmm2              ; xmm3=data2=(02 12 22 32)
+
+    paddd       xmm4, xmm1
+    paddd       xmm7, xmm1
+    psrad       xmm4, DESCALE_P2_4
+    psrad       xmm7, DESCALE_P2_4
+    paddd       xmm5, xmm1
+    paddd       xmm3, xmm1
+    psrad       xmm5, DESCALE_P2_4
+    psrad       xmm3, DESCALE_P2_4
+
+    packssdw    xmm4, xmm3              ; xmm4=(00 10 20 30 02 12 22 32)
+    packssdw    xmm7, xmm5              ; xmm7=(01 11 21 31 03 13 23 33)
+
+    movdqa      xmm0, xmm4              ; transpose coefficients(phase 1)
+    punpcklwd   xmm4, xmm7              ; xmm4=(00 01 10 11 20 21 30 31)
+    punpckhwd   xmm0, xmm7              ; xmm0=(02 03 12 13 22 23 32 33)
+
+    movdqa      xmm6, xmm4              ; transpose coefficients(phase 2)
+    punpckldq   xmm4, xmm0              ; xmm4=(00 01 02 03 10 11 12 13)
+    punpckhdq   xmm6, xmm0              ; xmm6=(20 21 22 23 30 31 32 33)
+
+    packsswb    xmm4, xmm6              ; xmm4=(00 01 02 03 10 11 12 13 20 ..)
+    paddb       xmm4, [rel PB_CENTERJSAMP]
+
+    pshufd      xmm2, xmm4, 0x39        ; xmm2=(10 11 12 13 20 21 22 23 30 ..)
+    pshufd      xmm1, xmm4, 0x4E        ; xmm1=(20 21 22 23 30 31 32 33 00 ..)
+    pshufd      xmm3, xmm4, 0x93        ; xmm3=(30 31 32 33 00 01 02 03 10 ..)
+
+    mov         rdxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
+    mov         rsip, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
+    movd        XMM_DWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4
+    movd        XMM_DWORD [rsi+rax*SIZEOF_JSAMPLE], xmm2
+    mov         rdxp, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
+    mov         rsip, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
+    movd        XMM_DWORD [rdx+rax*SIZEOF_JSAMPLE], xmm1
+    movd        XMM_DWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3
+
+    uncollect_args 4
+    mov         rsp, rbp                ; rsp <- aligned rbp
+    pop         rsp                     ; rsp <- original rbp
+    pop         rbp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Perform dequantization and inverse DCT on one block of coefficients,
+; producing a reduced-size 2x2 output block.
+;
+; GLOBAL(void)
+; jsimd_idct_2x2_sse2(void *dct_table, JCOEFPTR coef_block,
+;                     JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+; r10 = void *dct_table
+; r11 = JCOEFPTR coef_block
+; r12 = JSAMPARRAY output_buf
+; r13d = JDIMENSION output_col
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_idct_2x2_sse2)
+
+EXTN(jsimd_idct_2x2_sse2):
+    push        rbp
+    mov         rax, rsp
+    mov         rbp, rsp
+    collect_args 4
+    push        rbx
+
+    ; ---- Pass 1: process columns from input.
+
+    mov         rdx, r10                ; quantptr
+    mov         rsi, r11                ; inptr
+
+    ; | input:                  | result:        |
+    ; | 00 01 ** 03 ** 05 ** 07 |                |
+    ; | 10 11 ** 13 ** 15 ** 17 |                |
+    ; | ** ** ** ** ** ** ** ** |                |
+    ; | 30 31 ** 33 ** 35 ** 37 | A0 A1 A3 A5 A7 |
+    ; | ** ** ** ** ** ** ** ** | B0 B1 B3 B5 B7 |
+    ; | 50 51 ** 53 ** 55 ** 57 |                |
+    ; | ** ** ** ** ** ** ** ** |                |
+    ; | 70 71 ** 73 ** 75 ** 77 |                |
+
+    ; -- Odd part
+
+    movdqa      xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+    pmullw      xmm0, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+    movdqa      xmm2, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+    movdqa      xmm3, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+    pmullw      xmm2, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    ; xmm0=(10 11 ** 13 ** 15 ** 17), xmm1=(30 31 ** 33 ** 35 ** 37)
+    ; xmm2=(50 51 ** 53 ** 55 ** 57), xmm3=(70 71 ** 73 ** 75 ** 77)
+
+    pcmpeqd     xmm7, xmm7
+    pslld       xmm7, WORD_BIT          ; xmm7={0x0000 0xFFFF 0x0000 0xFFFF ..}
+
+    movdqa      xmm4, xmm0              ; xmm4=(10 11 ** 13 ** 15 ** 17)
+    movdqa      xmm5, xmm2              ; xmm5=(50 51 ** 53 ** 55 ** 57)
+    punpcklwd   xmm4, xmm1              ; xmm4=(10 30 11 31 ** ** 13 33)
+    punpcklwd   xmm5, xmm3              ; xmm5=(50 70 51 71 ** ** 53 73)
+    pmaddwd     xmm4, [rel PW_F362_MF127]
+    pmaddwd     xmm5, [rel PW_F085_MF072]
+
+    psrld       xmm0, WORD_BIT          ; xmm0=(11 -- 13 -- 15 -- 17 --)
+    pand        xmm1, xmm7              ; xmm1=(-- 31 -- 33 -- 35 -- 37)
+    psrld       xmm2, WORD_BIT          ; xmm2=(51 -- 53 -- 55 -- 57 --)
+    pand        xmm3, xmm7              ; xmm3=(-- 71 -- 73 -- 75 -- 77)
+    por         xmm0, xmm1              ; xmm0=(11 31 13 33 15 35 17 37)
+    por         xmm2, xmm3              ; xmm2=(51 71 53 73 55 75 57 77)
+    pmaddwd     xmm0, [rel PW_F362_MF127]
+    pmaddwd     xmm2, [rel PW_F085_MF072]
+
+    paddd       xmm4, xmm5              ; xmm4=tmp0[col0 col1 **** col3]
+    paddd       xmm0, xmm2              ; xmm0=tmp0[col1 col3 col5 col7]
+
+    ; -- Even part
+
+    movdqa      xmm6, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+    pmullw      xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    ; xmm6=(00 01 ** 03 ** 05 ** 07)
+
+    movdqa      xmm1, xmm6              ; xmm1=(00 01 ** 03 ** 05 ** 07)
+    pslld       xmm6, WORD_BIT          ; xmm6=(-- 00 -- ** -- ** -- **)
+    pand        xmm1, xmm7              ; xmm1=(-- 01 -- 03 -- 05 -- 07)
+    psrad       xmm6, (WORD_BIT-CONST_BITS-2)  ; xmm6=tmp10[col0 **** **** ****]
+    psrad       xmm1, (WORD_BIT-CONST_BITS-2)  ; xmm1=tmp10[col1 col3 col5 col7]
+
+    ; -- Final output stage
+
+    movdqa      xmm3, xmm6
+    movdqa      xmm5, xmm1
+    paddd       xmm6, xmm4      ; xmm6=data0[col0 **** **** ****]=(A0 ** ** **)
+    paddd       xmm1, xmm0      ; xmm1=data0[col1 col3 col5 col7]=(A1 A3 A5 A7)
+    psubd       xmm3, xmm4      ; xmm3=data1[col0 **** **** ****]=(B0 ** ** **)
+    psubd       xmm5, xmm0      ; xmm5=data1[col1 col3 col5 col7]=(B1 B3 B5 B7)
+
+    movdqa      xmm2, [rel PD_DESCALE_P1_2]  ; xmm2=[rel PD_DESCALE_P1_2]
+
+    punpckldq   xmm6, xmm3              ; xmm6=(A0 B0 ** **)
+
+    movdqa      xmm7, xmm1
+    punpcklqdq  xmm1, xmm5              ; xmm1=(A1 A3 B1 B3)
+    punpckhqdq  xmm7, xmm5              ; xmm7=(A5 A7 B5 B7)
+
+    paddd       xmm6, xmm2
+    psrad       xmm6, DESCALE_P1_2
+
+    paddd       xmm1, xmm2
+    paddd       xmm7, xmm2
+    psrad       xmm1, DESCALE_P1_2
+    psrad       xmm7, DESCALE_P1_2
+
+    ; -- Prefetch the next coefficient block
+
+    prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+    prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+    prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+    prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+    ; ---- Pass 2: process rows, store into output array.
+
+    mov         rdi, r12                ; (JSAMPROW *)
+    mov         eax, r13d
+
+    ; | input:| result:|
+    ; | A0 B0 |        |
+    ; | A1 B1 | C0 C1  |
+    ; | A3 B3 | D0 D1  |
+    ; | A5 B5 |        |
+    ; | A7 B7 |        |
+
+    ; -- Odd part
+
+    packssdw    xmm1, xmm1              ; xmm1=(A1 A3 B1 B3 A1 A3 B1 B3)
+    packssdw    xmm7, xmm7              ; xmm7=(A5 A7 B5 B7 A5 A7 B5 B7)
+    pmaddwd     xmm1, [rel PW_F362_MF127]
+    pmaddwd     xmm7, [rel PW_F085_MF072]
+
+    paddd       xmm1, xmm7              ; xmm1=tmp0[row0 row1 row0 row1]
+
+    ; -- Even part
+
+    pslld       xmm6, (CONST_BITS+2)    ; xmm6=tmp10[row0 row1 **** ****]
+
+    ; -- Final output stage
+
+    movdqa      xmm4, xmm6
+    paddd       xmm6, xmm1     ; xmm6=data0[row0 row1 **** ****]=(C0 C1 ** **)
+    psubd       xmm4, xmm1     ; xmm4=data1[row0 row1 **** ****]=(D0 D1 ** **)
+
+    punpckldq   xmm6, xmm4     ; xmm6=(C0 D0 C1 D1)
+
+    paddd       xmm6, [rel PD_DESCALE_P2_2]
+    psrad       xmm6, DESCALE_P2_2
+
+    packssdw    xmm6, xmm6              ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1)
+    packsswb    xmm6, xmm6              ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1 ..)
+    paddb       xmm6, [rel PB_CENTERJSAMP]
+
+    pextrw      ebx, xmm6, 0x00         ; ebx=(C0 D0 -- --)
+    pextrw      ecx, xmm6, 0x01         ; ecx=(C1 D1 -- --)
+
+    mov         rdxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
+    mov         rsip, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
+    mov         word [rdx+rax*SIZEOF_JSAMPLE], bx
+    mov         word [rsi+rax*SIZEOF_JSAMPLE], cx
+
+    pop         rbx
+    uncollect_args 4
+    pop         rbp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/x86_64/jquantf-sse2.asm b/media/libjpeg/simd/x86_64/jquantf-sse2.asm
new file mode 100644
index 0000000000..ab2e3954f6
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jquantf-sse2.asm
@@ -0,0 +1,155 @@
+;
+; jquantf.asm - sample data conversion and quantization (64-bit SSE & SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        64
+;
+; Load data into workspace, applying unsigned->signed conversion
+;
+; GLOBAL(void)
+; jsimd_convsamp_float_sse2(JSAMPARRAY sample_data, JDIMENSION start_col,
+;                           FAST_FLOAT *workspace);
+;
+
+; r10 = JSAMPARRAY sample_data
+; r11d = JDIMENSION start_col
+; r12 = FAST_FLOAT *workspace
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_convsamp_float_sse2)
+
+EXTN(jsimd_convsamp_float_sse2):
+    push        rbp
+    mov         rax, rsp
+    mov         rbp, rsp
+    collect_args 3
+    push        rbx
+
+    pcmpeqw     xmm7, xmm7
+    psllw       xmm7, 7
+    packsswb    xmm7, xmm7              ; xmm7 = PB_CENTERJSAMPLE (0x808080..)
+
+    mov         rsi, r10
+    mov         eax, r11d
+    mov         rdi, r12
+    mov         rcx, DCTSIZE/2
+.convloop:
+    mov         rbxp, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         rdxp, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+
+    movq        xmm0, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE]
+    movq        xmm1, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE]
+
+    psubb       xmm0, xmm7              ; xmm0=(01234567)
+    psubb       xmm1, xmm7              ; xmm1=(89ABCDEF)
+
+    punpcklbw   xmm0, xmm0              ; xmm0=(*0*1*2*3*4*5*6*7)
+    punpcklbw   xmm1, xmm1              ; xmm1=(*8*9*A*B*C*D*E*F)
+
+    punpcklwd   xmm2, xmm0              ; xmm2=(***0***1***2***3)
+    punpckhwd   xmm0, xmm0              ; xmm0=(***4***5***6***7)
+    punpcklwd   xmm3, xmm1              ; xmm3=(***8***9***A***B)
+    punpckhwd   xmm1, xmm1              ; xmm1=(***C***D***E***F)
+
+    psrad       xmm2, (DWORD_BIT-BYTE_BIT)  ; xmm2=(0123)
+    psrad       xmm0, (DWORD_BIT-BYTE_BIT)  ; xmm0=(4567)
+    cvtdq2ps    xmm2, xmm2                  ; xmm2=(0123)
+    cvtdq2ps    xmm0, xmm0                  ; xmm0=(4567)
+    psrad       xmm3, (DWORD_BIT-BYTE_BIT)  ; xmm3=(89AB)
+    psrad       xmm1, (DWORD_BIT-BYTE_BIT)  ; xmm1=(CDEF)
+    cvtdq2ps    xmm3, xmm3                  ; xmm3=(89AB)
+    cvtdq2ps    xmm1, xmm1                  ; xmm1=(CDEF)
+
+    movaps      XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm2
+    movaps      XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0
+    movaps      XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
+    movaps      XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm1
+
+    add         rsi, byte 2*SIZEOF_JSAMPROW
+    add         rdi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
+    dec         rcx
+    jnz         short .convloop
+
+    pop         rbx
+    uncollect_args 3
+    pop         rbp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; GLOBAL(void)
+; jsimd_quantize_float_sse2(JCOEFPTR coef_block, FAST_FLOAT *divisors,
+;                           FAST_FLOAT *workspace);
+;
+
+; r10 = JCOEFPTR coef_block
+; r11 = FAST_FLOAT *divisors
+; r12 = FAST_FLOAT *workspace
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_quantize_float_sse2)
+
+EXTN(jsimd_quantize_float_sse2):
+    push        rbp
+    mov         rax, rsp
+    mov         rbp, rsp
+    collect_args 3
+
+    mov         rsi, r12
+    mov         rdx, r11
+    mov         rdi, r10
+    mov         rax, DCTSIZE2/16
+.quantloop:
+    movaps      xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_FAST_FLOAT)]
+    movaps      xmm1, XMMWORD [XMMBLOCK(0,1,rsi,SIZEOF_FAST_FLOAT)]
+    mulps       xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)]
+    mulps       xmm1, XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)]
+    movaps      xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_FAST_FLOAT)]
+    movaps      xmm3, XMMWORD [XMMBLOCK(1,1,rsi,SIZEOF_FAST_FLOAT)]
+    mulps       xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)]
+    mulps       xmm3, XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)]
+
+    cvtps2dq    xmm0, xmm0
+    cvtps2dq    xmm1, xmm1
+    cvtps2dq    xmm2, xmm2
+    cvtps2dq    xmm3, xmm3
+
+    packssdw    xmm0, xmm1
+    packssdw    xmm2, xmm3
+
+    movdqa      XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_JCOEF)], xmm0
+    movdqa      XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_JCOEF)], xmm2
+
+    add         rsi, byte 16*SIZEOF_FAST_FLOAT
+    add         rdx, byte 16*SIZEOF_FAST_FLOAT
+    add         rdi, byte 16*SIZEOF_JCOEF
+    dec         rax
+    jnz         short .quantloop
+
+    uncollect_args 3
+    pop         rbp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/x86_64/jquanti-avx2.asm b/media/libjpeg/simd/x86_64/jquanti-avx2.asm
new file mode 100644
index 0000000000..70fe81139c
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jquanti-avx2.asm
@@ -0,0 +1,163 @@
+;
+; jquanti.asm - sample data conversion and quantization (64-bit AVX2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, 2018, D. R. Commander.
+; Copyright (C) 2016, Matthieu Darbois.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        64
+;
+; Load data into workspace, applying unsigned->signed conversion
+;
+; GLOBAL(void)
+; jsimd_convsamp_avx2(JSAMPARRAY sample_data, JDIMENSION start_col,
+;                     DCTELEM *workspace);
+;
+
+; r10 = JSAMPARRAY sample_data
+; r11d = JDIMENSION start_col
+; r12 = DCTELEM *workspace
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_convsamp_avx2)
+
+EXTN(jsimd_convsamp_avx2):
+    push        rbp
+    mov         rax, rsp
+    mov         rbp, rsp
+    collect_args 3
+
+    mov         eax, r11d
+
+    mov         rsip, JSAMPROW [r10+0*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         rdip, JSAMPROW [r10+1*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    movq        xmm0, XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE]
+    pinsrq      xmm0, XMM_MMWORD [rdi+rax*SIZEOF_JSAMPLE], 1
+
+    mov         rsip, JSAMPROW [r10+2*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         rdip, JSAMPROW [r10+3*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    movq        xmm1, XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE]
+    pinsrq      xmm1, XMM_MMWORD [rdi+rax*SIZEOF_JSAMPLE], 1
+
+    mov         rsip, JSAMPROW [r10+4*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         rdip, JSAMPROW [r10+5*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    movq        xmm2, XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE]
+    pinsrq      xmm2, XMM_MMWORD [rdi+rax*SIZEOF_JSAMPLE], 1
+
+    mov         rsip, JSAMPROW [r10+6*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         rdip, JSAMPROW [r10+7*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    movq        xmm3, XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE]
+    pinsrq      xmm3, XMM_MMWORD [rdi+rax*SIZEOF_JSAMPLE], 1
+
+    vpmovzxbw   ymm0, xmm0              ; ymm0=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
+    vpmovzxbw   ymm1, xmm1              ; ymm1=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
+    vpmovzxbw   ymm2, xmm2              ; ymm2=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
+    vpmovzxbw   ymm3, xmm3              ; ymm3=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
+
+    vpcmpeqw    ymm7, ymm7, ymm7
+    vpsllw      ymm7, ymm7, 7           ; ymm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+
+    vpaddw      ymm0, ymm0, ymm7
+    vpaddw      ymm1, ymm1, ymm7
+    vpaddw      ymm2, ymm2, ymm7
+    vpaddw      ymm3, ymm3, ymm7
+
+    vmovdqu     YMMWORD [YMMBLOCK(0,0,r12,SIZEOF_DCTELEM)], ymm0
+    vmovdqu     YMMWORD [YMMBLOCK(2,0,r12,SIZEOF_DCTELEM)], ymm1
+    vmovdqu     YMMWORD [YMMBLOCK(4,0,r12,SIZEOF_DCTELEM)], ymm2
+    vmovdqu     YMMWORD [YMMBLOCK(6,0,r12,SIZEOF_DCTELEM)], ymm3
+
+    vzeroupper
+    uncollect_args 3
+    pop         rbp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; This implementation is based on an algorithm described in
+;   "How to optimize for the Pentium family of microprocessors"
+;   (http://www.agner.org/assem/).
+;
+; GLOBAL(void)
+; jsimd_quantize_avx2(JCOEFPTR coef_block, DCTELEM *divisors,
+;                     DCTELEM *workspace);
+;
+
+%define RECIPROCAL(m, n, b) \
+  YMMBLOCK(DCTSIZE * 0 + (m), (n), (b), SIZEOF_DCTELEM)
+%define CORRECTION(m, n, b) \
+  YMMBLOCK(DCTSIZE * 1 + (m), (n), (b), SIZEOF_DCTELEM)
+%define SCALE(m, n, b) \
+  YMMBLOCK(DCTSIZE * 2 + (m), (n), (b), SIZEOF_DCTELEM)
+
+; r10 = JCOEFPTR coef_block
+; r11 = DCTELEM *divisors
+; r12 = DCTELEM *workspace
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_quantize_avx2)
+
+EXTN(jsimd_quantize_avx2):
+    push        rbp
+    mov         rax, rsp
+    mov         rbp, rsp
+    collect_args 3
+
+    vmovdqu     ymm4, [YMMBLOCK(0,0,r12,SIZEOF_DCTELEM)]
+    vmovdqu     ymm5, [YMMBLOCK(2,0,r12,SIZEOF_DCTELEM)]
+    vmovdqu     ymm6, [YMMBLOCK(4,0,r12,SIZEOF_DCTELEM)]
+    vmovdqu     ymm7, [YMMBLOCK(6,0,r12,SIZEOF_DCTELEM)]
+    vpabsw      ymm0, ymm4
+    vpabsw      ymm1, ymm5
+    vpabsw      ymm2, ymm6
+    vpabsw      ymm3, ymm7
+
+    vpaddw      ymm0, YMMWORD [CORRECTION(0,0,r11)]  ; correction + roundfactor
+    vpaddw      ymm1, YMMWORD [CORRECTION(2,0,r11)]
+    vpaddw      ymm2, YMMWORD [CORRECTION(4,0,r11)]
+    vpaddw      ymm3, YMMWORD [CORRECTION(6,0,r11)]
+    vpmulhuw    ymm0, YMMWORD [RECIPROCAL(0,0,r11)]  ; reciprocal
+    vpmulhuw    ymm1, YMMWORD [RECIPROCAL(2,0,r11)]
+    vpmulhuw    ymm2, YMMWORD [RECIPROCAL(4,0,r11)]
+    vpmulhuw    ymm3, YMMWORD [RECIPROCAL(6,0,r11)]
+    vpmulhuw    ymm0, YMMWORD [SCALE(0,0,r11)]       ; scale
+    vpmulhuw    ymm1, YMMWORD [SCALE(2,0,r11)]
+    vpmulhuw    ymm2, YMMWORD [SCALE(4,0,r11)]
+    vpmulhuw    ymm3, YMMWORD [SCALE(6,0,r11)]
+
+    vpsignw     ymm0, ymm0, ymm4
+    vpsignw     ymm1, ymm1, ymm5
+    vpsignw     ymm2, ymm2, ymm6
+    vpsignw     ymm3, ymm3, ymm7
+
+    vmovdqu     [YMMBLOCK(0,0,r10,SIZEOF_DCTELEM)], ymm0
+    vmovdqu     [YMMBLOCK(2,0,r10,SIZEOF_DCTELEM)], ymm1
+    vmovdqu     [YMMBLOCK(4,0,r10,SIZEOF_DCTELEM)], ymm2
+    vmovdqu     [YMMBLOCK(6,0,r10,SIZEOF_DCTELEM)], ymm3
+
+    vzeroupper
+    uncollect_args 3
+    pop         rbp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/x86_64/jquanti-sse2.asm b/media/libjpeg/simd/x86_64/jquanti-sse2.asm
new file mode 100644
index 0000000000..3ee442027a
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jquanti-sse2.asm
@@ -0,0 +1,188 @@
+;
+; jquanti.asm - sample data conversion and quantization (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        64
+;
+; Load data into workspace, applying unsigned->signed conversion
+;
+; GLOBAL(void)
+; jsimd_convsamp_sse2(JSAMPARRAY sample_data, JDIMENSION start_col,
+;                     DCTELEM *workspace);
+;
+
+; r10 = JSAMPARRAY sample_data
+; r11d = JDIMENSION start_col
+; r12 = DCTELEM *workspace
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_convsamp_sse2)
+
+EXTN(jsimd_convsamp_sse2):
+    push        rbp
+    mov         rax, rsp
+    mov         rbp, rsp
+    collect_args 3
+    push        rbx
+
+    pxor        xmm6, xmm6              ; xmm6=(all 0's)
+    pcmpeqw     xmm7, xmm7
+    psllw       xmm7, 7                 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+
+    mov         rsi, r10
+    mov         eax, r11d
+    mov         rdi, r12
+    mov         rcx, DCTSIZE/4
+.convloop:
+    mov         rbxp, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         rdxp, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+
+    movq        xmm0, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE]  ; xmm0=(01234567)
+    movq        xmm1, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE]  ; xmm1=(89ABCDEF)
+
+    mov         rbxp, JSAMPROW [rsi+2*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         rdxp, JSAMPROW [rsi+3*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+
+    movq        xmm2, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE]  ; xmm2=(GHIJKLMN)
+    movq        xmm3, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE]  ; xmm3=(OPQRSTUV)
+
+    punpcklbw   xmm0, xmm6              ; xmm0=(01234567)
+    punpcklbw   xmm1, xmm6              ; xmm1=(89ABCDEF)
+    paddw       xmm0, xmm7
+    paddw       xmm1, xmm7
+    punpcklbw   xmm2, xmm6              ; xmm2=(GHIJKLMN)
+    punpcklbw   xmm3, xmm6              ; xmm3=(OPQRSTUV)
+    paddw       xmm2, xmm7
+    paddw       xmm3, xmm7
+
+    movdqa      XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_DCTELEM)], xmm0
+    movdqa      XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_DCTELEM)], xmm1
+    movdqa      XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_DCTELEM)], xmm2
+    movdqa      XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_DCTELEM)], xmm3
+
+    add         rsi, byte 4*SIZEOF_JSAMPROW
+    add         rdi, byte 4*DCTSIZE*SIZEOF_DCTELEM
+    dec         rcx
+    jnz         short .convloop
+
+    pop         rbx
+    uncollect_args 3
+    pop         rbp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; This implementation is based on an algorithm described in
+;   "How to optimize for the Pentium family of microprocessors"
+;   (http://www.agner.org/assem/).
+;
+; GLOBAL(void)
+; jsimd_quantize_sse2(JCOEFPTR coef_block, DCTELEM *divisors,
+;                     DCTELEM *workspace);
+;
+
+%define RECIPROCAL(m, n, b) \
+  XMMBLOCK(DCTSIZE * 0 + (m), (n), (b), SIZEOF_DCTELEM)
+%define CORRECTION(m, n, b) \
+  XMMBLOCK(DCTSIZE * 1 + (m), (n), (b), SIZEOF_DCTELEM)
+%define SCALE(m, n, b) \
+  XMMBLOCK(DCTSIZE * 2 + (m), (n), (b), SIZEOF_DCTELEM)
+
+; r10 = JCOEFPTR coef_block
+; r11 = DCTELEM *divisors
+; r12 = DCTELEM *workspace
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_quantize_sse2)
+
+EXTN(jsimd_quantize_sse2):
+    push        rbp
+    mov         rax, rsp
+    mov         rbp, rsp
+    collect_args 3
+
+    mov         rsi, r12
+    mov         rdx, r11
+    mov         rdi, r10
+    mov         rax, DCTSIZE2/32
+.quantloop:
+    movdqa      xmm4, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_DCTELEM)]
+    movdqa      xmm5, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_DCTELEM)]
+    movdqa      xmm6, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_DCTELEM)]
+    movdqa      xmm7, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_DCTELEM)]
+    movdqa      xmm0, xmm4
+    movdqa      xmm1, xmm5
+    movdqa      xmm2, xmm6
+    movdqa      xmm3, xmm7
+    psraw       xmm4, (WORD_BIT-1)
+    psraw       xmm5, (WORD_BIT-1)
+    psraw       xmm6, (WORD_BIT-1)
+    psraw       xmm7, (WORD_BIT-1)
+    pxor        xmm0, xmm4
+    pxor        xmm1, xmm5
+    pxor        xmm2, xmm6
+    pxor        xmm3, xmm7
+    psubw       xmm0, xmm4              ; if (xmm0 < 0) xmm0 = -xmm0;
+    psubw       xmm1, xmm5              ; if (xmm1 < 0) xmm1 = -xmm1;
+    psubw       xmm2, xmm6              ; if (xmm2 < 0) xmm2 = -xmm2;
+    psubw       xmm3, xmm7              ; if (xmm3 < 0) xmm3 = -xmm3;
+
+    paddw       xmm0, XMMWORD [CORRECTION(0,0,rdx)]  ; correction + roundfactor
+    paddw       xmm1, XMMWORD [CORRECTION(1,0,rdx)]
+    paddw       xmm2, XMMWORD [CORRECTION(2,0,rdx)]
+    paddw       xmm3, XMMWORD [CORRECTION(3,0,rdx)]
+    pmulhuw     xmm0, XMMWORD [RECIPROCAL(0,0,rdx)]  ; reciprocal
+    pmulhuw     xmm1, XMMWORD [RECIPROCAL(1,0,rdx)]
+    pmulhuw     xmm2, XMMWORD [RECIPROCAL(2,0,rdx)]
+    pmulhuw     xmm3, XMMWORD [RECIPROCAL(3,0,rdx)]
+    pmulhuw     xmm0, XMMWORD [SCALE(0,0,rdx)]       ; scale
+    pmulhuw     xmm1, XMMWORD [SCALE(1,0,rdx)]
+    pmulhuw     xmm2, XMMWORD [SCALE(2,0,rdx)]
+    pmulhuw     xmm3, XMMWORD [SCALE(3,0,rdx)]
+
+    pxor        xmm0, xmm4
+    pxor        xmm1, xmm5
+    pxor        xmm2, xmm6
+    pxor        xmm3, xmm7
+    psubw       xmm0, xmm4
+    psubw       xmm1, xmm5
+    psubw       xmm2, xmm6
+    psubw       xmm3, xmm7
+    movdqa      XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_DCTELEM)], xmm0
+    movdqa      XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_DCTELEM)], xmm1
+    movdqa      XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_DCTELEM)], xmm2
+    movdqa      XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_DCTELEM)], xmm3
+
+    add         rsi, byte 32*SIZEOF_DCTELEM
+    add         rdx, byte 32*SIZEOF_DCTELEM
+    add         rdi, byte 32*SIZEOF_JCOEF
+    dec         rax
+    jnz         near .quantloop
+
+    uncollect_args 3
+    pop         rbp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/x86_64/jsimd.c b/media/libjpeg/simd/x86_64/jsimd.c
new file mode 100644
index 0000000000..3f5ee77eb9
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jsimd.c
@@ -0,0 +1,1110 @@
+/*
+ * jsimd_x86_64.c
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2009-2011, 2014, 2016, 2018, 2022-2023, D. R. Commander.
+ * Copyright (C) 2015-2016, 2018, 2022, Matthieu Darbois.
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library,
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * For conditions of distribution and use, see copyright notice in jsimdext.inc
+ *
+ * This file contains the interface between the "normal" portions
+ * of the library and the SIMD implementations when running on a
+ * 64-bit x86 architecture.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+
+/*
+ * In the PIC cases, we have no guarantee that constants will keep
+ * their alignment. This macro allows us to verify it at runtime.
+ */
+#define IS_ALIGNED(ptr, order)  (((size_t)ptr & ((1 << order) - 1)) == 0)
+
+#define IS_ALIGNED_SSE(ptr)  (IS_ALIGNED(ptr, 4)) /* 16 byte alignment */
+#define IS_ALIGNED_AVX(ptr)  (IS_ALIGNED(ptr, 5)) /* 32 byte alignment */
+
+static THREAD_LOCAL unsigned int simd_support = (unsigned int)(~0);
+static THREAD_LOCAL unsigned int simd_huffman = 1;
+
+/*
+ * Check what SIMD accelerations are supported.
+ */
+LOCAL(void)
+init_simd(void)
+{
+#ifndef NO_GETENV
+  char env[2] = { 0 };
+#endif
+
+  if (simd_support != ~0U)
+    return;
+
+  simd_support = jpeg_simd_cpu_support();
+
+#ifndef NO_GETENV
+  /* Force different settings through environment variables */
+  if (!GETENV_S(env, 2, "JSIMD_FORCESSE2") && !strcmp(env, "1"))
+    simd_support &= JSIMD_SSE2;
+  if (!GETENV_S(env, 2, "JSIMD_FORCEAVX2") && !strcmp(env, "1"))
+    simd_support &= JSIMD_AVX2;
+  if (!GETENV_S(env, 2, "JSIMD_FORCENONE") && !strcmp(env, "1"))
+    simd_support = 0;
+  if (!GETENV_S(env, 2, "JSIMD_NOHUFFENC") && !strcmp(env, "1"))
+    simd_huffman = 0;
+#endif
+}
+
+GLOBAL(int)
+jsimd_can_rgb_ycc(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+
+  if ((simd_support & JSIMD_AVX2) &&
+      IS_ALIGNED_AVX(jconst_rgb_ycc_convert_avx2))
+    return 1;
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_rgb_ycc_convert_sse2))
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_rgb_gray(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+
+  if ((simd_support & JSIMD_AVX2) &&
+      IS_ALIGNED_AVX(jconst_rgb_gray_convert_avx2))
+    return 1;
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_rgb_gray_convert_sse2))
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+
+  if ((simd_support & JSIMD_AVX2) &&
+      IS_ALIGNED_AVX(jconst_ycc_rgb_convert_avx2))
+    return 1;
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_ycc_rgb_convert_sse2))
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb565(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                      JSAMPIMAGE output_buf, JDIMENSION output_row,
+                      int num_rows)
+{
+  void (*avx2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+  void (*sse2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+  if (simd_support == ~0U)
+    init_simd();
+
+  switch (cinfo->in_color_space) {
+  case JCS_EXT_RGB:
+    avx2fct = jsimd_extrgb_ycc_convert_avx2;
+    sse2fct = jsimd_extrgb_ycc_convert_sse2;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    avx2fct = jsimd_extrgbx_ycc_convert_avx2;
+    sse2fct = jsimd_extrgbx_ycc_convert_sse2;
+    break;
+  case JCS_EXT_BGR:
+    avx2fct = jsimd_extbgr_ycc_convert_avx2;
+    sse2fct = jsimd_extbgr_ycc_convert_sse2;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    avx2fct = jsimd_extbgrx_ycc_convert_avx2;
+    sse2fct = jsimd_extbgrx_ycc_convert_sse2;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    avx2fct = jsimd_extxbgr_ycc_convert_avx2;
+    sse2fct = jsimd_extxbgr_ycc_convert_sse2;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    avx2fct = jsimd_extxrgb_ycc_convert_avx2;
+    sse2fct = jsimd_extxrgb_ycc_convert_sse2;
+    break;
+  default:
+    avx2fct = jsimd_rgb_ycc_convert_avx2;
+    sse2fct = jsimd_rgb_ycc_convert_sse2;
+    break;
+  }
+
+  if (simd_support & JSIMD_AVX2)
+    avx2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+  else
+    sse2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
+
+GLOBAL(void)
+jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                       JSAMPIMAGE output_buf, JDIMENSION output_row,
+                       int num_rows)
+{
+  void (*avx2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+  void (*sse2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+  if (simd_support == ~0U)
+    init_simd();
+
+  switch (cinfo->in_color_space) {
+  case JCS_EXT_RGB:
+    avx2fct = jsimd_extrgb_gray_convert_avx2;
+    sse2fct = jsimd_extrgb_gray_convert_sse2;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    avx2fct = jsimd_extrgbx_gray_convert_avx2;
+    sse2fct = jsimd_extrgbx_gray_convert_sse2;
+    break;
+  case JCS_EXT_BGR:
+    avx2fct = jsimd_extbgr_gray_convert_avx2;
+    sse2fct = jsimd_extbgr_gray_convert_sse2;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    avx2fct = jsimd_extbgrx_gray_convert_avx2;
+    sse2fct = jsimd_extbgrx_gray_convert_sse2;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    avx2fct = jsimd_extxbgr_gray_convert_avx2;
+    sse2fct = jsimd_extxbgr_gray_convert_sse2;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    avx2fct = jsimd_extxrgb_gray_convert_avx2;
+    sse2fct = jsimd_extxrgb_gray_convert_sse2;
+    break;
+  default:
+    avx2fct = jsimd_rgb_gray_convert_avx2;
+    sse2fct = jsimd_rgb_gray_convert_sse2;
+    break;
+  }
+
+  if (simd_support & JSIMD_AVX2)
+    avx2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+  else
+    sse2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                      JDIMENSION input_row, JSAMPARRAY output_buf,
+                      int num_rows)
+{
+  void (*avx2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
+  void (*sse2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
+
+  if (simd_support == ~0U)
+    init_simd();
+
+  switch (cinfo->out_color_space) {
+  case JCS_EXT_RGB:
+    avx2fct = jsimd_ycc_extrgb_convert_avx2;
+    sse2fct = jsimd_ycc_extrgb_convert_sse2;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    avx2fct = jsimd_ycc_extrgbx_convert_avx2;
+    sse2fct = jsimd_ycc_extrgbx_convert_sse2;
+    break;
+  case JCS_EXT_BGR:
+    avx2fct = jsimd_ycc_extbgr_convert_avx2;
+    sse2fct = jsimd_ycc_extbgr_convert_sse2;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    avx2fct = jsimd_ycc_extbgrx_convert_avx2;
+    sse2fct = jsimd_ycc_extbgrx_convert_sse2;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    avx2fct = jsimd_ycc_extxbgr_convert_avx2;
+    sse2fct = jsimd_ycc_extxbgr_convert_sse2;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    avx2fct = jsimd_ycc_extxrgb_convert_avx2;
+    sse2fct = jsimd_ycc_extxrgb_convert_sse2;
+    break;
+  default:
+    avx2fct = jsimd_ycc_rgb_convert_avx2;
+    sse2fct = jsimd_ycc_rgb_convert_sse2;
+    break;
+  }
+
+  if (simd_support & JSIMD_AVX2)
+    avx2fct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
+  else
+    sse2fct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                         JDIMENSION input_row, JSAMPARRAY output_buf,
+                         int num_rows)
+{
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_downsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_AVX2)
+    return 1;
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_downsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_AVX2)
+    return 1;
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+                      JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  if (simd_support == ~0U)
+    init_simd();
+
+  if (simd_support & JSIMD_AVX2)
+    jsimd_h2v2_downsample_avx2(cinfo->image_width, cinfo->max_v_samp_factor,
+                               compptr->v_samp_factor,
+                               compptr->width_in_blocks, input_data,
+                               output_data);
+  else
+    jsimd_h2v2_downsample_sse2(cinfo->image_width, cinfo->max_v_samp_factor,
+                               compptr->v_samp_factor,
+                               compptr->width_in_blocks, input_data,
+                               output_data);
+}
+
+GLOBAL(void)
+jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+                      JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  if (simd_support == ~0U)
+    init_simd();
+
+  if (simd_support & JSIMD_AVX2)
+    jsimd_h2v1_downsample_avx2(cinfo->image_width, cinfo->max_v_samp_factor,
+                               compptr->v_samp_factor,
+                               compptr->width_in_blocks, input_data,
+                               output_data);
+  else
+    jsimd_h2v1_downsample_sse2(cinfo->image_width, cinfo->max_v_samp_factor,
+                               compptr->v_samp_factor,
+                               compptr->width_in_blocks, input_data,
+                               output_data);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_AVX2)
+    return 1;
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_AVX2)
+    return 1;
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                    JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  if (simd_support == ~0U)
+    init_simd();
+
+  if (simd_support & JSIMD_AVX2)
+    jsimd_h2v2_upsample_avx2(cinfo->max_v_samp_factor, cinfo->output_width,
+                             input_data, output_data_ptr);
+  else
+    jsimd_h2v2_upsample_sse2(cinfo->max_v_samp_factor, cinfo->output_width,
+                             input_data, output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                    JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  if (simd_support == ~0U)
+    init_simd();
+
+  if (simd_support & JSIMD_AVX2)
+    jsimd_h2v1_upsample_avx2(cinfo->max_v_samp_factor, cinfo->output_width,
+                             input_data, output_data_ptr);
+  else
+    jsimd_h2v1_upsample_sse2(cinfo->max_v_samp_factor, cinfo->output_width,
+                             input_data, output_data_ptr);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_fancy_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if ((simd_support & JSIMD_AVX2) &&
+      IS_ALIGNED_AVX(jconst_fancy_upsample_avx2))
+    return 1;
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_fancy_upsample_sse2))
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_fancy_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if ((simd_support & JSIMD_AVX2) &&
+      IS_ALIGNED_AVX(jconst_fancy_upsample_avx2))
+    return 1;
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_fancy_upsample_sse2))
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  if (simd_support == ~0U)
+    init_simd();
+
+  if (simd_support & JSIMD_AVX2)
+    jsimd_h2v2_fancy_upsample_avx2(cinfo->max_v_samp_factor,
+                                   compptr->downsampled_width, input_data,
+                                   output_data_ptr);
+  else
+    jsimd_h2v2_fancy_upsample_sse2(cinfo->max_v_samp_factor,
+                                   compptr->downsampled_width, input_data,
+                                   output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  if (simd_support == ~0U)
+    init_simd();
+
+  if (simd_support & JSIMD_AVX2)
+    jsimd_h2v1_fancy_upsample_avx2(cinfo->max_v_samp_factor,
+                                   compptr->downsampled_width, input_data,
+                                   output_data_ptr);
+  else
+    jsimd_h2v1_fancy_upsample_sse2(cinfo->max_v_samp_factor,
+                                   compptr->downsampled_width, input_data,
+                                   output_data_ptr);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_merged_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if ((simd_support & JSIMD_AVX2) &&
+      IS_ALIGNED_AVX(jconst_merged_upsample_avx2))
+    return 1;
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_merged_upsample_sse2))
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_merged_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if ((simd_support & JSIMD_AVX2) &&
+      IS_ALIGNED_AVX(jconst_merged_upsample_avx2))
+    return 1;
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_merged_upsample_sse2))
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                           JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+  void (*avx2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+  void (*sse2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+
+  if (simd_support == ~0U)
+    init_simd();
+
+  switch (cinfo->out_color_space) {
+  case JCS_EXT_RGB:
+    avx2fct = jsimd_h2v2_extrgb_merged_upsample_avx2;
+    sse2fct = jsimd_h2v2_extrgb_merged_upsample_sse2;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    avx2fct = jsimd_h2v2_extrgbx_merged_upsample_avx2;
+    sse2fct = jsimd_h2v2_extrgbx_merged_upsample_sse2;
+    break;
+  case JCS_EXT_BGR:
+    avx2fct = jsimd_h2v2_extbgr_merged_upsample_avx2;
+    sse2fct = jsimd_h2v2_extbgr_merged_upsample_sse2;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    avx2fct = jsimd_h2v2_extbgrx_merged_upsample_avx2;
+    sse2fct = jsimd_h2v2_extbgrx_merged_upsample_sse2;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    avx2fct = jsimd_h2v2_extxbgr_merged_upsample_avx2;
+    sse2fct = jsimd_h2v2_extxbgr_merged_upsample_sse2;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    avx2fct = jsimd_h2v2_extxrgb_merged_upsample_avx2;
+    sse2fct = jsimd_h2v2_extxrgb_merged_upsample_sse2;
+    break;
+  default:
+    avx2fct = jsimd_h2v2_merged_upsample_avx2;
+    sse2fct = jsimd_h2v2_merged_upsample_sse2;
+    break;
+  }
+
+  if (simd_support & JSIMD_AVX2)
+    avx2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+  else
+    sse2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+}
+
+GLOBAL(void)
+jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                           JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+  void (*avx2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+  void (*sse2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+
+  if (simd_support == ~0U)
+    init_simd();
+
+  switch (cinfo->out_color_space) {
+  case JCS_EXT_RGB:
+    avx2fct = jsimd_h2v1_extrgb_merged_upsample_avx2;
+    sse2fct = jsimd_h2v1_extrgb_merged_upsample_sse2;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    avx2fct = jsimd_h2v1_extrgbx_merged_upsample_avx2;
+    sse2fct = jsimd_h2v1_extrgbx_merged_upsample_sse2;
+    break;
+  case JCS_EXT_BGR:
+    avx2fct = jsimd_h2v1_extbgr_merged_upsample_avx2;
+    sse2fct = jsimd_h2v1_extbgr_merged_upsample_sse2;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    avx2fct = jsimd_h2v1_extbgrx_merged_upsample_avx2;
+    sse2fct = jsimd_h2v1_extbgrx_merged_upsample_sse2;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    avx2fct = jsimd_h2v1_extxbgr_merged_upsample_avx2;
+    sse2fct = jsimd_h2v1_extxbgr_merged_upsample_sse2;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    avx2fct = jsimd_h2v1_extxrgb_merged_upsample_avx2;
+    sse2fct = jsimd_h2v1_extxrgb_merged_upsample_sse2;
+    break;
+  default:
+    avx2fct = jsimd_h2v1_merged_upsample_avx2;
+    sse2fct = jsimd_h2v1_merged_upsample_sse2;
+    break;
+  }
+
+  if (simd_support & JSIMD_AVX2)
+    avx2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+  else
+    sse2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+}
+
+GLOBAL(int)
+jsimd_can_convsamp(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_AVX2)
+    return 1;
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_convsamp_float(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(FAST_FLOAT) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_convsamp(JSAMPARRAY sample_data, JDIMENSION start_col,
+               DCTELEM *workspace)
+{
+  if (simd_support == ~0U)
+    init_simd();
+
+  if (simd_support & JSIMD_AVX2)
+    jsimd_convsamp_avx2(sample_data, start_col, workspace);
+  else
+    jsimd_convsamp_sse2(sample_data, start_col, workspace);
+}
+
+GLOBAL(void)
+jsimd_convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col,
+                     FAST_FLOAT *workspace)
+{
+  jsimd_convsamp_float_sse2(sample_data, start_col, workspace);
+}
+
+GLOBAL(int)
+jsimd_can_fdct_islow(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if ((simd_support & JSIMD_AVX2) && IS_ALIGNED_AVX(jconst_fdct_islow_avx2))
+    return 1;
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_islow_sse2))
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_ifast(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_ifast_sse2))
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_float(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(FAST_FLOAT) != 4)
+    return 0;
+
+  if ((simd_support & JSIMD_SSE) && IS_ALIGNED_SSE(jconst_fdct_float_sse))
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_fdct_islow(DCTELEM *data)
+{
+  if (simd_support == ~0U)
+    init_simd();
+
+  if (simd_support & JSIMD_AVX2)
+    jsimd_fdct_islow_avx2(data);
+  else
+    jsimd_fdct_islow_sse2(data);
+}
+
+GLOBAL(void)
+jsimd_fdct_ifast(DCTELEM *data)
+{
+  jsimd_fdct_ifast_sse2(data);
+}
+
+GLOBAL(void)
+jsimd_fdct_float(FAST_FLOAT *data)
+{
+  jsimd_fdct_float_sse(data);
+}
+
+GLOBAL(int)
+jsimd_can_quantize(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_AVX2)
+    return 1;
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_quantize_float(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (sizeof(FAST_FLOAT) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace)
+{
+  if (simd_support == ~0U)
+    init_simd();
+
+  if (simd_support & JSIMD_AVX2)
+    jsimd_quantize_avx2(coef_block, divisors, workspace);
+  else
+    jsimd_quantize_sse2(coef_block, divisors, workspace);
+}
+
+GLOBAL(void)
+jsimd_quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors,
+                     FAST_FLOAT *workspace)
+{
+  jsimd_quantize_float_sse2(coef_block, divisors, workspace);
+}
+
+GLOBAL(int)
+jsimd_can_idct_2x2(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2))
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_4x4(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2))
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_idct_2x2(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+               JCOEFPTR coef_block, JSAMPARRAY output_buf,
+               JDIMENSION output_col)
+{
+  jsimd_idct_2x2_sse2(compptr->dct_table, coef_block, output_buf, output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_4x4(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+               JCOEFPTR coef_block, JSAMPARRAY output_buf,
+               JDIMENSION output_col)
+{
+  jsimd_idct_4x4_sse2(compptr->dct_table, coef_block, output_buf, output_col);
+}
+
+GLOBAL(int)
+jsimd_can_idct_islow(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  if ((simd_support & JSIMD_AVX2) && IS_ALIGNED_AVX(jconst_idct_islow_avx2))
+    return 1;
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_islow_sse2))
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_ifast(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(IFAST_MULT_TYPE) != 2)
+    return 0;
+  if (IFAST_SCALE_BITS != 2)
+    return 0;
+
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_ifast_sse2))
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_float(void)
+{
+  init_simd();
+
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(FAST_FLOAT) != 4)
+    return 0;
+  if (sizeof(FLOAT_MULT_TYPE) != 4)
+    return 0;
+
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_float_sse2))
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+  if (simd_support == ~0U)
+    init_simd();
+
+  if (simd_support & JSIMD_AVX2)
+    jsimd_idct_islow_avx2(compptr->dct_table, coef_block, output_buf,
+                          output_col);
+  else
+    jsimd_idct_islow_sse2(compptr->dct_table, coef_block, output_buf,
+                          output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+  jsimd_idct_ifast_sse2(compptr->dct_table, coef_block, output_buf,
+                        output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_float(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+  jsimd_idct_float_sse2(compptr->dct_table, coef_block, output_buf,
+                        output_col);
+}
+
+GLOBAL(int)
+jsimd_can_huff_encode_one_block(void)
+{
+  init_simd();
+
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+
+  if ((simd_support & JSIMD_SSE2) && simd_huffman &&
+      IS_ALIGNED_SSE(jconst_huff_encode_one_block))
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(JOCTET *)
+jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
+                            int last_dc_val, c_derived_tbl *dctbl,
+                            c_derived_tbl *actbl)
+{
+  return jsimd_huff_encode_one_block_sse2(state, buffer, block, last_dc_val,
+                                          dctbl, actbl);
+}
+
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_first_prepare(void)
+{
+  init_simd();
+
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_encode_mcu_AC_first_prepare(const JCOEF *block,
+                                  const int *jpeg_natural_order_start, int Sl,
+                                  int Al, UJCOEF *values, size_t *zerobits)
+{
+  jsimd_encode_mcu_AC_first_prepare_sse2(block, jpeg_natural_order_start,
+                                         Sl, Al, values, zerobits);
+}
+
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_refine_prepare(void)
+{
+  init_simd();
+
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
+                                   const int *jpeg_natural_order_start, int Sl,
+                                   int Al, UJCOEF *absvalues, size_t *bits)
+{
+  return jsimd_encode_mcu_AC_refine_prepare_sse2(block,
+                                                 jpeg_natural_order_start,
+                                                 Sl, Al, absvalues, bits);
+}
diff --git a/media/libjpeg/simd/x86_64/jsimdcpu.asm b/media/libjpeg/simd/x86_64/jsimdcpu.asm
new file mode 100644
index 0000000000..705f813d7d
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jsimdcpu.asm
@@ -0,0 +1,86 @@
+;
+; jsimdcpu.asm - SIMD instruction support check
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        64
+;
+; Check if the CPU supports SIMD instructions
+;
+; GLOBAL(unsigned int)
+; jpeg_simd_cpu_support(void)
+;
+
+    align       32
+    GLOBAL_FUNCTION(jpeg_simd_cpu_support)
+
+EXTN(jpeg_simd_cpu_support):
+    push        rbx
+    push        rdi
+
+    xor         rdi, rdi                ; simd support flag
+
+    ; Assume that all x86-64 processors support SSE & SSE2 instructions
+    or          rdi, JSIMD_SSE2
+    or          rdi, JSIMD_SSE
+
+    ; Check whether CPUID leaf 07H is supported
+    ; (leaf 07H is used to check for AVX2 instruction support)
+    mov         rax, 0
+    cpuid
+    cmp         rax, 7
+    jl          short .return           ; Maximum leaf < 07H
+
+    ; Check for AVX2 instruction support
+    mov         rax, 7
+    xor         rcx, rcx
+    cpuid
+    mov         rax, rbx                ; rax = Extended feature flags
+
+    test        rax, 1<<5               ; bit5:AVX2
+    jz          short .return
+
+    ; Check for AVX2 O/S support
+    mov         rax, 1
+    xor         rcx, rcx
+    cpuid
+    test        rcx, 1<<27
+    jz          short .return           ; O/S does not support XSAVE
+    test        rcx, 1<<28
+    jz          short .return           ; CPU does not support AVX2
+
+    xor         rcx, rcx
+    xgetbv
+    and         rax, 6
+    cmp         rax, 6                  ; O/S does not manage XMM/YMM state
+                                        ; using XSAVE
+    jnz         short .return
+
+    or          rdi, JSIMD_AVX2
+
+.return:
+    mov         rax, rdi
+
+    pop         rdi
+    pop         rbx
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
author	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-07 19:33:14 +0000
committer	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-07 19:33:14 +0000
commit	36d22d82aa202bb199967e9512281e9a53db42c9 (patch)
tree	105e8c98ddea1c1e4784a60a5a6410fa416be2de /media/libjpeg/simd
parent	Initial commit. (diff)
download	firefox-esr-upstream.tar.xz firefox-esr-upstream.zip